Separate initialization from loading. Prefer upserts.

pull/9/head
Joshua Potter 2023-12-04 08:14:33 -07:00
parent 9a5274f0dd
commit 1c0dc05b42
3 changed files with 61 additions and 21 deletions

View File

@ -65,16 +65,18 @@ $ pg_ctl -D db stop
### Loading Data
To load all exported coach data into a local postgres instance, use the provided
`sql/load_export.sql` file. First concatenate all exported content:
To load all exported coach data into a local Postgres instance, use the provided
`sql/*.sql` files. First initialize the export schema/table:
```bash
$ psql -h @scraper -f sql/init.sql
```
Next, concatenate all exported content and dump into the newly created table:
```bash
$ cat data/{chesscom,lichess}/export.json > data/export.json
$ psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.json'"
```
Then (assuming your database cluster has been initialized at `@scraper`), you
can run:
```bash
$ psql -h @scraper -f sql/load_export.sql -v export="'$PWD/data/export.json'"
```
Re-running will automatically create a timestamped backup of
`coach_scraper.export` and upsert the freshly exported coach data into it.
### E2E
@ -85,7 +87,8 @@ connection available at `@scraper`:
```bash
nix run . -- --user-agent <your-email> -s chesscom -s lichess
cat data/{chesscom,lichess}/export.json > data/export.json
psql -h @scraper -f sql/load_export.sql -v export="'$PWD/data/export.json'"
psql -h @scraper -f sql/init.sql
psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.json'"
```
## Development

View File

@ -1,23 +1,12 @@
CREATE SCHEMA IF NOT EXISTS coach_scraper;
DO $$
BEGIN
EXECUTE format(
'ALTER TABLE IF EXISTS coach_scraper.export '
'RENAME TO export_%s;',
'CREATE TABLE coach_scraper.export_%s AS TABLE coach_scraper.export',
TRUNC(EXTRACT(EPOCH FROM CURRENT_TIMESTAMP), 0)
);
END;
$$ LANGUAGE plpgsql;
CREATE TABLE coach_scraper.export
( username VARCHAR(255) NOT NULL
, site VARCHAR(16) NOT NULL
, rapid INT
, blitz INT
, bullet INT
);
CREATE TEMPORARY TABLE pg_temp.coach_scraper_export (data JSONB);
SELECT format(
@ -26,6 +15,12 @@ SELECT format(
) \gexec
INSERT INTO coach_scraper.export
( username
, site
, rapid
, blitz
, bullet
)
SELECT
data->>'username',
data->>'site',
@ -33,4 +28,10 @@ SELECT
(data->>'blitz')::INT,
(data->>'bullet')::INT
FROM
pg_temp.coach_scraper_export;
pg_temp.coach_scraper_export
ON CONFLICT
(site, username)
DO UPDATE SET
rapid = EXCLUDED.rapid,
blitz = EXCLUDED.blitz,
bullet = EXCLUDED.bullet;

36
sql/init.sql Normal file
View File

@ -0,0 +1,36 @@
-- Idempotent bootstrap for the coach export store: creates the schema, the
-- export table, and a uniqueness rule on (site, username). Every step is
-- guarded, so the script can be re-run against an initialized database.
CREATE SCHEMA IF NOT EXISTS coach_scraper;

-- One row per coach account; the three rating columns are nullable.
CREATE TABLE IF NOT EXISTS coach_scraper.export (
    id       SERIAL PRIMARY KEY,
    username VARCHAR(255) NOT NULL,
    site     VARCHAR(16) NOT NULL,
    rapid    INT,
    blitz    INT,
    bullet   INT
);

-- Build the unique index first; the DO block below promotes it to a named
-- table constraint. Presumably done as a separate step so the rule is also
-- applied to a table that already existed before this script ran.
CREATE UNIQUE INDEX IF NOT EXISTS site_username_unique
    ON coach_scraper.export
    USING BTREE (site, username);

DO $$
BEGIN
    -- Nothing to do when the constraint is already attached to the table.
    IF EXISTS (
        SELECT 1
        FROM information_schema.constraint_column_usage
        WHERE table_schema = 'coach_scraper'
          AND table_name = 'export'
          AND constraint_name = 'site_username_unique'
    ) THEN
        RETURN;
    END IF;

    -- Attach the existing index as a UNIQUE constraint of the same name.
    EXECUTE 'ALTER TABLE
coach_scraper.export
ADD CONSTRAINT
site_username_unique
UNIQUE USING INDEX
site_username_unique';
END;
$$ LANGUAGE plpgsql;