From 0c4e008b456a9e2b789831a1f8801bb21c09f593 Mon Sep 17 00:00:00 2001 From: Joshua Potter Date: Fri, 1 Dec 2023 10:30:44 -0700 Subject: [PATCH] Rewrite export as NDJSON and include script to load result into postgres. (#3) * Allow loading exported data into database. * Explanation on E2E. --- .gitignore | 9 +++++--- README.md | 56 +++++++++++++++++++++++++++++++++++++++++---- app/__main__.py | 9 ++++---- app/exporter.py | 2 ++ app/lichess.py | 5 +++- flake.nix | 14 +++++------- sql/load_export.sql | 36 +++++++++++++++++++++++++++++ 7 files changed, 111 insertions(+), 20 deletions(-) create mode 100644 sql/load_export.sql diff --git a/.gitignore b/.gitignore index 4b6e7d5..5316b5b 100644 --- a/.gitignore +++ b/.gitignore @@ -2,10 +2,13 @@ __pycache__/ # Directory used by `direnv` to hold `use flake`-generated profiles. -.direnv/ +/.direnv/ # A symlink produced by default when running `nix build`. -result +/result # The results of the `chess.com` coach scraping. -data/ +/data/ + +# The local database used when testing exports. +/db/ diff --git a/README.md b/README.md index 03ed6e4..35ae80d 100644 --- a/README.md +++ b/README.md @@ -33,12 +33,60 @@ data If you have nix available, run: ```bash -$> nix run . -- --user-agent -s chesscom +$ nix run . -- --user-agent -s chesscom ``` If not, ensure you have [poetry](https://python-poetry.org/) on your machine and instead run the following: ```bash -$> poetry run python3 -m app -u -s chesscom +$ poetry run python3 -m app -u -s chesscom +``` + +## Database + +Included in the development shell of this flake is a [Postgres](https://www.postgresql.org/) +client (version 15.5). 
Generate an empty Postgres cluster in `db/` by running: +```bash +$ pg_ctl -D db init +``` +To start the database, run the following: +```bash +$ pg_ctl -D db -l db/logfile -o --unix_socket_directories=@scraper start +``` +In the above command, `@scraper` refers to an [abstract socket name](https://www.postgresql.org/docs/15/runtime-config-connection.html#GUC-UNIX-SOCKET-DIRECTORIES). +Rename to whatever is appropriate for your use case. To then connect to this +database instance, run: +```bash +$ psql -h @scraper +``` +To later shut the database down, run: +```bash +$ pg_ctl -D db stop +``` + +### Loading Data + +To load all exported coach data into a local postgres instance, use the provided +`sql/load_export.sql` file. First concatenate all exported content: +```bash +$ cat data/{chesscom,lichess}/export.json > data/export.json +``` +Then (assuming your database is listening on the `@scraper` socket), you +can run: +```bash +$ psql -h @scraper -f sql/load_export.sql -v export="'$PWD/data/export.json'" +``` + +### E2E + +With the above section on loading files, we now have the individual components +necessary to scrape coach data from our chess websites and dump the results into +the database in one fell swoop. Assuming our database is open with a socket +connection available at `@scraper`: +```bash +nix run . -- --user-agent -s chesscom +nix run . -- --user-agent -s lichess +cat data/{chesscom,lichess}/export.json > data/export.json +psql -h @scraper -f sql/load_export.sql -v export="'$PWD/data/export.json'" +``` ## Development @@ -49,7 +97,7 @@ dependency management handled by poetry (version 1.7.0). [direnv](https://direnv can be used to a launch a dev shell upon entering this directory (refer to `.envrc`). Otherwise run via: ```bash -$> nix develop +$ nix develop ``` ### Language Server @@ -67,7 +115,7 @@ Formatting depends on the [black](https://black.readthedocs.io/en/stable/index.h (version 23.9.1) tool. 
A `pre-commit` hook is included in `.githooks` that can be used to format all `*.py` files prior to commit. Install via: ```bash -$> git config --local core.hooksPath .githooks/ +$ git config --local core.hooksPath .githooks/ ``` If running [direnv](https://direnv.net/), this hook is installed automatically when entering the directory. diff --git a/app/__main__.py b/app/__main__.py index ec45335..ed86bd6 100644 --- a/app/__main__.py +++ b/app/__main__.py @@ -41,14 +41,15 @@ async def run(): scraper = LichessScraper(session) exporter_cls = LichessExporter - dump = {} - + # Write out each coach data into NDJSON file. + dump = [] usernames = await scraper.scrape() for username in usernames: - dump[username] = exporter_cls(username).export() + export = exporter_cls(username).export() + dump.append(f"{json.dumps(export)}\n") with open(scraper.path_site_file("export.json"), "w") as f: - json.dump(dump, f, indent=2) + f.writelines(dump) def main(): diff --git a/app/exporter.py b/app/exporter.py index f6df1e1..fe2b160 100644 --- a/app/exporter.py +++ b/app/exporter.py @@ -36,6 +36,8 @@ class BaseExporter(Repo): """Transform coach-specific data into uniform format.""" export: Export = {} + _insert(export, "site", self.site) + _insert(export, "username", self.username) _insert(export, "rapid", self.export_rapid()) _insert(export, "blitz", self.export_blitz()) _insert(export, "bullet", self.export_bullet()) diff --git a/app/lichess.py b/app/lichess.py index f3e6b71..64312c3 100644 --- a/app/lichess.py +++ b/app/lichess.py @@ -209,4 +209,7 @@ class Exporter(BaseExporter): if value[-1] == "?": value = value[:-1] - return value + try: + return int(value) + except ValueError: + return None diff --git a/flake.nix b/flake.nix index 56818c6..d3536e7 100644 --- a/flake.nix +++ b/flake.nix @@ -1,12 +1,9 @@ { description = '' - An opinionated poetry flake. + A web scraper for chess coaches. 
- This flake has been adapted from the `app` template found in: - https://github.com/nix-community/poetry2nix - - To generate a copy of this template elsewhere, run: - $> bootstrap poetry + To generate a copy of this template elsewhere, refer to: + https://github.com/jrpotter/bootstrap ''; inputs = { @@ -78,8 +75,9 @@ }; devShells.default = pkgs.mkShell { - packages = [ - pkgs.poetry + packages = with pkgs; [ + poetry + postgresql_15 ] ++ (with pkgs.python311Packages; [ black debugpy diff --git a/sql/load_export.sql b/sql/load_export.sql new file mode 100644 index 0000000..1394df5 --- /dev/null +++ b/sql/load_export.sql @@ -0,0 +1,36 @@ +CREATE SCHEMA IF NOT EXISTS coach_scraper; + +DO $$ + BEGIN + EXECUTE format( + 'ALTER TABLE IF EXISTS coach_scraper.export ' + 'RENAME TO export_%s;', + TRUNC(EXTRACT(EPOCH FROM CURRENT_TIMESTAMP), 0) + ); + END; +$$ LANGUAGE plpgsql; + +CREATE TABLE coach_scraper.export + ( username VARCHAR(255) NOT NULL + , site VARCHAR(16) NOT NULL + , rapid INT + , blitz INT + , bullet INT + ); + +CREATE TEMPORARY TABLE pg_temp.coach_scraper_export (data JSONB); + +SELECT format( + $$COPY pg_temp.coach_scraper_export (data) from %L$$, + :export +) \gexec + +INSERT INTO coach_scraper.export +SELECT + data->>'username', + data->>'site', + (data->>'rapid')::INT, + (data->>'blitz')::INT, + (data->>'bullet')::INT +FROM + pg_temp.coach_scraper_export;