Rewrite export as NDJSON and include script to load result into postgres. (#3)

* Allow loading exported data into database.

* Explanation on E2E.
pull/9/head
Joshua Potter 2023-12-01 10:30:44 -07:00 committed by GitHub
parent 9b81105a5e
commit 0c4e008b45
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 111 additions and 20 deletions

9
.gitignore vendored
View File

@ -2,10 +2,13 @@
__pycache__/ __pycache__/
# Directory used by `direnv` to hold `use flake`-generated profiles. # Directory used by `direnv` to hold `use flake`-generated profiles.
.direnv/ /.direnv/
# A symlink produced by default when running `nix build`. # A symlink produced by default when running `nix build`.
result /result
# The results of the `chess.com` coach scraping. # The results of the `chess.com` coach scraping.
data/ /data/
# The local database used when testing exports.
/db/

View File

@ -33,12 +33,60 @@ data
If you have nix available, run: If you have nix available, run:
```bash ```bash
$> nix run . -- --user-agent <your-email> -s chesscom $ nix run . -- --user-agent <your-email> -s chesscom
``` ```
If not, ensure you have [poetry](https://python-poetry.org/) on your machine and If not, ensure you have [poetry](https://python-poetry.org/) on your machine and
instead run the following: instead run the following:
```bash ```bash
$> poetry run python3 -m app -u <your-email> -s chesscom $ poetry run python3 -m app -u <your-email> -s chesscom
```
## Database
Included in the development shell of this flake is a [Postgres](https://www.postgresql.org/)
client (version 15.5). Generate an empty Postgres cluster in the `db/`
directory by running
```bash
$ pg_ctl -D db init
```
To start the database, run the following:
```bash
$ pg_ctl -D db -l db/logfile -o --unix_socket_directories=@scraper start
```
In the above command, `@scraper` refers to an [abstract socket name](https://www.postgresql.org/docs/15/runtime-config-connection.html#GUC-UNIX-SOCKET-DIRECTORIES).
Rename to whatever is appropriate for your use case. To then connect to this
database instance, run:
```bash
$ psql -h @scraper
```
To later shut the database down, run:
```bash
$ pg_ctl -D db stop
```
### Loading Data
To load all exported coach data into a local Postgres instance, use the provided
`sql/load_export.sql` file. First concatenate all exported content:
```bash
$ cat data/{chesscom,lichess}/export.json > data/export.json
```
Then (assuming your database cluster has been initialized at `@scraper`), you
can run:
```bash
$ psql -h @scraper -f sql/load_export.sql -v export="'$PWD/data/export.json'"
```
### E2E
With the above section on loading files, we now have the individual components
necessary to scrape coach data from each chess website and dump the results into
the database in one fell swoop. Assuming our database is open with a socket
connection available at `@scraper`:
```bash
nix run . -- --user-agent <your-email> -s chesscom
nix run . -- --user-agent <your-email> -s lichess
cat data/{chesscom,lichess}/export.json > data/export.json
psql -h @scraper -f sql/load_export.sql -v export="'$PWD/data/export.json'"
``` ```
## Development ## Development
@ -49,7 +97,7 @@ dependency management handled by poetry (version 1.7.0). [direnv](https://direnv
can be used to launch a dev shell upon entering this directory (refer to can be used to launch a dev shell upon entering this directory (refer to
`.envrc`). Otherwise run via: `.envrc`). Otherwise run via:
```bash ```bash
$> nix develop $ nix develop
``` ```
### Language Server ### Language Server
@ -67,7 +115,7 @@ Formatting depends on the [black](https://black.readthedocs.io/en/stable/index.h
(version 23.9.1) tool. A `pre-commit` hook is included in `.githooks` that can (version 23.9.1) tool. A `pre-commit` hook is included in `.githooks` that can
be used to format all `*.py` files prior to commit. Install via: be used to format all `*.py` files prior to commit. Install via:
```bash ```bash
$> git config --local core.hooksPath .githooks/ $ git config --local core.hooksPath .githooks/
``` ```
If running [direnv](https://direnv.net/), this hook is installed automatically If running [direnv](https://direnv.net/), this hook is installed automatically
when entering the directory. when entering the directory.

View File

@ -41,14 +41,15 @@ async def run():
scraper = LichessScraper(session) scraper = LichessScraper(session)
exporter_cls = LichessExporter exporter_cls = LichessExporter
dump = {} # Write out each coach data into NDJSON file.
dump = []
usernames = await scraper.scrape() usernames = await scraper.scrape()
for username in usernames: for username in usernames:
dump[username] = exporter_cls(username).export() export = exporter_cls(username).export()
dump.append(f"{json.dumps(export)}\n")
with open(scraper.path_site_file("export.json"), "w") as f: with open(scraper.path_site_file("export.json"), "w") as f:
json.dump(dump, f, indent=2) f.writelines(dump)
def main(): def main():

View File

@ -36,6 +36,8 @@ class BaseExporter(Repo):
"""Transform coach-specific data into uniform format.""" """Transform coach-specific data into uniform format."""
export: Export = {} export: Export = {}
_insert(export, "site", self.site)
_insert(export, "username", self.username)
_insert(export, "rapid", self.export_rapid()) _insert(export, "rapid", self.export_rapid())
_insert(export, "blitz", self.export_blitz()) _insert(export, "blitz", self.export_blitz())
_insert(export, "bullet", self.export_bullet()) _insert(export, "bullet", self.export_bullet())

View File

@ -209,4 +209,7 @@ class Exporter(BaseExporter):
if value[-1] == "?": if value[-1] == "?":
value = value[:-1] value = value[:-1]
return value try:
return int(value)
except ValueError:
return None

View File

@ -1,12 +1,9 @@
{ {
description = '' description = ''
An opinionated poetry flake. A web scraper for chess coaches.
This flake has been adapted from the `app` template found in: To generate a copy of this template elsewhere, refer to:
https://github.com/nix-community/poetry2nix https://github.com/jrpotter/bootstrap
To generate a copy of this template elsewhere, run:
$> bootstrap poetry
''; '';
inputs = { inputs = {
@ -78,8 +75,9 @@
}; };
devShells.default = pkgs.mkShell { devShells.default = pkgs.mkShell {
packages = [ packages = with pkgs; [
pkgs.poetry poetry
postgresql_15
] ++ (with pkgs.python311Packages; [ ] ++ (with pkgs.python311Packages; [
black black
debugpy debugpy

36
sql/load_export.sql Normal file
View File

@ -0,0 +1,36 @@
-- Loads an NDJSON coach export into Postgres.
--
-- Intended to be run through psql with the `export` variable set to the
-- absolute path of the concatenated NDJSON file, e.g.:
--   psql -h @scraper -f sql/load_export.sql -v export="'$PWD/data/export.json'"
-- NOTE(review): uses the psql meta-command \gexec, so this script only works
-- via psql, not through a plain SQL driver.
CREATE SCHEMA IF NOT EXISTS coach_scraper;
-- Archive any previous export by renaming it with the current epoch
-- timestamp (e.g. coach_scraper.export -> coach_scraper.export_1701449444),
-- so repeated loads never clobber earlier snapshots.
DO $$
BEGIN
EXECUTE format(
'ALTER TABLE IF EXISTS coach_scraper.export '
'RENAME TO export_%s;',
TRUNC(EXTRACT(EPOCH FROM CURRENT_TIMESTAMP), 0)
);
END;
$$ LANGUAGE plpgsql;
-- Fresh target table; one row per coach. Rating columns are nullable since
-- the exporter may emit records without rapid/blitz/bullet values.
CREATE TABLE coach_scraper.export
( username VARCHAR(255) NOT NULL
, site VARCHAR(16) NOT NULL
, rapid INT
, blitz INT
, bullet INT
);
-- Staging table: each NDJSON line is ingested as a single JSONB document.
-- Session-scoped (pg_temp), so it disappears when psql disconnects.
CREATE TEMPORARY TABLE pg_temp.coach_scraper_export (data JSONB);
-- Build the COPY statement dynamically so the file path can be supplied via
-- the psql variable :export; \gexec executes the generated statement.
-- %L quotes the path as a SQL literal.
SELECT format(
$$COPY pg_temp.coach_scraper_export (data) from %L$$,
:export
) \gexec
-- Project the JSONB documents into typed columns. ->> yields text; the
-- rating fields are cast to INT (NULL when the key is absent).
INSERT INTO coach_scraper.export
SELECT
data->>'username',
data->>'site',
(data->>'rapid')::INT,
(data->>'blitz')::INT,
(data->>'bullet')::INT
FROM
pg_temp.coach_scraper_export;