Rewrite export as NDJSON and include script to load result into postgres. (#3)
* Allow loading exported data into the database.
* Explanation of E2E.

parent 9b81105a5e
commit 0c4e008b45
.gitignore

```diff
@@ -2,10 +2,13 @@
 __pycache__/
 
 # Directory used by `direnv` to hold `use flake`-generated profiles.
-.direnv/
+/.direnv/
 
 # A symlink produced by default when running `nix build`.
-result
+/result
 
 # The results of the `chess.com` coach scraping.
-data/
+/data/
+
+# The local database used when testing exports.
+/db/
```
README.md

````diff
@@ -33,12 +33,60 @@ data
 
 If you have nix available, run:
 ```bash
-$> nix run . -- --user-agent <your-email> -s chesscom
+$ nix run . -- --user-agent <your-email> -s chesscom
 ```
 If not, ensure you have [poetry](https://python-poetry.org/) on your machine and
 instead run the following:
 ```bash
-$> poetry run python3 -m app -u <your-email> -s chesscom
+$ poetry run python3 -m app -u <your-email> -s chesscom
 ```
+
+## Database
+
+Included in the development shell of this flake is a [Postgres](https://www.postgresql.org/)
+client (version 15.5). Generate an empty Postgres cluster at `/db` by running
+```bash
+$ pg_ctl -D db init
+```
+To start the database, run the following:
+```bash
+$ pg_ctl -D db -l db/logfile -o --unix_socket_directories=@scraper start
+```
+In the above command, `@scraper` refers to an [abstract socket name](https://www.postgresql.org/docs/15/runtime-config-connection.html#GUC-UNIX-SOCKET-DIRECTORIES).
+Rename to whatever is appropriate for your use case. To then connect to this
+database instance, run:
+```bash
+$ psql -h @scraper
+```
+To later shut the database down, run:
+```bash
+$ pg_ctl -D db stop
+```
+
+### Loading Data
+
+To load all exported coach data into a local postgres instance, use the provided
+`sql/load_export.sql` file. First concatenate all exported content:
+```bash
+$ cat data/{chesscom,lichess}/export.json > data/export.json
+```
+Then (assuming your database cluster has been initialized at `@scraper`), you
+can run:
+```bash
+$ psql -h @scraper -f sql/load_export.sql -v export="'$PWD/data/export.json'"
+```
+
+### E2E
+
+With the above section on loading files, we now have the individual components
+necessary to scrape coach data from our chess website and dump the results into
+the database in one fell swoop. Assuming our database is open with a socket
+connection available at `@scraper`:
+```bash
+nix run . -- --user-agent <your-email> -s chesscom
+nix run . -- --user-agent <your-email> -s lichess
+cat data/{chesscom,lichess}/export.json > data/export.json
+psql -h @scraper -f sql/load_export.sql -v export="'$PWD/data/export.json'"
+```
+
 ## Development
@@ -49,7 +97,7 @@ dependency management handled by poetry (version 1.7.0). [direnv](https://direnv
 can be used to a launch a dev shell upon entering this directory (refer to
 `.envrc`). Otherwise run via:
 ```bash
-$> nix develop
+$ nix develop
 ```
 
 ### Language Server
@@ -67,7 +115,7 @@ Formatting depends on the [black](https://black.readthedocs.io/en/stable/index.h
 (version 23.9.1) tool. A `pre-commit` hook is included in `.githooks` that can
 be used to format all `*.py` files prior to commit. Install via:
 ```bash
-$> git config --local core.hooksPath .githooks/
+$ git config --local core.hooksPath .githooks/
 ```
 If running [direnv](https://direnv.net/), this hook is installed automatically
 when entering the directory.
````
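The Loading Data and E2E steps above treat `data/export.json` as newline-delimited JSON: one coach object per line carrying the `site`, `username`, `rapid`, `blitz`, and `bullet` fields that `sql/load_export.sql` consumes. Below is a minimal Python sketch of the concatenation step; the paths mirror the README's `cat` command, while the sanity checks themselves are illustrative and not part of the repository.

```python
"""Concatenate per-site NDJSON exports and sanity-check them before loading."""
import json
from pathlib import Path

EXPORTS = [Path("data/chesscom/export.json"), Path("data/lichess/export.json")]
OUT = Path("data/export.json")

count = 0
with OUT.open("w") as out:
    for path in EXPORTS:
        with path.open() as f:
            for line in f:
                record = json.loads(line)  # every line is a complete JSON object
                if "site" not in record or "username" not in record:
                    raise ValueError(f"malformed record in {path}: {line!r}")
                out.write(line)
                count += 1

print(f"Wrote {count} coach records to {OUT}")
```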
```diff
@@ -41,14 +41,15 @@ async def run():
         scraper = LichessScraper(session)
         exporter_cls = LichessExporter
 
-    dump = {}
-
+    # Write out each coach data into NDJSON file.
+    dump = []
     usernames = await scraper.scrape()
     for username in usernames:
-        dump[username] = exporter_cls(username).export()
+        export = exporter_cls(username).export()
+        dump.append(f"{json.dumps(export)}\n")
 
     with open(scraper.path_site_file("export.json"), "w") as f:
-        json.dump(dump, f, indent=2)
+        f.writelines(dump)
 
 
 def main():
```
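Replacing the single `json.dump(dump, f, indent=2)` with one `json.dumps` call per coach is what makes the file NDJSON: each record stays on its own line, so line-oriented tools such as Postgres `COPY` can ingest it directly. A small, self-contained round-trip sketch (the records and temporary path here are made up):

```python
import json
import tempfile

records = [
    {"site": "chesscom", "username": "alice", "rapid": 1900},
    {"site": "lichess", "username": "bob", "blitz": 2100},
]

# Write: one `json.dumps` per record, newline-terminated, no `indent` so a
# record never spans multiple lines.
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    f.writelines(f"{json.dumps(record)}\n" for record in records)
    path = f.name

# Read: the file can be consumed one line at a time.
with open(path) as f:
    loaded = [json.loads(line) for line in f]

assert loaded == records
print(f"Round-tripped {len(loaded)} records via {path}")
```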
```diff
@@ -36,6 +36,8 @@ class BaseExporter(Repo):
         """Transform coach-specific data into uniform format."""
         export: Export = {}
 
+        _insert(export, "site", self.site)
+        _insert(export, "username", self.username)
         _insert(export, "rapid", self.export_rapid())
         _insert(export, "blitz", self.export_blitz())
         _insert(export, "bullet", self.export_bullet())
```
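For context, a sketch of the record this method builds once `site` and `username` are included. `_insert` is not shown in the diff, so treating it as "set the key only when the value is present" is an assumption, and the username and ratings below are made up.

```python
Export = dict  # stand-in for the exporter's `Export` alias (definition not shown)

def _insert(export: Export, key: str, value) -> None:
    """Assumed helper: only record keys whose value is not None."""
    if value is not None:
        export[key] = value

export: Export = {}
_insert(export, "site", "lichess")
_insert(export, "username", "some-coach")  # hypothetical username
_insert(export, "rapid", 1850)
_insert(export, "blitz", None)  # missing ratings simply never appear

print(export)  # {'site': 'lichess', 'username': 'some-coach', 'rapid': 1850}
```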
```diff
@@ -209,4 +209,7 @@ class Exporter(BaseExporter):
         if value[-1] == "?":
             value = value[:-1]
 
-        return value
+        try:
+            return int(value)
+        except ValueError:
+            return None
```
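The same parsing behaviour as a standalone sketch: provisional ratings such as `1500?` lose the trailing `?`, and anything that still fails to parse becomes `None` (which maps onto the nullable rating columns in the load script). The name `parse_rating` is hypothetical; in the diff this logic lives inside the `Exporter` subclass.

```python
from typing import Optional

def parse_rating(value: str) -> Optional[int]:
    """Trim a provisional '?' suffix, then parse the rating as an int."""
    if value and value[-1] == "?":
        value = value[:-1]
    try:
        return int(value)
    except ValueError:
        return None

assert parse_rating("2150") == 2150
assert parse_rating("1500?") == 1500
assert parse_rating("unrated") is None
```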
flake.nix

```diff
@@ -1,12 +1,9 @@
 {
   description = ''
-    An opinionated poetry flake.
+    A web scraper for chess coaches.
 
-    This flake has been adapted from the `app` template found in:
-    https://github.com/nix-community/poetry2nix
-
-    To generate a copy of this template elsewhere, run:
-    $> bootstrap poetry
+    To generate a copy of this template elsewhere, refer to:
+    https://github.com/jrpotter/bootstrap
   '';
 
   inputs = {
@@ -78,8 +75,9 @@
       };
 
       devShells.default = pkgs.mkShell {
-        packages = [
-          pkgs.poetry
+        packages = with pkgs; [
+          poetry
+          postgresql_15
         ] ++ (with pkgs.python311Packages; [
           black
           debugpy
```
sql/load_export.sql (new file)

```diff
@@ -0,0 +1,36 @@
+CREATE SCHEMA IF NOT EXISTS coach_scraper;
+
+DO $$
+BEGIN
+  EXECUTE format(
+    'ALTER TABLE IF EXISTS coach_scraper.export '
+    'RENAME TO export_%s;',
+    TRUNC(EXTRACT(EPOCH FROM CURRENT_TIMESTAMP), 0)
+  );
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE TABLE coach_scraper.export
+  ( username VARCHAR(255) NOT NULL
+  , site VARCHAR(16) NOT NULL
+  , rapid INT
+  , blitz INT
+  , bullet INT
+  );
+
+CREATE TEMPORARY TABLE pg_temp.coach_scraper_export (data JSONB);
+
+SELECT format(
+  $$COPY pg_temp.coach_scraper_export (data) from %L$$,
+  :export
+) \gexec
+
+INSERT INTO coach_scraper.export
+SELECT
+  data->>'username',
+  data->>'site',
+  (data->>'rapid')::INT,
+  (data->>'blitz')::INT,
+  (data->>'bullet')::INT
+FROM
+  pg_temp.coach_scraper_export;
```
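If you would rather drive the loader from Python than type the psql invocation by hand, here is a sketch using `subprocess`. It assumes the cluster from the README is listening on the abstract socket `@scraper` and that `data/export.json` already exists; the commands are the same ones shown in the Loading Data section.

```python
"""Run sql/load_export.sql and spot-check the result from Python."""
import subprocess
from pathlib import Path

export = Path("data/export.json").resolve()

# Same invocation as the README's Loading Data section.
subprocess.run(
    ["psql", "-h", "@scraper", "-f", "sql/load_export.sql", "-v", f"export='{export}'"],
    check=True,
)

# Count the rows that landed in coach_scraper.export.
result = subprocess.run(
    ["psql", "-h", "@scraper", "-tA", "-c", "SELECT COUNT(*) FROM coach_scraper.export;"],
    check=True,
    capture_output=True,
    text=True,
)
print(f"Loaded {result.stdout.strip()} coaches")
```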