Rewrite export as NDJSON and include script to load result into postgres. (#3)
* Allow loading exported data into database. * Explanation on E2E. pull/9/head
parent
9b81105a5e
commit
0c4e008b45
|
@ -2,10 +2,13 @@
|
||||||
__pycache__/
|
__pycache__/
|
||||||
|
|
||||||
# Directory used by `direnv` to hold `use flake`-generated profiles.
|
# Directory used by `direnv` to hold `use flake`-generated profiles.
|
||||||
.direnv/
|
/.direnv/
|
||||||
|
|
||||||
# A symlink produced by default when running `nix build`.
|
# A symlink produced by default when running `nix build`.
|
||||||
result
|
/result
|
||||||
|
|
||||||
# The results of the `chess.com` coach scraping.
|
# The results of the `chess.com` coach scraping.
|
||||||
data/
|
/data/
|
||||||
|
|
||||||
|
# The local database used when testing exports.
|
||||||
|
/db/
|
||||||
|
|
56
README.md
56
README.md
|
@ -33,12 +33,60 @@ data
|
||||||
|
|
||||||
If you have nix available, run:
|
If you have nix available, run:
|
||||||
```bash
|
```bash
|
||||||
$> nix run . -- --user-agent <your-email> -s chesscom
|
$ nix run . -- --user-agent <your-email> -s chesscom
|
||||||
```
|
```
|
||||||
If not, ensure you have [poetry](https://python-poetry.org/) on your machine and
|
If not, ensure you have [poetry](https://python-poetry.org/) on your machine and
|
||||||
instead run the following:
|
instead run the following:
|
||||||
```bash
|
```bash
|
||||||
$> poetry run python3 -m app -u <your-email> -s chesscom
|
$ poetry run python3 -m app -u <your-email> -s chesscom
|
||||||
|
```
|
||||||
|
|
||||||
|
## Database
|
||||||
|
|
||||||
|
Included in the development shell of this flake is a [Postgres](https://www.postgresql.org/)
|
||||||
|
client (version 15.5). Generate an empty Postgres cluster in a local `db/` directory by running:
|
||||||
|
```bash
|
||||||
|
$ pg_ctl -D db init
|
||||||
|
```
|
||||||
|
To start the database, run the following:
|
||||||
|
```bash
|
||||||
|
$ pg_ctl -D db -l db/logfile -o --unix_socket_directories=@scraper start
|
||||||
|
```
|
||||||
|
In the above command, `@scraper` refers to an [abstract socket name](https://www.postgresql.org/docs/15/runtime-config-connection.html#GUC-UNIX-SOCKET-DIRECTORIES).
|
||||||
|
Rename to whatever is appropriate for your use case. To then connect to this
|
||||||
|
database instance, run:
|
||||||
|
```bash
|
||||||
|
$ psql -h @scraper
|
||||||
|
```
|
||||||
|
To later shut the database down, run:
|
||||||
|
```bash
|
||||||
|
$ pg_ctl -D db stop
|
||||||
|
```
|
||||||
|
|
||||||
|
### Loading Data
|
||||||
|
|
||||||
|
To load all exported coach data into a local postgres instance, use the provided
|
||||||
|
`sql/load_export.sql` file. First concatenate all exported content:
|
||||||
|
```bash
|
||||||
|
$ cat data/{chesscom,lichess}/export.json > data/export.json
|
||||||
|
```
|
||||||
|
Then (assuming your database is running and listening on the `@scraper` socket), you
|
||||||
|
can run:
|
||||||
|
```bash
|
||||||
|
$ psql -h @scraper -f sql/load_export.sql -v export="'$PWD/data/export.json'"
|
||||||
|
```
|
||||||
|
|
||||||
|
### E2E
|
||||||
|
|
||||||
|
With the above section on loading files, we now have the individual components
|
||||||
|
necessary to scrape coach data from our chess websites and dump the results into
|
||||||
|
the database in one fell swoop. Assuming our database is open with a socket
|
||||||
|
connection available at `@scraper`:
|
||||||
|
```bash
|
||||||
|
nix run . -- --user-agent <your-email> -s chesscom
|
||||||
|
nix run . -- --user-agent <your-email> -s lichess
|
||||||
|
cat data/{chesscom,lichess}/export.json > data/export.json
|
||||||
|
psql -h @scraper -f sql/load_export.sql -v export="'$PWD/data/export.json'"
|
||||||
```
|
```
|
||||||
|
|
||||||
## Development
|
## Development
|
||||||
|
@ -49,7 +97,7 @@ dependency management handled by poetry (version 1.7.0). [direnv](https://direnv
|
||||||
can be used to launch a dev shell upon entering this directory (refer to
|
can be used to launch a dev shell upon entering this directory (refer to
|
||||||
`.envrc`). Otherwise run via:
|
`.envrc`). Otherwise run via:
|
||||||
```bash
|
```bash
|
||||||
$> nix develop
|
$ nix develop
|
||||||
```
|
```
|
||||||
|
|
||||||
### Language Server
|
### Language Server
|
||||||
|
@ -67,7 +115,7 @@ Formatting depends on the [black](https://black.readthedocs.io/en/stable/index.h
|
||||||
(version 23.9.1) tool. A `pre-commit` hook is included in `.githooks` that can
|
(version 23.9.1) tool. A `pre-commit` hook is included in `.githooks` that can
|
||||||
be used to format all `*.py` files prior to commit. Install via:
|
be used to format all `*.py` files prior to commit. Install via:
|
||||||
```bash
|
```bash
|
||||||
$> git config --local core.hooksPath .githooks/
|
$ git config --local core.hooksPath .githooks/
|
||||||
```
|
```
|
||||||
If running [direnv](https://direnv.net/), this hook is installed automatically
|
If running [direnv](https://direnv.net/), this hook is installed automatically
|
||||||
when entering the directory.
|
when entering the directory.
|
||||||
|
|
|
@ -41,14 +41,15 @@ async def run():
|
||||||
scraper = LichessScraper(session)
|
scraper = LichessScraper(session)
|
||||||
exporter_cls = LichessExporter
|
exporter_cls = LichessExporter
|
||||||
|
|
||||||
dump = {}
|
# Write out each coach data into NDJSON file.
|
||||||
|
dump = []
|
||||||
usernames = await scraper.scrape()
|
usernames = await scraper.scrape()
|
||||||
for username in usernames:
|
for username in usernames:
|
||||||
dump[username] = exporter_cls(username).export()
|
export = exporter_cls(username).export()
|
||||||
|
dump.append(f"{json.dumps(export)}\n")
|
||||||
|
|
||||||
with open(scraper.path_site_file("export.json"), "w") as f:
|
with open(scraper.path_site_file("export.json"), "w") as f:
|
||||||
json.dump(dump, f, indent=2)
|
f.writelines(dump)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
|
@ -36,6 +36,8 @@ class BaseExporter(Repo):
|
||||||
"""Transform coach-specific data into uniform format."""
|
"""Transform coach-specific data into uniform format."""
|
||||||
export: Export = {}
|
export: Export = {}
|
||||||
|
|
||||||
|
_insert(export, "site", self.site)
|
||||||
|
_insert(export, "username", self.username)
|
||||||
_insert(export, "rapid", self.export_rapid())
|
_insert(export, "rapid", self.export_rapid())
|
||||||
_insert(export, "blitz", self.export_blitz())
|
_insert(export, "blitz", self.export_blitz())
|
||||||
_insert(export, "bullet", self.export_bullet())
|
_insert(export, "bullet", self.export_bullet())
|
||||||
|
|
|
@ -209,4 +209,7 @@ class Exporter(BaseExporter):
|
||||||
if value[-1] == "?":
|
if value[-1] == "?":
|
||||||
value = value[:-1]
|
value = value[:-1]
|
||||||
|
|
||||||
return value
|
try:
|
||||||
|
return int(value)
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
|
14
flake.nix
14
flake.nix
|
@ -1,12 +1,9 @@
|
||||||
{
|
{
|
||||||
description = ''
|
description = ''
|
||||||
An opinionated poetry flake.
|
A web scraper for chess coaches.
|
||||||
|
|
||||||
This flake has been adapted from the `app` template found in:
|
To generate a copy of this template elsewhere, refer to:
|
||||||
https://github.com/nix-community/poetry2nix
|
https://github.com/jrpotter/bootstrap
|
||||||
|
|
||||||
To generate a copy of this template elsewhere, run:
|
|
||||||
$> bootstrap poetry
|
|
||||||
'';
|
'';
|
||||||
|
|
||||||
inputs = {
|
inputs = {
|
||||||
|
@ -78,8 +75,9 @@
|
||||||
};
|
};
|
||||||
|
|
||||||
devShells.default = pkgs.mkShell {
|
devShells.default = pkgs.mkShell {
|
||||||
packages = [
|
packages = with pkgs; [
|
||||||
pkgs.poetry
|
poetry
|
||||||
|
postgresql_15
|
||||||
] ++ (with pkgs.python311Packages; [
|
] ++ (with pkgs.python311Packages; [
|
||||||
black
|
black
|
||||||
debugpy
|
debugpy
|
||||||
|
|
|
@ -0,0 +1,36 @@
|
||||||
|
CREATE SCHEMA IF NOT EXISTS coach_scraper;
|
||||||
|
|
||||||
|
DO $$
|
||||||
|
BEGIN
|
||||||
|
EXECUTE format(
|
||||||
|
'ALTER TABLE IF EXISTS coach_scraper.export '
|
||||||
|
'RENAME TO export_%s;',
|
||||||
|
TRUNC(EXTRACT(EPOCH FROM CURRENT_TIMESTAMP), 0)
|
||||||
|
);
|
||||||
|
END;
|
||||||
|
$$ LANGUAGE plpgsql;
|
||||||
|
|
||||||
|
CREATE TABLE coach_scraper.export
|
||||||
|
( username VARCHAR(255) NOT NULL
|
||||||
|
, site VARCHAR(16) NOT NULL
|
||||||
|
, rapid INT
|
||||||
|
, blitz INT
|
||||||
|
, bullet INT
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TEMPORARY TABLE pg_temp.coach_scraper_export (data JSONB);
|
||||||
|
|
||||||
|
SELECT format(
|
||||||
|
$$COPY pg_temp.coach_scraper_export (data) from %L$$,
|
||||||
|
:export
|
||||||
|
) \gexec
|
||||||
|
|
||||||
|
INSERT INTO coach_scraper.export
|
||||||
|
SELECT
|
||||||
|
data->>'username',
|
||||||
|
data->>'site',
|
||||||
|
(data->>'rapid')::INT,
|
||||||
|
(data->>'blitz')::INT,
|
||||||
|
(data->>'bullet')::INT
|
||||||
|
FROM
|
||||||
|
pg_temp.coach_scraper_export;
|
Loading…
Reference in New Issue