Transition to a CSV; Postgres can handle that better.

pull/9/head
Joshua Potter 2023-12-04 15:08:17 -07:00
parent ec94a16140
commit 63764a22c4
7 changed files with 179 additions and 94 deletions

View File

@@ -29,17 +29,19 @@ data
├── ...
```
## Usage
## Quickstart
If you have nix available, run:
```bash
$ nix run . -- --user-agent <your-email> -s <site>
$ nix run . -- --user-agent <your-email> -s <site> [-s <site> ...]
```
If not, ensure you have [poetry](https://python-poetry.org/) installed on your
machine and run the following instead:
```bash
$ poetry run python3 -m app -u <your-email> -s <site>
$ poetry run python3 -m app -u <your-email> -s <site> [-s <site> ...]
```
Once the scraper finishes (this may take several hours), a new CSV will be generated at
`data/export.csv` containing all scraped content from the specified `<site>`s.
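The file has no header row; the column order matches what `app/__main__.py` writes and what `sql/export.sql` later expects. A quick way to inspect it (a minimal sketch, not part of the repo):
```python
import csv

# Column order written by app/__main__.py and loaded by sql/export.sql.
COLUMNS = ["site", "username", "name", "image_url", "rapid", "blitz", "bullet"]

with open("data/export.csv", newline="") as f:
    for row in csv.reader(f):
        # Missing fields are written as empty strings; ratings stay text until
        # Postgres casts them to INT during the load.
        print(dict(zip(COLUMNS, row)))
```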
## Database
@@ -70,13 +72,16 @@ To load all exported coach data into a local Postgres instance, use the provided
```bash
$ psql -h @scraper -f sql/init.sql
```
Next, concatenate all exported content and dump into the newly created table:
Next, dump exported data into the newly created table:
```bash
$ cat data/{chesscom,lichess}/export.json > data/export.json
$ psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.json'"
$ psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.csv'"
```
Re-running the `sql/export.sql` script will create a backup of the
`coach_scraper.export` table. It will then upsert the scraped data. You can view
all backups from the `psql` console like so:
```
postgres=# \dt coach_scraper.export*
```
Re-running will automatically create backups and replace the coach data found
in `coach_scraper.export`.
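Conceptually, a re-run is an upsert keyed on `(site, username)`: rows already present are overwritten with freshly scraped values and new coaches are inserted. A rough Python analogy of that behavior (illustrative only; the real work happens in `sql/export.sql`):
```python
# Illustrative only: the real upsert is the ON CONFLICT clause in sql/export.sql.
existing = {("lichess", "coach_a"): {"rapid": 2100}}
scraped = {
    ("lichess", "coach_a"): {"rapid": 2150},   # already present: overwritten
    ("chesscom", "coach_b"): {"rapid": 1900},  # new: inserted
}

for key, row in scraped.items():
    existing[key] = row

print(existing)
```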
### E2E
@@ -85,10 +90,8 @@ necessary to scrape coach data from our chess website and dump the results into
the database in one fell swoop. Assuming our database is open with a socket
connection available at `@scraper`:
```bash
nix run . -- --user-agent <your-email> -s chesscom -s lichess
cat data/{chesscom,lichess}/export.json > data/export.json
psql -h @scraper -f sql/init.sql
psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.json'"
$ nix run . -- --user-agent <your-email> -s chesscom -s lichess
$ psql -h @scraper -f sql/init.sql -f sql/export.sql -v export="'$PWD/data/export.csv'"
```
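To sanity-check the load afterwards, something like the following works. This is a sketch only: it assumes the `psycopg` package is installed (it is not a dependency of this repo) and that the same `@scraper` socket used above is reachable; adjust the connection string to your setup.
```python
import psycopg  # assumed to be installed separately; not a dependency of this repo

# "@scraper" mirrors the socket passed to psql via -h above.
with psycopg.connect("host=@scraper") as conn:
    with conn.cursor() as cur:
        cur.execute("SELECT site, COUNT(*) FROM coach_scraper.export GROUP BY site;")
        for site, count in cur.fetchall():
            print(f"{site}: {count} coaches")
```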
## Development

View File

@@ -1,5 +1,6 @@
import argparse
import asyncio
import csv
import json
import aiohttp
@@ -32,25 +33,34 @@ async def run():
async with aiohttp.ClientSession(
headers={"User-Agent": f"BoardWise coach-scraper ({args.user_agent})"}
) as session:
for site in set(args.site):
scraper, exporter_cls = None, None
with open("data/export.csv", "w") as f:
writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
for site in set(args.site):
scraper, exporter_cls = None, None
if site == Site.CHESSCOM.value:
scraper = ChesscomScraper(session)
exporter_cls = ChesscomExporter
elif site == Site.LICHESS.value:
scraper = LichessScraper(session)
exporter_cls = LichessExporter
if site == Site.CHESSCOM.value:
scraper = ChesscomScraper(session)
exporter_cls = ChesscomExporter
elif site == Site.LICHESS.value:
scraper = LichessScraper(session)
exporter_cls = LichessExporter
# Write out each coach's data to an NDJSON file.
dump = []
usernames = await scraper.scrape()
for username in usernames:
export = exporter_cls(username).export()
dump.append(f"{json.dumps(export)}\n")
with open(scraper.path_site_file("export.json"), "w") as f:
f.writelines(dump)
usernames = await scraper.scrape()
for username in usernames:
export = exporter_cls(username).export()
writer.writerow(
[
# This should match the order data is loaded in the
# sql/export.sql script.
export["site"],
export["username"],
export.get("name", ""),
export.get("image_url", ""),
export.get("rapid", ""),
export.get("blitz", ""),
export.get("bullet", ""),
]
)
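The row layout above is the contract between the scraper and `sql/export.sql`: both sides must agree on the column order, and absent fields are emitted as empty strings. A small standalone sketch of that mapping (hypothetical data; `to_row` is not a helper in this repo):
```python
import csv
import sys

# Must stay in sync with the column order in sql/export.sql.
COLUMNS = ["site", "username", "name", "image_url", "rapid", "blitz", "bullet"]

def to_row(export: dict) -> list:
    # Keys the exporter could not populate (e.g. no bullet rating) become "".
    return [export.get(column, "") for column in COLUMNS]

writer = csv.writer(sys.stdout, quoting=csv.QUOTE_MINIMAL)
writer.writerow(to_row({"site": "lichess", "username": "coach_a", "rapid": 2150}))
```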
def main():

View File

@@ -5,7 +5,7 @@ import os.path
from typing import List, Union
import aiohttp
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, SoupStrainer
from app.exporter import BaseExporter
from app.repo import AnsiColor, Site
@@ -172,10 +172,27 @@ class Scraper(BaseScraper):
return True
def _profile_filter(elem, attrs):
"""Includes only relevant segments of the `{username}.html` file."""
if "profile-header-info" in attrs.get("class", ""):
return True
if "profile-card-info" in attrs.get("class", ""):
return True
class Exporter(BaseExporter):
def __init__(self, username: str):
super().__init__(site=Site.CHESSCOM.value, username=username)
self.profile_soup = None
try:
with open(self.path_coach_file(username, f"{username}.html"), "r") as f:
self.profile_soup = BeautifulSoup(
f.read(), "lxml", parse_only=SoupStrainer(_profile_filter)
)
except FileNotFoundError:
pass
self.stats_json = {}
try:
with open(self.path_coach_file(username, "stats.json"), "r") as f:
@@ -185,6 +202,22 @@ class Exporter(BaseExporter):
except FileNotFoundError:
pass
def export_name(self) -> Union[str, None]:
try:
name = self.profile_soup.find("div", class_="profile-card-name")
return name.get_text().strip()
except AttributeError:
return None
def export_image_url(self) -> Union[str, None]:
try:
div = self.profile_soup.find("div", class_="profile-header-avatar")
src = div.find("img").get("src", "")
if "images.chesscomfiles.com" in src:
return src
except AttributeError:
return None
def export_rapid(self) -> Union[int, None]:
return self.stats_json.get("rapid", {}).get("rating")
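The `SoupStrainer` filters above keep parsing cheap: only subtrees whose `class` matches are materialized, everything else in the downloaded HTML is skipped. A minimal sketch of the pattern on made-up markup (the class names mirror the ones used above):
```python
from bs4 import BeautifulSoup, SoupStrainer

# Made-up HTML standing in for a downloaded {username}.html file.
HTML = """
<div class="profile-card-info">
  <div class="profile-card-name"> Jane Doe </div>
</div>
<div class="unrelated">never parsed</div>
"""

def profile_filter(elem, attrs):
    # SoupStrainer calls this with each tag name and its attribute dict while
    # parsing; returning True keeps the element and its children in the soup.
    return "profile-card-info" in attrs.get("class", "")

soup = BeautifulSoup(HTML, "lxml", parse_only=SoupStrainer(profile_filter))
print(soup.find("div", class_="profile-card-name").get_text().strip())  # Jane Doe
```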

View File

@@ -1,4 +1,4 @@
from typing import Union
from typing import Any, Union
from typing_extensions import TypedDict
@@ -6,6 +6,13 @@ from app.repo import AnsiColor, Repo
class Export(TypedDict, total=False):
# The (site, username) pair makes up a unique key for each coach.
site: str
username: str
# The coach's real name.
name: str
# The profile image used on the source site.
image_url: str
# The coach's rapid rating relative to the site they were sourced from.
rapid: int
# The coach's blitz rating relative to the site they were sourced from.
@@ -14,10 +21,9 @@ class Export(TypedDict, total=False):
bullet: int
def _insert(export: Export, key: str, value: any):
if value is None:
return
export[key] = value
def _insert(export: Export, key: str, value: Any):
if value is not None:
export[key] = value
class BaseExporter(Repo):
@@ -25,6 +31,12 @@ class BaseExporter(Repo):
super().__init__(site)
self.username = username
def export_name(self) -> Union[str, None]:
raise NotImplementedError()
def export_image_url(self) -> Union[str, None]:
raise NotImplementedError()
def export_rapid(self) -> Union[int, None]:
raise NotImplementedError()
@@ -40,6 +52,8 @@ class BaseExporter(Repo):
_insert(export, "site", self.site)
_insert(export, "username", self.username)
_insert(export, "name", self.export_name())
_insert(export, "image_url", self.export_image_url())
_insert(export, "rapid", self.export_rapid())
_insert(export, "blitz", self.export_blitz())
_insert(export, "bullet", self.export_bullet())

View File

@@ -1,7 +1,7 @@
import asyncio
import os
import os.path
from typing import List
from typing import List, Union
import aiohttp
from bs4 import BeautifulSoup, SoupStrainer
@@ -97,9 +97,9 @@ class Scraper(BaseScraper):
soup = BeautifulSoup(response, "lxml")
members = soup.find_all("article", class_="coach-widget")
for member in members:
anchor = member.find("a", class_="overlay")
if anchor:
href = anchor.get("href")
a = member.find("a", class_="overlay")
if a:
href = a.get("href")
username = href[len("/coach/") :]
usernames.append(username)
@@ -163,8 +163,16 @@ class Scraper(BaseScraper):
return True
def _profile_filter(elem, attrs):
"""Includes only relevant segments of the `{username}.html` file."""
if "coach-widget" in attrs.get("class", ""):
return True
def _stats_filter(elem, attrs):
"""Includes only relevant segments of the `stats.html` file."""
if "profile-side" in attrs.get("class", ""):
return True
if "sub-ratings" in attrs.get("class", ""):
return True
@@ -173,43 +181,59 @@ class Exporter(BaseExporter):
def __init__(self, username: str):
super().__init__(site=Site.LICHESS.value, username=username)
self.stats_soup = None
self.profile_soup = None
try:
with open(self.path_coach_file(username, "stats.html"), "r") as f:
stats_strainer = SoupStrainer(_stats_filter)
self.stats_soup = BeautifulSoup(
f.read(), "lxml", parse_only=stats_strainer
with open(self.path_coach_file(username, f"{username}.html"), "r") as f:
self.profile_soup = BeautifulSoup(
f.read(), "lxml", parse_only=SoupStrainer(_profile_filter)
)
except FileNotFoundError:
pass
def export_rapid(self):
self.stats_soup = None
try:
with open(self.path_coach_file(username, "stats.html"), "r") as f:
self.stats_soup = BeautifulSoup(
f.read(), "lxml", parse_only=SoupStrainer(_stats_filter)
)
except FileNotFoundError:
pass
def export_name(self) -> Union[str, None]:
try:
profile_side = self.stats_soup.find("div", class_="profile-side")
user_infos = profile_side.find("div", class_="user-infos")
name = user_infos.find("strong", class_="name")
return name.get_text().strip()
except AttributeError:
return None
def export_image_url(self) -> Union[str, None]:
try:
picture = self.profile_soup.find("img", class_="picture")
src = picture.get("src", "")
if "image.lichess1.org" in src:
return src
except AttributeError:
return None
def export_rapid(self) -> Union[int, None]:
return self._find_rating("rapid")
def export_blitz(self):
def export_blitz(self) -> Union[int, None]:
return self._find_rating("blitz")
def export_bullet(self):
def export_bullet(self) -> Union[int, None]:
return self._find_rating("bullet")
def _find_rating(self, name):
if self.stats_soup is None:
return None
anchor = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
if anchor is None:
return None
rating = anchor.find("rating")
if rating is None:
return None
strong = rating.find("strong")
if strong is None:
return None
value = strong.get_text()
if value[-1] == "?":
value = value[:-1]
def _find_rating(self, name) -> Union[int, None]:
try:
a = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
rating = a.find("rating")
strong = rating.find("strong")
value = strong.get_text()
if value[-1] == "?":
value = value[:-1]
return int(value)
except ValueError:
except (AttributeError, ValueError):
return None
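For reference, `_find_rating` walks markup shaped roughly like the snippet below; provisional ratings carry a trailing `?`, which is stripped before the integer conversion (made-up HTML, selectors taken from the code above):
```python
from bs4 import BeautifulSoup

# Made-up fragment shaped like the sub-ratings block _find_rating expects.
HTML = '<a href="/@/coach_a/perf/rapid"><rating><strong>2150?</strong></rating></a>'

soup = BeautifulSoup(HTML, "lxml")
strong = soup.find("a", href="/@/coach_a/perf/rapid").find("rating").find("strong")
value = strong.get_text()
if value.endswith("?"):  # provisional rating
    value = value[:-1]
print(int(value))  # 2150
```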

View File

@@ -7,31 +7,47 @@ DO $$
END;
$$ LANGUAGE plpgsql;
CREATE TEMPORARY TABLE pg_temp.coach_scraper_export (data JSONB);
-- This should match the order data is written in the app/__main__.py
-- script.
CREATE TEMPORARY TABLE pg_temp.coach_scraper_export
( site TEXT
, username TEXT
, name TEXT
, image_url TEXT
, rapid TEXT
, blitz TEXT
, bullet TEXT
);
SELECT format(
$$COPY pg_temp.coach_scraper_export (data) from %L$$,
$$COPY pg_temp.coach_scraper_export FROM %L WITH (FORMAT CSV)$$,
:export
) \gexec
INSERT INTO coach_scraper.export
( username
, site
( site
, username
, name
, image_url
, rapid
, blitz
, bullet
)
SELECT
data->>'username',
data->>'site',
(data->>'rapid')::INT,
(data->>'blitz')::INT,
(data->>'bullet')::INT
site,
username,
name,
image_url,
rapid::INT,
blitz::INT,
bullet::INT
FROM
pg_temp.coach_scraper_export
ON CONFLICT
(site, username)
DO UPDATE SET
name = EXCLUDED.name,
image_url = EXCLUDED.image_url,
rapid = EXCLUDED.rapid,
blitz = EXCLUDED.blitz,
bullet = EXCLUDED.bullet;

View File

@@ -1,9 +1,13 @@
CREATE SCHEMA IF NOT EXISTS coach_scraper;
CREATE TABLE IF NOT EXISTS coach_scraper.export
DROP TABLE IF EXISTS coach_scraper.export;
CREATE TABLE coach_scraper.export
( id SERIAL PRIMARY KEY
, site VARCHAR(16) NOT NULL
, username VARCHAR(255) NOT NULL
, name VARCHAR(255)
, image_url TEXT
, rapid INT
, blitz INT
, bullet INT
@@ -15,22 +19,3 @@ ON
coach_scraper.export
USING
BTREE (site, username);
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1
FROM information_schema.constraint_column_usage
WHERE table_schema = 'coach_scraper'
AND table_name = 'export'
AND constraint_name = 'site_username_unique'
) THEN
EXECUTE 'ALTER TABLE
coach_scraper.export
ADD CONSTRAINT
site_username_unique
UNIQUE USING INDEX
site_username_unique';
END IF;
END;
$$ LANGUAGE plpgsql;