From 63764a22c49a5240ca1d3347d9245c78f3f69612 Mon Sep 17 00:00:00 2001
From: Joshua Potter
Date: Mon, 4 Dec 2023 15:08:17 -0700
Subject: [PATCH] Transition to a CSV; Postgres can handle that better.

---
 README.md       | 27 +++++++++-------
 app/__main__.py | 44 ++++++++++++++++----------
 app/chesscom.py | 35 ++++++++++++++++++++-
 app/exporter.py | 24 +++++++++++---
 app/lichess.py  | 84 +++++++++++++++++++++++++++++++------------------
 sql/export.sql  | 34 ++++++++++++++------
 sql/init.sql    | 25 +++------------
 7 files changed, 179 insertions(+), 94 deletions(-)

diff --git a/README.md b/README.md
index ee73333..6ba332f 100644
--- a/README.md
+++ b/README.md
@@ -29,17 +29,19 @@ data
 ├── ...
 ```
 
-## Usage
+## Quickstart
 
 If you have nix available, run:
 ```bash
-$ nix run . -- --user-agent <user-agent> -s <site>
+$ nix run . -- --user-agent <user-agent> -s <site> [-s <site> ...]
 ```
 If not, ensure you have [poetry](https://python-poetry.org/) on your machine
 and instead run the following:
 ```bash
-$ poetry run python3 -m app -u <user-agent> -s <site>
+$ poetry run python3 -m app -u <user-agent> -s <site> [-s <site> ...]
 ```
+After running (this may take several hours), a new CSV will be generated at
+`data/export.csv` containing all scraped content from the specified `<site>`s.
 
 ## Database
 
@@ -70,13 +72,16 @@
 To load all exported coach data into a local Postgres instance, use the
 provided `sql/init.sql` script:
 ```bash
 $ psql -h @scraper -f sql/init.sql
 ```
-Next, concatenate all exported content and dump into the newly created table:
+Next, dump the exported data into the newly created table:
 ```bash
-$ cat data/{chesscom,lichess}/export.json > data/export.json
-$ psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.json'"
+$ psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.csv'"
+```
+Re-running the `sql/export.sql` script will first back up the existing
+`coach_scraper.export` table and then upsert the newly scraped data. You can
+view all backups from the `psql` console like so:
+```
+postgres=# \dt coach_scraper.export*
 ```
-Re-running will automatically create backups and replace the coach data found
-in `coach_scraper.export`.
 
 ### E2E
 
@@ -85,10 +90,8 @@
 necessary to scrape coach data from our chess website and dump the results into
 the database in one fell swoop. Assuming our database is open with a socket
 connection available at `@scraper`:
 ```bash
-nix run . -- --user-agent <user-agent> -s chesscom -s lichess
-cat data/{chesscom,lichess}/export.json > data/export.json
-psql -h @scraper -f sql/init.sql
-psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.json'"
+$ nix run . -- --user-agent <user-agent> -s chesscom -s lichess
+$ psql -h @scraper -f sql/init.sql -f sql/export.sql -v export="'$PWD/data/export.csv'"
 ```
 
 ## Development
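For a quick look at what the new pipeline produces, here is a minimal sketch (not part of the patch) that peeks at the first few rows of `data/export.csv`; the column list is an assumption that mirrors the `writerow` call in `app/__main__.py` below:

```python
import csv

# Assumed column order; it mirrors the writerow(...) call in app/__main__.py.
COLUMNS = ["site", "username", "name", "image_url", "rapid", "blitz", "bullet"]

with open("data/export.csv", newline="") as f:
    for row in list(csv.reader(f))[:5]:
        print(dict(zip(COLUMNS, row)))
```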
diff --git a/app/__main__.py b/app/__main__.py
index 45a5762..361adf1 100644
--- a/app/__main__.py
+++ b/app/__main__.py
@@ -1,5 +1,6 @@
 import argparse
 import asyncio
+import csv
 import json
 
 import aiohttp
@@ -32,25 +33,34 @@ async def run():
     async with aiohttp.ClientSession(
         headers={"User-Agent": f"BoardWise coach-scraper ({args.user_agent})"}
     ) as session:
-        for site in set(args.site):
-            scraper, exporter_cls = None, None
+        with open("data/export.csv", "w") as f:
+            writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
+            for site in set(args.site):
+                scraper, exporter_cls = None, None
 
-            if site == Site.CHESSCOM.value:
-                scraper = ChesscomScraper(session)
-                exporter_cls = ChesscomExporter
-            elif site == Site.LICHESS.value:
-                scraper = LichessScraper(session)
-                exporter_cls = LichessExporter
+                if site == Site.CHESSCOM.value:
+                    scraper = ChesscomScraper(session)
+                    exporter_cls = ChesscomExporter
+                elif site == Site.LICHESS.value:
+                    scraper = LichessScraper(session)
+                    exporter_cls = LichessExporter
 
-            # Write out each coach data into NDJSON file.
-            dump = []
-            usernames = await scraper.scrape()
-            for username in usernames:
-                export = exporter_cls(username).export()
-                dump.append(f"{json.dumps(export)}\n")
-
-            with open(scraper.path_site_file("export.json"), "w") as f:
-                f.writelines(dump)
+                usernames = await scraper.scrape()
+                for username in usernames:
+                    export = exporter_cls(username).export()
+                    writer.writerow(
+                        [
+                            # This should match the order data is loaded in the
+                            # sql/export.sql script.
+                            export["site"],
+                            export["username"],
+                            export.get("name", ""),
+                            export.get("image_url", ""),
+                            export.get("rapid", ""),
+                            export.get("blitz", ""),
+                            export.get("bullet", ""),
+                        ]
+                    )
 
 
 def main():
diff --git a/app/chesscom.py b/app/chesscom.py
index 8209ba0..ae9ac30 100644
--- a/app/chesscom.py
+++ b/app/chesscom.py
@@ -5,7 +5,7 @@ import os.path
 from typing import List, Union
 
 import aiohttp
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, SoupStrainer
 
 from app.exporter import BaseExporter
 from app.repo import AnsiColor, Site
@@ -172,10 +172,27 @@ class Scraper(BaseScraper):
         return True
 
 
+def _profile_filter(elem, attrs):
+    """Includes only relevant segments of the `{username}.html` file."""
+    if "profile-header-info" in attrs.get("class", ""):
+        return True
+    if "profile-card-info" in attrs.get("class", ""):
+        return True
+
+
 class Exporter(BaseExporter):
     def __init__(self, username: str):
         super().__init__(site=Site.CHESSCOM.value, username=username)
 
+        self.profile_soup = None
+        try:
+            with open(self.path_coach_file(username, f"{username}.html"), "r") as f:
+                self.profile_soup = BeautifulSoup(
+                    f.read(), "lxml", parse_only=SoupStrainer(_profile_filter)
+                )
+        except FileNotFoundError:
+            pass
+
         self.stats_json = {}
         try:
             with open(self.path_coach_file(username, "stats.json"), "r") as f:
@@ -185,6 +202,22 @@ class Exporter(BaseExporter):
         except FileNotFoundError:
             pass
 
+    def export_name(self) -> Union[str, None]:
+        try:
+            name = self.profile_soup.find("div", class_="profile-card-name")
+            return name.get_text().strip()
+        except AttributeError:
+            return None
+
+    def export_image_url(self) -> Union[str, None]:
+        try:
+            div = self.profile_soup.find("div", class_="profile-header-avatar")
+            src = div.find("img").get("src", "")
+            if "images.chesscomfiles.com" in src:
+                return src
+        except AttributeError:
+            return None
+
     def export_rapid(self) -> Union[int, None]:
         return self.stats_json.get("rapid", {}).get("rating")
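To see why the `SoupStrainer` filters pay off, here is a standalone sketch (not part of the patch) run against a made-up fragment of a coach profile page; only the whitelisted segments survive parsing, so the rest of the page is never built into the tree:

```python
from bs4 import BeautifulSoup, SoupStrainer

# Made-up fragment standing in for a downloaded {username}.html file.
html = """
<div class="nav">boilerplate that never gets parsed</div>
<div class="profile-card-info">
  <div class="profile-card-name"> John Doe </div>
</div>
"""

def _profile_filter(elem, attrs):
    """Same shape as the filter in the patch: keep only profile segments."""
    if "profile-header-info" in attrs.get("class", ""):
        return True
    if "profile-card-info" in attrs.get("class", ""):
        return True

soup = BeautifulSoup(html, "lxml", parse_only=SoupStrainer(_profile_filter))
print(soup.find("div", class_="profile-card-name").get_text().strip())  # John Doe
print(soup.find("div", class_="nav"))  # None -- strained out before parsing
```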
diff --git a/app/exporter.py b/app/exporter.py
index c882482..9e6d200 100644
--- a/app/exporter.py
+++ b/app/exporter.py
@@ -1,4 +1,4 @@
-from typing import Union
+from typing import Any, Union
 
 from typing_extensions import TypedDict
 
@@ -6,6 +6,13 @@ from app.repo import AnsiColor, Repo
 
 
 class Export(TypedDict, total=False):
+    # The (site, username) pair makes up a unique key for each coach.
+    site: str
+    username: str
+    # The coach's real name.
+    name: str
+    # The profile image used on the source site.
+    image_url: str
     # The coach's rapid rating relative to the site they were sourced from.
     rapid: int
     # The coach's blitz rating relative to the site they were sourced from.
@@ -14,10 +21,9 @@ class Export(TypedDict, total=False):
     bullet: int
 
 
-def _insert(export: Export, key: str, value: any):
-    if value is None:
-        return
-    export[key] = value
+def _insert(export: Export, key: str, value: Any):
+    if value is not None:
+        export[key] = value
 
 
 class BaseExporter(Repo):
@@ -25,6 +31,12 @@ class BaseExporter(Repo):
         super().__init__(site)
         self.username = username
 
+    def export_name(self) -> Union[str, None]:
+        raise NotImplementedError()
+
+    def export_image_url(self) -> Union[str, None]:
+        raise NotImplementedError()
+
     def export_rapid(self) -> Union[int, None]:
         raise NotImplementedError()
 
@@ -40,6 +52,8 @@ class BaseExporter(Repo):
 
         _insert(export, "site", self.site)
         _insert(export, "username", self.username)
+        _insert(export, "name", self.export_name())
+        _insert(export, "image_url", self.export_image_url())
         _insert(export, "rapid", self.export_rapid())
         _insert(export, "blitz", self.export_blitz())
         _insert(export, "bullet", self.export_bullet())
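Since `Export` is declared with `total=False`, every key is optional. A short standalone sketch (not part of the patch, with the key set trimmed) of how `_insert` keeps `None` values out of the dict, which is why the CSV writer above falls back to `export.get(..., "")`:

```python
from typing import Any
from typing_extensions import TypedDict

class Export(TypedDict, total=False):
    site: str
    rapid: int

def _insert(export: Export, key: str, value: Any):
    if value is not None:
        export[key] = value

row: Export = {}
_insert(row, "site", "lichess")
_insert(row, "rapid", None)  # Dropped: missing ratings never become keys.
print(row)  # {'site': 'lichess'}
```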
diff --git a/app/lichess.py b/app/lichess.py
index b2af7b7..22cb86d 100644
--- a/app/lichess.py
+++ b/app/lichess.py
@@ -1,7 +1,7 @@
 import asyncio
 import os
 import os.path
-from typing import List
+from typing import List, Union
 
 import aiohttp
 from bs4 import BeautifulSoup, SoupStrainer
@@ -97,9 +97,9 @@ class Scraper(BaseScraper):
         soup = BeautifulSoup(response, "lxml")
         members = soup.find_all("article", class_="coach-widget")
         for member in members:
-            anchor = member.find("a", class_="overlay")
-            if anchor:
-                href = anchor.get("href")
+            a = member.find("a", class_="overlay")
+            if a:
+                href = a.get("href")
                 username = href[len("/coach/") :]
                 usernames.append(username)
@@ -163,8 +163,16 @@ class Scraper(BaseScraper):
         return True
 
 
+def _profile_filter(elem, attrs):
+    """Includes only relevant segments of the `{username}.html` file."""
+    if "coach-widget" in attrs.get("class", ""):
+        return True
+
+
 def _stats_filter(elem, attrs):
     """Includes only relevant segments of the `stats.html` file."""
+    if "profile-side" in attrs.get("class", ""):
+        return True
     if "sub-ratings" in attrs.get("class", ""):
         return True
 
@@ -173,43 +181,59 @@ class Exporter(BaseExporter):
     def __init__(self, username: str):
         super().__init__(site=Site.LICHESS.value, username=username)
 
-        self.stats_soup = None
+        self.profile_soup = None
         try:
-            with open(self.path_coach_file(username, "stats.html"), "r") as f:
-                stats_strainer = SoupStrainer(_stats_filter)
-                self.stats_soup = BeautifulSoup(
-                    f.read(), "lxml", parse_only=stats_strainer
+            with open(self.path_coach_file(username, f"{username}.html"), "r") as f:
+                self.profile_soup = BeautifulSoup(
+                    f.read(), "lxml", parse_only=SoupStrainer(_profile_filter)
                 )
         except FileNotFoundError:
             pass
 
-    def export_rapid(self):
+        self.stats_soup = None
+        try:
+            with open(self.path_coach_file(username, "stats.html"), "r") as f:
+                self.stats_soup = BeautifulSoup(
+                    f.read(), "lxml", parse_only=SoupStrainer(_stats_filter)
+                )
+        except FileNotFoundError:
+            pass
+
+    def export_name(self) -> Union[str, None]:
+        try:
+            profile_side = self.stats_soup.find("div", class_="profile-side")
+            user_infos = profile_side.find("div", class_="user-infos")
+            name = user_infos.find("strong", class_="name")
+            return name.get_text().strip()
+        except AttributeError:
+            return None
+
+    def export_image_url(self) -> Union[str, None]:
+        try:
+            picture = self.profile_soup.find("img", class_="picture")
+            src = picture.get("src", "")
+            if "image.lichess1.org" in src:
+                return src
+        except AttributeError:
+            return None
+
+    def export_rapid(self) -> Union[int, None]:
         return self._find_rating("rapid")
 
-    def export_blitz(self):
+    def export_blitz(self) -> Union[int, None]:
         return self._find_rating("blitz")
 
-    def export_bullet(self):
+    def export_bullet(self) -> Union[int, None]:
         return self._find_rating("bullet")
 
-    def _find_rating(self, name):
-        if self.stats_soup is None:
-            return None
-
-        anchor = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
-        if anchor is None:
-            return None
-        rating = anchor.find("rating")
-        if rating is None:
-            return None
-        strong = rating.find("strong")
-        if strong is None:
-            return None
-        value = strong.get_text()
-        if value[-1] == "?":
-            value = value[:-1]
-
+    def _find_rating(self, name) -> Union[int, None]:
         try:
+            a = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
+            rating = a.find("rating")
+            strong = rating.find("strong")
+            value = strong.get_text()
+            if value[-1] == "?":
+                value = value[:-1]
             return int(value)
-        except ValueError:
+        except (AttributeError, ValueError):
             return None
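Lichess marks provisional ratings with a trailing `?`. A standalone sketch (not part of the patch) of the trimming logic in `_find_rating`, run against a made-up stats fragment in the shape the exporter expects:

```python
from bs4 import BeautifulSoup

# Made-up fragment: an anchor wrapping a <rating><strong>...</strong></rating>
# element, as _find_rating looks it up by the perf-specific href.
html = '<a href="/@/someuser/perf/rapid"><rating><strong>2105?</strong></rating></a>'
soup = BeautifulSoup(html, "lxml")

a = soup.find("a", href="/@/someuser/perf/rapid")
value = a.find("rating").find("strong").get_text()
if value[-1] == "?":
    value = value[:-1]  # Strip the provisional marker before conversion.
print(int(value))  # 2105
```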
diff --git a/sql/export.sql b/sql/export.sql
index b5079e0..be0aec0 100644
--- a/sql/export.sql
+++ b/sql/export.sql
@@ -7,31 +7,47 @@ DO $$
   END;
 $$ LANGUAGE plpgsql;
 
-CREATE TEMPORARY TABLE pg_temp.coach_scraper_export (data JSONB);
+-- This should match the order data is written in the app/__main__.py
+-- script.
+CREATE TEMPORARY TABLE pg_temp.coach_scraper_export
+  ( site TEXT
+  , username TEXT
+  , name TEXT
+  , image_url TEXT
+  , rapid TEXT
+  , blitz TEXT
+  , bullet TEXT
+  );
 
 SELECT format(
-  $$COPY pg_temp.coach_scraper_export (data) from %L$$,
+  $$COPY pg_temp.coach_scraper_export FROM %L WITH (FORMAT CSV)$$,
   :export
 ) \gexec
 
 INSERT INTO coach_scraper.export
-  ( username
-  , site
+  ( site
+  , username
+  , name
+  , image_url
   , rapid
   , blitz
   , bullet
   )
 SELECT
-  data->>'username',
-  data->>'site',
-  (data->>'rapid')::INT,
-  (data->>'blitz')::INT,
-  (data->>'bullet')::INT
+  site,
+  username,
+  name,
+  image_url,
+  rapid::INT,
+  blitz::INT,
+  bullet::INT
 FROM
   pg_temp.coach_scraper_export
 ON CONFLICT
   (site, username)
 DO UPDATE SET
+  name = EXCLUDED.name,
+  image_url = EXCLUDED.image_url,
   rapid = EXCLUDED.rapid,
   blitz = EXCLUDED.blitz,
   bullet = EXCLUDED.bullet;
diff --git a/sql/init.sql b/sql/init.sql
index 8d19d93..ccefcde 100644
--- a/sql/init.sql
+++ b/sql/init.sql
@@ -1,9 +1,13 @@
 CREATE SCHEMA IF NOT EXISTS coach_scraper;
 
-CREATE TABLE IF NOT EXISTS coach_scraper.export
+DROP TABLE IF EXISTS coach_scraper.export;
+
+CREATE TABLE coach_scraper.export
   ( id SERIAL PRIMARY KEY
   , site VARCHAR(16) NOT NULL
   , username VARCHAR(255) NOT NULL
+  , name VARCHAR(255)
+  , image_url TEXT
   , rapid INT
   , blitz INT
   , bullet INT
@@ -15,22 +19,3 @@
 ON
   coach_scraper.export
 USING
   BTREE (site, username);
-
-DO $$
-  BEGIN
-    IF NOT EXISTS (
-      SELECT 1
-      FROM information_schema.constraint_column_usage
-      WHERE table_schema = 'coach_scraper'
-      AND table_name = 'export'
-      AND constraint_name = 'site_username_unique'
-    ) THEN
-      EXECUTE 'ALTER TABLE
-        coach_scraper.export
-        ADD CONSTRAINT
-        site_username_unique
-        UNIQUE USING INDEX
-        site_username_unique';
-    END IF;
-  END;
-$$ LANGUAGE plpgsql;
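As an optional end-to-end sanity check after `sql/init.sql` and `sql/export.sql` have run, something like the following can confirm rows landed per site. This is not part of the patch; it assumes `psycopg2` is installed (not a project dependency) and reuses the README's `@scraper` socket, so adjust the DSN to your setup:

```python
import psycopg2  # Assumed available; not a project dependency.

# Hypothetical connection settings; swap in whatever your local Postgres uses.
conn = psycopg2.connect("host=@scraper dbname=postgres user=postgres")
with conn, conn.cursor() as cur:
    cur.execute("SELECT site, COUNT(*) FROM coach_scraper.export GROUP BY site;")
    for site, count in cur.fetchall():
        print(f"{site}: {count} coaches")
conn.close()
```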