Transition to a CSV; Postgres can handle that better.

pull/9/head
Joshua Potter 2023-12-04 15:08:17 -07:00
parent ec94a16140
commit 63764a22c4
7 changed files with 179 additions and 94 deletions

README.md

@@ -29,17 +29,19 @@ data
 ├── ...
 ```
-## Usage
+## Quickstart
 If you have nix available, run:
 ```bash
-$ nix run . -- --user-agent <your-email> -s <site>
+$ nix run . -- --user-agent <your-email> -s <site> [-s <site> ...]
 ```
 If not, ensure you have [poetry](https://python-poetry.org/) on your machine and
 instead run the following:
 ```bash
-$ poetry run python3 -m app -u <your-email> -s <site>
+$ poetry run python3 -m app -u <your-email> -s <site> [-s <site> ...]
 ```
+After running (this may take several hours), a new CSV will be generated at
+`data/export.csv` containing all scraped content from the specified `<site>`s.
 
 ## Database
@@ -70,13 +72,16 @@ To load all exported coach data into a local Postgres instance, use the provided
 ```bash
 $ psql -h @scraper -f sql/init.sql
 ```
-Next, concatenate all exported content and dump into the newly created table:
+Next, dump exported data into the newly created table:
 ```bash
-$ cat data/{chesscom,lichess}/export.json > data/export.json
-$ psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.json'"
+$ psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.csv'"
 ```
-Re-running will automatically create backups and replace the coach data found
-in `coach_scraper.export`.
+Re-running the `sql/export.sql` script will create a backup of the
+`coach_scraper.export` table. It will then upsert the scraped data. You can view
+all backups from the `psql` console like so:
+```
+postgres=# \dt coach_scraper.export*
+```
 
 ### E2E
@@ -85,10 +90,8 @@ necessary to scrape coach data from our chess website and dump the results into
 the database in one fell swoop. Assuming our database is open with a socket
 connection available at `@scraper`:
 ```bash
-nix run . -- --user-agent <your-email> -s chesscom -s lichess
-cat data/{chesscom,lichess}/export.json > data/export.json
-psql -h @scraper -f sql/init.sql
-psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.json'"
+$ nix run . -- --user-agent <your-email> -s chesscom -s lichess
+$ psql -h @scraper -f sql/init.sql -f sql/export.sql -v export="'$PWD/data/export.csv'"
 ```
 ## Development
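Note on the new Quickstart flow: `data/export.csv` is written without a header row, so downstream consumers need to know the column order. A minimal sketch of reading it back with Python's stdlib, assuming the column order used by `app/__main__.py` below:

```python
import csv

# Mirrors the writerow() order in app/__main__.py (no header row is written).
COLUMNS = ["site", "username", "name", "image_url", "rapid", "blitz", "bullet"]

with open("data/export.csv", newline="") as f:
    for row in csv.reader(f):
        coach = dict(zip(COLUMNS, row))
        print(coach["site"], coach["username"], coach["rapid"] or "<unrated>")
```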

app/__main__.py

@@ -1,5 +1,6 @@
 import argparse
 import asyncio
+import csv
 import json
 
 import aiohttp
@@ -32,25 +33,34 @@ async def run():
     async with aiohttp.ClientSession(
         headers={"User-Agent": f"BoardWise coach-scraper ({args.user_agent})"}
     ) as session:
-        for site in set(args.site):
-            scraper, exporter_cls = None, None
-            if site == Site.CHESSCOM.value:
-                scraper = ChesscomScraper(session)
-                exporter_cls = ChesscomExporter
-            elif site == Site.LICHESS.value:
-                scraper = LichessScraper(session)
-                exporter_cls = LichessExporter
-
-            # Write out each coach data into NDJSON file.
-            dump = []
-            usernames = await scraper.scrape()
-            for username in usernames:
-                export = exporter_cls(username).export()
-                dump.append(f"{json.dumps(export)}\n")
-
-            with open(scraper.path_site_file("export.json"), "w") as f:
-                f.writelines(dump)
+        with open("data/export.csv", "w") as f:
+            writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
+            for site in set(args.site):
+                scraper, exporter_cls = None, None
+                if site == Site.CHESSCOM.value:
+                    scraper = ChesscomScraper(session)
+                    exporter_cls = ChesscomExporter
+                elif site == Site.LICHESS.value:
+                    scraper = LichessScraper(session)
+                    exporter_cls = LichessExporter
+
+                usernames = await scraper.scrape()
+                for username in usernames:
+                    export = exporter_cls(username).export()
+                    writer.writerow(
+                        [
+                            # This should match the order data is loaded in the
+                            # sql/export.sql script.
+                            export["site"],
+                            export["username"],
+                            export.get("name", ""),
+                            export.get("image_url", ""),
+                            export.get("rapid", ""),
+                            export.get("blitz", ""),
+                            export.get("bullet", ""),
+                        ]
+                    )
 
 
 def main():
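Note on `csv.QUOTE_MINIMAL`: scraped names can contain commas or quotes, and minimal quoting only escapes the fields that need it, which keeps the file compatible with Postgres' `COPY ... WITH (FORMAT CSV)`. A small illustration with hypothetical values:

```python
import csv
import io

buf = io.StringIO()
writer = csv.writer(buf, quoting=csv.QUOTE_MINIMAL)
# Hypothetical coach row; the embedded comma and quotes force quoting/escaping.
writer.writerow(["chesscom", "alice", 'Alice "Rook" Doe, NM', "", 1850, "", ""])
print(buf.getvalue(), end="")
# chesscom,alice,"Alice ""Rook"" Doe, NM",,1850,,
```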

app/chesscom.py

@@ -5,7 +5,7 @@ import os.path
 from typing import List, Union
 
 import aiohttp
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, SoupStrainer
 
 from app.exporter import BaseExporter
 from app.repo import AnsiColor, Site
@@ -172,10 +172,27 @@ class Scraper(BaseScraper):
         return True
 
 
+def _profile_filter(elem, attrs):
+    """Includes only relevant segments of the `{username}.html` file."""
+    if "profile-header-info" in attrs.get("class", ""):
+        return True
+    if "profile-card-info" in attrs.get("class", ""):
+        return True
+
+
 class Exporter(BaseExporter):
     def __init__(self, username: str):
         super().__init__(site=Site.CHESSCOM.value, username=username)
+        self.profile_soup = None
+        try:
+            with open(self.path_coach_file(username, f"{username}.html"), "r") as f:
+                self.profile_soup = BeautifulSoup(
+                    f.read(), "lxml", parse_only=SoupStrainer(_profile_filter)
+                )
+        except FileNotFoundError:
+            pass
+
         self.stats_json = {}
         try:
             with open(self.path_coach_file(username, "stats.json"), "r") as f:
@@ -185,6 +202,22 @@ class Exporter(BaseExporter):
         except FileNotFoundError:
             pass
 
+    def export_name(self) -> Union[str, None]:
+        try:
+            name = self.profile_soup.find("div", class_="profile-card-name")
+            return name.get_text().strip()
+        except AttributeError:
+            return None
+
+    def export_image_url(self) -> Union[str, None]:
+        try:
+            div = self.profile_soup.find("div", class_="profile-header-avatar")
+            src = div.find("img").get("src", "")
+            if "images.chesscomfiles.com" in src:
+                return src
+        except AttributeError:
+            return None
+
     def export_rapid(self) -> Union[int, None]:
         return self.stats_json.get("rapid", {}).get("rating")
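For context on `_profile_filter`: a callable passed to `SoupStrainer` is invoked with each tag's name and attribute dict while parsing, and only matching subtrees (plus their children) are kept, which trims memory on large coach pages. A standalone sketch with made-up HTML:

```python
from bs4 import BeautifulSoup, SoupStrainer

HTML = """
<div class="profile-header-info">
  <div class="profile-card-name"> Jane Doe </div>
</div>
<div class="sidebar">never parsed</div>
"""

def keep_profile(name, attrs):
    # bs4 calls this with the tag name and its attributes; returning True
    # keeps the tag and everything inside it.
    return "profile-header-info" in attrs.get("class", "")

soup = BeautifulSoup(HTML, "lxml", parse_only=SoupStrainer(keep_profile))
print(soup.find("div", class_="profile-card-name").get_text().strip())  # Jane Doe
```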

app/exporter.py

@@ -1,4 +1,4 @@
-from typing import Union
+from typing import Any, Union
 
 from typing_extensions import TypedDict
 
@@ -6,6 +6,13 @@ from app.repo import AnsiColor, Repo
 
 class Export(TypedDict, total=False):
+    # The (site, username) make up a unique key for each coach.
+    site: str
+    username: str
+    # The coach's real name.
+    name: str
+    # The profile image used on the source site.
+    image_url: str
     # The coach's rapid rating relative to the site they were sourced from.
     rapid: int
     # The coach's blitz rating relative to the site they were sourced from.
@@ -14,10 +21,9 @@ class Export(TypedDict, total=False):
     bullet: int
 
 
-def _insert(export: Export, key: str, value: any):
-    if value is None:
-        return
-    export[key] = value
+def _insert(export: Export, key: str, value: Any):
+    if value is not None:
+        export[key] = value
 
 
 class BaseExporter(Repo):
@@ -25,6 +31,12 @@ class BaseExporter(Repo):
         super().__init__(site)
         self.username = username
 
+    def export_name(self) -> Union[str, None]:
+        raise NotImplementedError()
+
+    def export_image_url(self) -> Union[str, None]:
+        raise NotImplementedError()
+
     def export_rapid(self) -> Union[int, None]:
         raise NotImplementedError()
 
@@ -40,6 +52,8 @@ class BaseExporter(Repo):
         _insert(export, "site", self.site)
         _insert(export, "username", self.username)
+        _insert(export, "name", self.export_name())
+        _insert(export, "image_url", self.export_image_url())
         _insert(export, "rapid", self.export_rapid())
         _insert(export, "blitz", self.export_blitz())
         _insert(export, "bullet", self.export_bullet())

app/lichess.py

@@ -1,7 +1,7 @@
 import asyncio
 import os
 import os.path
-from typing import List
+from typing import List, Union
 
 import aiohttp
 from bs4 import BeautifulSoup, SoupStrainer
@@ -97,9 +97,9 @@ class Scraper(BaseScraper):
         soup = BeautifulSoup(response, "lxml")
         members = soup.find_all("article", class_="coach-widget")
         for member in members:
-            anchor = member.find("a", class_="overlay")
-            if anchor:
-                href = anchor.get("href")
+            a = member.find("a", class_="overlay")
+            if a:
+                href = a.get("href")
                 username = href[len("/coach/") :]
                 usernames.append(username)
@@ -163,8 +163,16 @@ class Scraper(BaseScraper):
         return True
 
 
+def _profile_filter(elem, attrs):
+    """Includes only relevant segments of the `{username}.html` file."""
+    if "coach-widget" in attrs.get("class", ""):
+        return True
+
+
 def _stats_filter(elem, attrs):
     """Includes only relevant segments of the `stats.html` file."""
+    if "profile-side" in attrs.get("class", ""):
+        return True
     if "sub-ratings" in attrs.get("class", ""):
         return True
@@ -173,43 +181,59 @@ class Exporter(BaseExporter):
     def __init__(self, username: str):
         super().__init__(site=Site.LICHESS.value, username=username)
-        self.stats_soup = None
-        try:
-            with open(self.path_coach_file(username, "stats.html"), "r") as f:
-                stats_strainer = SoupStrainer(_stats_filter)
-                self.stats_soup = BeautifulSoup(
-                    f.read(), "lxml", parse_only=stats_strainer
-                )
-        except FileNotFoundError:
-            pass
+        self.profile_soup = None
+        try:
+            with open(self.path_coach_file(username, f"{username}.html"), "r") as f:
+                self.profile_soup = BeautifulSoup(
+                    f.read(), "lxml", parse_only=SoupStrainer(_profile_filter)
+                )
+        except FileNotFoundError:
+            pass
+
+        self.stats_soup = None
+        try:
+            with open(self.path_coach_file(username, "stats.html"), "r") as f:
+                self.stats_soup = BeautifulSoup(
+                    f.read(), "lxml", parse_only=SoupStrainer(_stats_filter)
+                )
+        except FileNotFoundError:
+            pass
 
-    def export_rapid(self):
+    def export_name(self) -> Union[str, None]:
+        try:
+            profile_side = self.stats_soup.find("div", class_="profile-side")
+            user_infos = profile_side.find("div", class_="user-infos")
+            name = user_infos.find("strong", class_="name")
+            return name.get_text().strip()
+        except AttributeError:
+            return None
+
+    def export_image_url(self) -> Union[str, None]:
+        try:
+            picture = self.profile_soup.find("img", class_="picture")
+            src = picture.get("src", "")
+            if "image.lichess1.org" in src:
+                return src
+        except AttributeError:
+            return None
+
+    def export_rapid(self) -> Union[int, None]:
         return self._find_rating("rapid")
 
-    def export_blitz(self):
+    def export_blitz(self) -> Union[int, None]:
         return self._find_rating("blitz")
 
-    def export_bullet(self):
+    def export_bullet(self) -> Union[int, None]:
         return self._find_rating("bullet")
 
-    def _find_rating(self, name):
-        if self.stats_soup is None:
-            return None
-        anchor = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
-        if anchor is None:
-            return None
-        rating = anchor.find("rating")
-        if rating is None:
-            return None
-        strong = rating.find("strong")
-        if strong is None:
-            return None
-        value = strong.get_text()
-        if value[-1] == "?":
-            value = value[:-1]
+    def _find_rating(self, name) -> Union[int, None]:
         try:
+            a = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
+            rating = a.find("rating")
+            strong = rating.find("strong")
+            value = strong.get_text()
+            if value[-1] == "?":
+                value = value[:-1]
             return int(value)
-        except ValueError:
+        except (AttributeError, ValueError):
             return None
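The collapsed `_find_rating` leans on two details: lichess renders provisional ratings with a trailing `?`, and a missing element anywhere in the `find` chain raises `AttributeError`, which now lands in the same arm as a failed `int()`. The normalization step in isolation:

```python
def parse_rating(text):
    # Provisional lichess ratings render as e.g. "2412?".
    if text.endswith("?"):
        text = text[:-1]
    try:
        return int(text)
    except ValueError:
        return None

assert parse_rating("2412?") == 2412
assert parse_rating("2412") == 2412
assert parse_rating("") is None
```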

sql/export.sql

@@ -7,31 +7,47 @@ DO $$
 END;
 $$ LANGUAGE plpgsql;
 
-CREATE TEMPORARY TABLE pg_temp.coach_scraper_export (data JSONB);
+-- This should match the order data is written in the app/__main__.py
+-- script.
+CREATE TEMPORARY TABLE pg_temp.coach_scraper_export
+( site TEXT
+, username TEXT
+, name TEXT
+, image_url TEXT
+, rapid TEXT
+, blitz TEXT
+, bullet TEXT
+);
 
 SELECT format(
-  $$COPY pg_temp.coach_scraper_export (data) from %L$$,
+  $$COPY pg_temp.coach_scraper_export FROM %L WITH (FORMAT CSV)$$,
   :export
 ) \gexec
 
 INSERT INTO coach_scraper.export
-( username
-, site
+( site
+, username
+, name
+, image_url
 , rapid
 , blitz
 , bullet
 )
 SELECT
-  data->>'username',
-  data->>'site',
-  (data->>'rapid')::INT,
-  (data->>'blitz')::INT,
-  (data->>'bullet')::INT
+  site,
+  username,
+  name,
+  image_url,
+  rapid::INT,
+  blitz::INT,
+  bullet::INT
 FROM
   pg_temp.coach_scraper_export
 ON CONFLICT
   (site, username)
 DO UPDATE SET
+  name = EXCLUDED.name,
+  image_url = EXCLUDED.image_url,
   rapid = EXCLUDED.rapid,
   blitz = EXCLUDED.blitz,
   bullet = EXCLUDED.bullet;
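The `:export` variable is injected via psql's `-v` flag, exactly as the README shows; `format(... %L ...) \gexec` then turns it into a server-side `COPY`. If this step is ever driven from Python rather than the shell, a sketch (assuming `psql` on `PATH` and the same `@scraper` socket directory):

```python
import os
import subprocess

# sql/export.sql expects :export to expand to a quoted absolute path,
# hence the single quotes around the value.
export_path = os.path.join(os.getcwd(), "data", "export.csv")
subprocess.run(
    ["psql", "-h", "@scraper", "-f", "sql/export.sql",
     "-v", f"export='{export_path}'"],
    check=True,
)
```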

sql/init.sql

@@ -1,9 +1,13 @@
 CREATE SCHEMA IF NOT EXISTS coach_scraper;
 
-CREATE TABLE IF NOT EXISTS coach_scraper.export
+DROP TABLE IF EXISTS coach_scraper.export;
+
+CREATE TABLE coach_scraper.export
 ( id SERIAL PRIMARY KEY
 , site VARCHAR(16) NOT NULL
 , username VARCHAR(255) NOT NULL
+, name VARCHAR(255)
+, image_url TEXT
 , rapid INT
 , blitz INT
 , bullet INT
@@ -15,22 +19,3 @@ ON
   coach_scraper.export
 USING
   BTREE (site, username);
-
-DO $$
-BEGIN
-    IF NOT EXISTS (
-        SELECT 1
-        FROM information_schema.constraint_column_usage
-        WHERE table_schema = 'coach_scraper'
-        AND table_name = 'export'
-        AND constraint_name = 'site_username_unique'
-    ) THEN
-        EXECUTE 'ALTER TABLE
-            coach_scraper.export
-        ADD CONSTRAINT
-            site_username_unique
-        UNIQUE USING INDEX
-            site_username_unique';
-    END IF;
-END;
-$$ LANGUAGE plpgsql;
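Since `DROP TABLE` now discards any previous state, the upsert in `sql/export.sql` depends entirely on the unique `(site, username)` index created here. A quick post-init check from Python; a sketch assuming the psycopg 3 package and a default `postgres` database:

```python
import psycopg

with psycopg.connect(host="@scraper", dbname="postgres") as conn:
    rows = conn.execute(
        """
        SELECT indexname, indexdef
        FROM pg_indexes
        WHERE schemaname = 'coach_scraper' AND tablename = 'export'
        """
    ).fetchall()
    for name, definition in rows:
        print(name, "->", definition)  # expect a UNIQUE index on (site, username)
```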