Transition to a CSV; Postgres can handle that better.
parent ec94a16140
commit 63764a22c4

README.md (27 lines changed)
@@ -29,17 +29,19 @@ data
├── ...
```

-## Usage
+## Quickstart

If you have nix available, run:
```bash
-$ nix run . -- --user-agent <your-email> -s <site>
+$ nix run . -- --user-agent <your-email> -s <site> [-s <site> ...]
```
If not, ensure you have [poetry](https://python-poetry.org/) on your machine and
instead run the following:
```bash
-$ poetry run python3 -m app -u <your-email> -s <site>
+$ poetry run python3 -m app -u <your-email> -s <site> [-s <site> ...]
```
+After running (this may take several hours), a new CSV will be generated at
+`data/export.csv` containing all scraped content from the specified `<site>`s.
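For reference, each row of `data/export.csv` follows the column order the exporter writes (site, username, name, image_url, rapid, blitz, bullet); a purely hypothetical row might look like:

```
chesscom,examplecoach,Example Coach,https://images.chesscomfiles.com/...,1850,1790,1720
```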

## Database

@@ -70,13 +72,16 @@ To load all exported coach data into a local Postgres instance, use the provided
```bash
$ psql -h @scraper -f sql/init.sql
```
-Next, concatenate all exported content and dump into the newly created table:
+Next, dump exported data into the newly created table:
```bash
-$ cat data/{chesscom,lichess}/export.json > data/export.json
-$ psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.json'"
+$ psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.csv'"
```
Re-running the `sql/export.sql` script will create a backup of the
`coach_scraper.export` table. It will then upsert the scraped data. You can view
all backups from the `psql` console like so:
```
postgres=# \dt coach_scraper.export*
```
Re-running will automatically create backups and replace the coach data found
in `coach_scraper.export`.

### E2E
@@ -85,10 +90,8 @@ necessary to scrape coach data from our chess website and dump the results into
the database in one fell swoop. Assuming our database is open with a socket
connection available at `@scraper`:
```bash
-nix run . -- --user-agent <your-email> -s chesscom -s lichess
-cat data/{chesscom,lichess}/export.json > data/export.json
-psql -h @scraper -f sql/init.sql
-psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.json'"
+$ nix run . -- --user-agent <your-email> -s chesscom -s lichess
+$ psql -h @scraper -f sql/init.sql -f sql/export.sql -v export="'$PWD/data/export.csv'"
```
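To sanity-check the load afterwards, a quick count against the new table should work (again assuming the `@scraper` socket):

```bash
$ psql -h @scraper -c 'SELECT COUNT(*) FROM coach_scraper.export;'
```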

## Development

@@ -1,5 +1,6 @@
import argparse
import asyncio
+import csv
import json

import aiohttp
@@ -32,6 +33,8 @@ async def run():
    async with aiohttp.ClientSession(
        headers={"User-Agent": f"BoardWise coach-scraper ({args.user_agent})"}
    ) as session:
+        with open("data/export.csv", "w") as f:
+            writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
            for site in set(args.site):
                scraper, exporter_cls = None, None
@@ -42,15 +45,22 @@ async def run():
                    scraper = LichessScraper(session)
                    exporter_cls = LichessExporter

-            # Write out each coach data into NDJSON file.
-            dump = []
                usernames = await scraper.scrape()
                for username in usernames:
                    export = exporter_cls(username).export()
-                dump.append(f"{json.dumps(export)}\n")
-
-            with open(scraper.path_site_file("export.json"), "w") as f:
-                f.writelines(dump)
+                    writer.writerow(
+                        [
+                            # This should match the order data is loaded in the
+                            # sql/export.sql script.
+                            export["site"],
+                            export["username"],
+                            export.get("name", ""),
+                            export.get("image_url", ""),
+                            export.get("rapid", ""),
+                            export.get("blitz", ""),
+                            export.get("bullet", ""),
+                        ]
+                    )


def main():
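A note on the `csv.QUOTE_MINIMAL` choice in the hunk above: it only quotes fields that contain the delimiter, the quote character, or a newline, which keeps the output compatible with Postgres' `COPY ... WITH (FORMAT CSV)`. A minimal illustration, with made-up values:

```python
import csv
import io

buf = io.StringIO()
writer = csv.writer(buf, quoting=csv.QUOTE_MINIMAL)
# Only the field containing a comma gets wrapped in quotes.
writer.writerow(["lichess", "examplecoach", "Doe, Jane", "", 2100])
print(buf.getvalue(), end="")  # lichess,examplecoach,"Doe, Jane",,2100
```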

@@ -5,7 +5,7 @@ import os.path
from typing import List, Union

import aiohttp
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, SoupStrainer

from app.exporter import BaseExporter
from app.repo import AnsiColor, Site
@@ -172,10 +172,27 @@ class Scraper(BaseScraper):
        return True


+def _profile_filter(elem, attrs):
+    """Includes only relevant segments of the `{username}.html` file."""
+    if "profile-header-info" in attrs.get("class", ""):
+        return True
+    if "profile-card-info" in attrs.get("class", ""):
+        return True
+
+
class Exporter(BaseExporter):
    def __init__(self, username: str):
        super().__init__(site=Site.CHESSCOM.value, username=username)

+        self.profile_soup = None
+        try:
+            with open(self.path_coach_file(username, f"{username}.html"), "r") as f:
+                self.profile_soup = BeautifulSoup(
+                    f.read(), "lxml", parse_only=SoupStrainer(_profile_filter)
+                )
+        except FileNotFoundError:
+            pass
+
        self.stats_json = {}
        try:
            with open(self.path_coach_file(username, "stats.json"), "r") as f:
@@ -185,6 +202,22 @@ class Exporter(BaseExporter):
        except FileNotFoundError:
            pass

+    def export_name(self) -> Union[str, None]:
+        try:
+            name = self.profile_soup.find("div", class_="profile-card-name")
+            return name.get_text().strip()
+        except AttributeError:
+            return None
+
+    def export_image_url(self) -> Union[str, None]:
+        try:
+            div = self.profile_soup.find("div", class_="profile-header-avatar")
+            src = div.find("img").get("src", "")
+            if "images.chesscomfiles.com" in src:
+                return src
+        except AttributeError:
+            return None
+
    def export_rapid(self) -> Union[int, None]:
        return self.stats_json.get("rapid", {}).get("rating")

@@ -1,4 +1,4 @@
-from typing import Union
+from typing import Any, Union

from typing_extensions import TypedDict
@@ -6,6 +6,13 @@ from app.repo import AnsiColor, Repo


class Export(TypedDict, total=False):
+    # The (site, username) make up a unique key for each coach.
+    site: str
+    username: str
+    # The coach's real name.
+    name: str
+    # The profile image used on the source site.
+    image_url: str
    # The coach's rapid rating relative to the site they were sourced from.
    rapid: int
    # The coach's blitz rating relative to the site they were sourced from.
@@ -14,9 +21,8 @@ class Export(TypedDict, total=False):
    bullet: int


-def _insert(export: Export, key: str, value: any):
-    if value is None:
-        return
+def _insert(export: Export, key: str, value: Any):
+    if value is not None:
        export[key] = value
@@ -25,6 +31,12 @@ class BaseExporter(Repo):
        super().__init__(site)
        self.username = username

+    def export_name(self) -> Union[str, None]:
+        raise NotImplementedError()
+
+    def export_image_url(self) -> Union[str, None]:
+        raise NotImplementedError()
+
    def export_rapid(self) -> Union[int, None]:
        raise NotImplementedError()
@@ -40,6 +52,8 @@ class BaseExporter(Repo):

        _insert(export, "site", self.site)
        _insert(export, "username", self.username)
+        _insert(export, "name", self.export_name())
+        _insert(export, "image_url", self.export_image_url())
        _insert(export, "rapid", self.export_rapid())
        _insert(export, "blitz", self.export_blitz())
        _insert(export, "bullet", self.export_bullet())

@@ -1,7 +1,7 @@
import asyncio
import os
import os.path
-from typing import List
+from typing import List, Union

import aiohttp
from bs4 import BeautifulSoup, SoupStrainer
@@ -97,9 +97,9 @@ class Scraper(BaseScraper):
        soup = BeautifulSoup(response, "lxml")
        members = soup.find_all("article", class_="coach-widget")
        for member in members:
-            anchor = member.find("a", class_="overlay")
-            if anchor:
-                href = anchor.get("href")
+            a = member.find("a", class_="overlay")
+            if a:
+                href = a.get("href")
                username = href[len("/coach/") :]
                usernames.append(username)
@@ -163,8 +163,16 @@ class Scraper(BaseScraper):
        return True


+def _profile_filter(elem, attrs):
+    """Includes only relevant segments of the `{username}.html` file."""
+    if "coach-widget" in attrs.get("class", ""):
+        return True
+
+
def _stats_filter(elem, attrs):
    """Includes only relevant segments of the `stats.html` file."""
+    if "profile-side" in attrs.get("class", ""):
+        return True
    if "sub-ratings" in attrs.get("class", ""):
        return True
@@ -173,43 +181,59 @@ class Exporter(BaseExporter):
    def __init__(self, username: str):
        super().__init__(site=Site.LICHESS.value, username=username)

-        self.stats_soup = None
+        self.profile_soup = None
        try:
-            with open(self.path_coach_file(username, "stats.html"), "r") as f:
-                stats_strainer = SoupStrainer(_stats_filter)
-                self.stats_soup = BeautifulSoup(
-                    f.read(), "lxml", parse_only=stats_strainer
+            with open(self.path_coach_file(username, f"{username}.html"), "r") as f:
+                self.profile_soup = BeautifulSoup(
+                    f.read(), "lxml", parse_only=SoupStrainer(_profile_filter)
                )
        except FileNotFoundError:
            pass

-    def export_rapid(self):
+        self.stats_soup = None
+        try:
+            with open(self.path_coach_file(username, "stats.html"), "r") as f:
+                self.stats_soup = BeautifulSoup(
+                    f.read(), "lxml", parse_only=SoupStrainer(_stats_filter)
+                )
+        except FileNotFoundError:
+            pass
+
+    def export_name(self) -> Union[str, None]:
+        try:
+            profile_side = self.stats_soup.find("div", class_="profile-side")
+            user_infos = profile_side.find("div", class_="user-infos")
+            name = user_infos.find("strong", class_="name")
+            return name.get_text().strip()
+        except AttributeError:
+            return None
+
+    def export_image_url(self) -> Union[str, None]:
+        try:
+            picture = self.profile_soup.find("img", class_="picture")
+            src = picture.get("src", "")
+            if "image.lichess1.org" in src:
+                return src
+        except AttributeError:
+            return None
+
+    def export_rapid(self) -> Union[int, None]:
        return self._find_rating("rapid")

-    def export_blitz(self):
+    def export_blitz(self) -> Union[int, None]:
        return self._find_rating("blitz")

-    def export_bullet(self):
+    def export_bullet(self) -> Union[int, None]:
        return self._find_rating("bullet")

-    def _find_rating(self, name):
-        if self.stats_soup is None:
-            return None
-
-        anchor = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
-        if anchor is None:
-            return None
-        rating = anchor.find("rating")
-        if rating is None:
-            return None
+    def _find_rating(self, name) -> Union[int, None]:
+        try:
+            a = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
+            rating = a.find("rating")
            strong = rating.find("strong")
            if strong is None:
                return None
            value = strong.get_text()
            if value[-1] == "?":
                value = value[:-1]
-
-        try:
            return int(value)
-        except ValueError:
+        except (AttributeError, ValueError):
            return None

@@ -7,31 +7,47 @@ DO $$
END;
$$ LANGUAGE plpgsql;

-CREATE TEMPORARY TABLE pg_temp.coach_scraper_export (data JSONB);
+-- This should match the order data is written in the app/__main__.py
+-- script.
+CREATE TEMPORARY TABLE pg_temp.coach_scraper_export
+  ( site TEXT
+  , username TEXT
+  , name TEXT
+  , image_url TEXT
+  , rapid TEXT
+  , blitz TEXT
+  , bullet TEXT
+  );

SELECT format(
-  $$COPY pg_temp.coach_scraper_export (data) from %L$$,
+  $$COPY pg_temp.coach_scraper_export FROM %L WITH (FORMAT CSV)$$,
  :export
) \gexec

INSERT INTO coach_scraper.export
-  ( username
-  , site
+  ( site
+  , username
+  , name
+  , image_url
  , rapid
  , blitz
  , bullet
  )
SELECT
-  data->>'username',
-  data->>'site',
-  (data->>'rapid')::INT,
-  (data->>'blitz')::INT,
-  (data->>'bullet')::INT
+  site,
+  username,
+  name,
+  image_url,
+  rapid::INT,
+  blitz::INT,
+  bullet::INT
FROM
  pg_temp.coach_scraper_export
ON CONFLICT
  (site, username)
DO UPDATE SET
+  name = EXCLUDED.name,
+  image_url = EXCLUDED.image_url,
  rapid = EXCLUDED.rapid,
  blitz = EXCLUDED.blitz,
  bullet = EXCLUDED.bullet;
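For context on the `SELECT format(...) \gexec` step above: psql substitutes the `:export` variable (set on the command line via `-v export=...`) into the `COPY` template and executes the generated statement, which ends up looking roughly like the following (the path shown is hypothetical):

```sql
COPY pg_temp.coach_scraper_export FROM '/path/to/data/export.csv' WITH (FORMAT CSV);
```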
sql/init.sql (25 lines changed)

@@ -1,9 +1,13 @@
CREATE SCHEMA IF NOT EXISTS coach_scraper;

-CREATE TABLE IF NOT EXISTS coach_scraper.export
+DROP TABLE IF EXISTS coach_scraper.export;
+
+CREATE TABLE coach_scraper.export
  ( id SERIAL PRIMARY KEY
  , site VARCHAR(16) NOT NULL
  , username VARCHAR(255) NOT NULL
+  , name VARCHAR(255)
+  , image_url TEXT
  , rapid INT
  , blitz INT
  , bullet INT
@@ -15,22 +19,3 @@ ON
  coach_scraper.export
USING
  BTREE (site, username);
-
-DO $$
-BEGIN
-  IF NOT EXISTS (
-    SELECT 1
-    FROM information_schema.constraint_column_usage
-    WHERE table_schema = 'coach_scraper'
-    AND table_name = 'export'
-    AND constraint_name = 'site_username_unique'
-  ) THEN
-    EXECUTE 'ALTER TABLE
-      coach_scraper.export
-    ADD CONSTRAINT
-      site_username_unique
-    UNIQUE USING INDEX
-      site_username_unique';
-  END IF;
-END;
-$$ LANGUAGE plpgsql;
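Once `sql/init.sql` and `sql/export.sql` have both run, the data can be queried like any other table; for example, a hypothetical look at the strongest blitz coaches:

```sql
SELECT site, username, blitz
FROM coach_scraper.export
ORDER BY blitz DESC NULLS LAST
LIMIT 10;
```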