From 63764a22c49a5240ca1d3347d9245c78f3f69612 Mon Sep 17 00:00:00 2001
From: Joshua Potter
Date: Mon, 4 Dec 2023 15:08:17 -0700
Subject: [PATCH] Transition to a CSV; Postgres can handle that better.

---
 README.md       | 27 +++++++++-------
 app/__main__.py | 44 ++++++++++++++++----------
 app/chesscom.py | 35 ++++++++++++++++++++-
 app/exporter.py | 24 +++++++++++---
 app/lichess.py  | 84 +++++++++++++++++++++++++++++++------------------
 sql/export.sql  | 34 ++++++++++++++------
 sql/init.sql    | 25 +++------------
 7 files changed, 179 insertions(+), 94 deletions(-)

diff --git a/README.md b/README.md
index ee73333..6ba332f 100644
--- a/README.md
+++ b/README.md
@@ -29,17 +29,19 @@ data
 ├── ...
 ```
 
-## Usage
+## Quickstart
 
 If you have nix available, run:
 ```bash
-$ nix run . -- --user-agent <user-agent> -s <site>
+$ nix run . -- --user-agent <user-agent> -s <site> [-s <site> ...]
 ```
 If not, ensure you have [poetry](https://python-poetry.org/) on your machine
 and instead run the following:
 ```bash
-$ poetry run python3 -m app -u <user-agent> -s <site>
+$ poetry run python3 -m app -u <user-agent> -s <site> [-s <site> ...]
 ```
+After running (this may take several hours), a new CSV will be generated at
+`data/export.csv` containing all scraped content from the specified `<site>`s.
 
 ## Database
 
@@ -70,13 +72,16 @@
 To load all exported coach data into a local Postgres instance, use the
 provided `sql/init.sql` script:
 ```bash
 $ psql -h @scraper -f sql/init.sql
 ```
-Next, concatenate all exported content and dump into the newly created table:
+Next, dump the exported data into the newly created table:
 ```bash
-$ cat data/{chesscom,lichess}/export.json > data/export.json
-$ psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.json'"
+$ psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.csv'"
+```
+Re-running the `sql/export.sql` script will first back up the existing
+`coach_scraper.export` table and then upsert the newly scraped data. You can
+view all backups from the `psql` console like so:
+```
+postgres=# \dt coach_scraper.export*
 ```
-Re-running will automatically create backups and replace the coach data found
-in `coach_scraper.export`.
 
 ### E2E
 
@@ -85,10 +90,8 @@
 necessary to scrape coach data from our chess website and dump the results into
 the database in one fell swoop. Assuming our database is open with a socket
 connection available at `@scraper`:
 ```bash
-nix run . -- --user-agent <user-agent> -s chesscom -s lichess
-cat data/{chesscom,lichess}/export.json > data/export.json
-psql -h @scraper -f sql/init.sql
-psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.json'"
+$ nix run . -- --user-agent <user-agent> -s chesscom -s lichess
+$ psql -h @scraper -f sql/init.sql -f sql/export.sql -v export="'$PWD/data/export.csv'"
 ```
 
 ## Development
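For a quick look at what the new pipeline produces, here is a minimal sketch (not part of the patch) that peeks at the first few rows of `data/export.csv`; the column list is an assumption that mirrors the `writerow` call in `app/__main__.py` below:

```python
import csv

# Assumed column order; it mirrors the writerow(...) call in app/__main__.py.
COLUMNS = ["site", "username", "name", "image_url", "rapid", "blitz", "bullet"]

with open("data/export.csv", newline="") as f:
    for row in list(csv.reader(f))[:5]:
        print(dict(zip(COLUMNS, row)))
```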
diff --git a/app/__main__.py b/app/__main__.py
index 45a5762..361adf1 100644
--- a/app/__main__.py
+++ b/app/__main__.py
@@ -1,5 +1,6 @@
 import argparse
 import asyncio
+import csv
 import json
 
 import aiohttp
@@ -32,25 +33,34 @@ async def run():
     async with aiohttp.ClientSession(
         headers={"User-Agent": f"BoardWise coach-scraper ({args.user_agent})"}
     ) as session:
-        for site in set(args.site):
-            scraper, exporter_cls = None, None
+        with open("data/export.csv", "w") as f:
+            writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
+            for site in set(args.site):
+                scraper, exporter_cls = None, None
 
-            if site == Site.CHESSCOM.value:
-                scraper = ChesscomScraper(session)
-                exporter_cls = ChesscomExporter
-            elif site == Site.LICHESS.value:
-                scraper = LichessScraper(session)
-                exporter_cls = LichessExporter
+                if site == Site.CHESSCOM.value:
+                    scraper = ChesscomScraper(session)
+                    exporter_cls = ChesscomExporter
+                elif site == Site.LICHESS.value:
+                    scraper = LichessScraper(session)
+                    exporter_cls = LichessExporter
 
-            # Write out each coach data into NDJSON file.
-            dump = []
-            usernames = await scraper.scrape()
-            for username in usernames:
-                export = exporter_cls(username).export()
-                dump.append(f"{json.dumps(export)}\n")
-
-            with open(scraper.path_site_file("export.json"), "w") as f:
-                f.writelines(dump)
+                usernames = await scraper.scrape()
+                for username in usernames:
+                    export = exporter_cls(username).export()
+                    writer.writerow(
+                        [
+                            # This should match the order data is loaded in the
+                            # sql/export.sql script.
+                            export["site"],
+                            export["username"],
+                            export.get("name", ""),
+                            export.get("image_url", ""),
+                            export.get("rapid", ""),
+                            export.get("blitz", ""),
+                            export.get("bullet", ""),
+                        ]
+                    )
 
 
 def main():
diff --git a/app/chesscom.py b/app/chesscom.py
index 8209ba0..ae9ac30 100644
--- a/app/chesscom.py
+++ b/app/chesscom.py
@@ -5,7 +5,7 @@ import os.path
 from typing import List, Union
 
 import aiohttp
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, SoupStrainer
 
 from app.exporter import BaseExporter
 from app.repo import AnsiColor, Site
@@ -172,10 +172,27 @@ class Scraper(BaseScraper):
         return True
 
 
+def _profile_filter(elem, attrs):
+    """Includes only relevant segments of the `{username}.html` file."""
+    if "profile-header-info" in attrs.get("class", ""):
+        return True
+    if "profile-card-info" in attrs.get("class", ""):
+        return True
+
+
 class Exporter(BaseExporter):
     def __init__(self, username: str):
         super().__init__(site=Site.CHESSCOM.value, username=username)
 
+        self.profile_soup = None
+        try:
+            with open(self.path_coach_file(username, f"{username}.html"), "r") as f:
+                self.profile_soup = BeautifulSoup(
+                    f.read(), "lxml", parse_only=SoupStrainer(_profile_filter)
+                )
+        except FileNotFoundError:
+            pass
+
         self.stats_json = {}
         try:
             with open(self.path_coach_file(username, "stats.json"), "r") as f:
@@ -185,6 +202,22 @@ class Exporter(BaseExporter):
         except FileNotFoundError:
             pass
 
+    def export_name(self) -> Union[str, None]:
+        try:
+            name = self.profile_soup.find("div", class_="profile-card-name")
+            return name.get_text().strip()
+        except AttributeError:
+            return None
+
+    def export_image_url(self) -> Union[str, None]:
+        try:
+            div = self.profile_soup.find("div", class_="profile-header-avatar")
+            src = div.find("img").get("src", "")
+            if "images.chesscomfiles.com" in src:
+                return src
+        except AttributeError:
+            return None
+
     def export_rapid(self) -> Union[int, None]:
         return self.stats_json.get("rapid", {}).get("rating")
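To see why the `SoupStrainer` filters pay off, here is a standalone sketch (not part of the patch) run against a made-up fragment of a coach profile page; only the whitelisted segments survive parsing, so the rest of the page is never built into the tree:

```python
from bs4 import BeautifulSoup, SoupStrainer

# Made-up fragment standing in for a downloaded {username}.html file.
html = """
<div class="nav">boilerplate that never gets parsed</div>
<div class="profile-card-info">
  <div class="profile-card-name"> John Doe </div>
</div>
"""

def _profile_filter(elem, attrs):
    """Same shape as the filter in the patch: keep only profile segments."""
    if "profile-header-info" in attrs.get("class", ""):
        return True
    if "profile-card-info" in attrs.get("class", ""):
        return True

soup = BeautifulSoup(html, "lxml", parse_only=SoupStrainer(_profile_filter))
print(soup.find("div", class_="profile-card-name").get_text().strip())  # John Doe
print(soup.find("div", class_="nav"))  # None -- strained out before parsing
```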
diff --git a/app/exporter.py b/app/exporter.py
index c882482..9e6d200 100644
--- a/app/exporter.py
+++ b/app/exporter.py
@@ -1,4 +1,4 @@
-from typing import Union
+from typing import Any, Union
 
 from typing_extensions import TypedDict
 
@@ -6,6 +6,13 @@ from app.repo import AnsiColor, Repo
 
 
 class Export(TypedDict, total=False):
+    # The (site, username) pair makes up a unique key for each coach.
+    site: str
+    username: str
+    # The coach's real name.
+    name: str
+    # The profile image used on the source site.
+    image_url: str
     # The coach's rapid rating relative to the site they were sourced from.
     rapid: int
     # The coach's blitz rating relative to the site they were sourced from.
@@ -14,10 +21,9 @@ class Export(TypedDict, total=False):
     bullet: int
 
 
-def _insert(export: Export, key: str, value: any):
-    if value is None:
-        return
-    export[key] = value
+def _insert(export: Export, key: str, value: Any):
+    if value is not None:
+        export[key] = value
 
 
 class BaseExporter(Repo):
@@ -25,6 +31,12 @@ class BaseExporter(Repo):
         super().__init__(site)
         self.username = username
 
+    def export_name(self) -> Union[str, None]:
+        raise NotImplementedError()
+
+    def export_image_url(self) -> Union[str, None]:
+        raise NotImplementedError()
+
     def export_rapid(self) -> Union[int, None]:
         raise NotImplementedError()
 
@@ -40,6 +52,8 @@ class BaseExporter(Repo):
 
         _insert(export, "site", self.site)
         _insert(export, "username", self.username)
+        _insert(export, "name", self.export_name())
+        _insert(export, "image_url", self.export_image_url())
         _insert(export, "rapid", self.export_rapid())
         _insert(export, "blitz", self.export_blitz())
         _insert(export, "bullet", self.export_bullet())
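Since `Export` is declared with `total=False`, every key is optional. A short standalone sketch (not part of the patch, with the key set trimmed) of how `_insert` keeps `None` values out of the dict, which is why the CSV writer above falls back to `export.get(..., "")`:

```python
from typing import Any
from typing_extensions import TypedDict

class Export(TypedDict, total=False):
    site: str
    rapid: int

def _insert(export: Export, key: str, value: Any):
    if value is not None:
        export[key] = value

row: Export = {}
_insert(row, "site", "lichess")
_insert(row, "rapid", None)  # Dropped: missing ratings never become keys.
print(row)  # {'site': 'lichess'}
```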
diff --git a/app/lichess.py b/app/lichess.py
index b2af7b7..22cb86d 100644
--- a/app/lichess.py
+++ b/app/lichess.py
@@ -1,7 +1,7 @@
 import asyncio
 import os
 import os.path
-from typing import List
+from typing import List, Union
 
 import aiohttp
 from bs4 import BeautifulSoup, SoupStrainer
@@ -97,9 +97,9 @@ class Scraper(BaseScraper):
         soup = BeautifulSoup(response, "lxml")
         members = soup.find_all("article", class_="coach-widget")
         for member in members:
-            anchor = member.find("a", class_="overlay")
-            if anchor:
-                href = anchor.get("href")
+            a = member.find("a", class_="overlay")
+            if a:
+                href = a.get("href")
                 username = href[len("/coach/") :]
                 usernames.append(username)
@@ -163,8 +163,16 @@ class Scraper(BaseScraper):
         return True
 
 
+def _profile_filter(elem, attrs):
+    """Includes only relevant segments of the `{username}.html` file."""
+    if "coach-widget" in attrs.get("class", ""):
+        return True
+
+
 def _stats_filter(elem, attrs):
     """Includes only relevant segments of the `stats.html` file."""
+    if "profile-side" in attrs.get("class", ""):
+        return True
     if "sub-ratings" in attrs.get("class", ""):
         return True
 
@@ -173,43 +181,59 @@ class Exporter(BaseExporter):
     def __init__(self, username: str):
         super().__init__(site=Site.LICHESS.value, username=username)
 
-        self.stats_soup = None
+        self.profile_soup = None
         try:
-            with open(self.path_coach_file(username, "stats.html"), "r") as f:
-                stats_strainer = SoupStrainer(_stats_filter)
-                self.stats_soup = BeautifulSoup(
-                    f.read(), "lxml", parse_only=stats_strainer
+            with open(self.path_coach_file(username, f"{username}.html"), "r") as f:
+                self.profile_soup = BeautifulSoup(
+                    f.read(), "lxml", parse_only=SoupStrainer(_profile_filter)
                 )
         except FileNotFoundError:
             pass
 
-    def export_rapid(self):
+        self.stats_soup = None
+        try:
+            with open(self.path_coach_file(username, "stats.html"), "r") as f:
+                self.stats_soup = BeautifulSoup(
+                    f.read(), "lxml", parse_only=SoupStrainer(_stats_filter)
+                )
+        except FileNotFoundError:
+            pass
+
+    def export_name(self) -> Union[str, None]:
+        try:
+            profile_side = self.stats_soup.find("div", class_="profile-side")
+            user_infos = profile_side.find("div", class_="user-infos")
+            name = user_infos.find("strong", class_="name")
+            return name.get_text().strip()
+        except AttributeError:
+            return None
+
+    def export_image_url(self) -> Union[str, None]:
+        try:
+            picture = self.profile_soup.find("img", class_="picture")
+            src = picture.get("src", "")
+            if "image.lichess1.org" in src:
+                return src
+        except AttributeError:
+            return None
+
+    def export_rapid(self) -> Union[int, None]:
         return self._find_rating("rapid")
 
-    def export_blitz(self):
+    def export_blitz(self) -> Union[int, None]:
         return self._find_rating("blitz")
 
-    def export_bullet(self):
+    def export_bullet(self) -> Union[int, None]:
         return self._find_rating("bullet")
 
-    def _find_rating(self, name):
-        if self.stats_soup is None:
-            return None
-
-        anchor = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
-        if anchor is None:
-            return None
-        rating = anchor.find("rating")
-        if rating is None:
-            return None
-        strong = rating.find("strong")
-        if strong is None:
-            return None
-        value = strong.get_text()
-        if value[-1] == "?":
-            value = value[:-1]
-
+    def _find_rating(self, name) -> Union[int, None]:
         try:
+            a = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
+            rating = a.find("rating")
+            strong = rating.find("strong")
+            value = strong.get_text()
+            if value[-1] == "?":
+                value = value[:-1]
             return int(value)
-        except ValueError:
+        except (AttributeError, ValueError):
             return None
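Lichess marks provisional ratings with a trailing `?`. A standalone sketch (not part of the patch) of the trimming logic in `_find_rating`, run against a made-up stats fragment in the shape the exporter expects:

```python
from bs4 import BeautifulSoup

# Made-up fragment: an anchor wrapping a <rating><strong>...</strong></rating>
# element, as _find_rating looks it up by the perf-specific href.
html = '<a href="/@/someuser/perf/rapid"><rating><strong>2105?</strong></rating></a>'
soup = BeautifulSoup(html, "lxml")

a = soup.find("a", href="/@/someuser/perf/rapid")
value = a.find("rating").find("strong").get_text()
if value[-1] == "?":
    value = value[:-1]  # Strip the provisional marker before conversion.
print(int(value))  # 2105
```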
diff --git a/sql/export.sql b/sql/export.sql
index b5079e0..be0aec0 100644
--- a/sql/export.sql
+++ b/sql/export.sql
@@ -7,31 +7,47 @@ DO $$
   END;
 $$ LANGUAGE plpgsql;
 
-CREATE TEMPORARY TABLE pg_temp.coach_scraper_export (data JSONB);
+-- This should match the order data is written in the app/__main__.py
+-- script.
+CREATE TEMPORARY TABLE pg_temp.coach_scraper_export
+  ( site TEXT
+  , username TEXT
+  , name TEXT
+  , image_url TEXT
+  , rapid TEXT
+  , blitz TEXT
+  , bullet TEXT
+  );
 
 SELECT format(
-  $$COPY pg_temp.coach_scraper_export (data) from %L$$,
+  $$COPY pg_temp.coach_scraper_export FROM %L WITH (FORMAT CSV)$$,
   :export
 ) \gexec
 
 INSERT INTO coach_scraper.export
-  ( username
-  , site
+  ( site
+  , username
+  , name
+  , image_url
   , rapid
   , blitz
   , bullet
   )
 SELECT
-  data->>'username',
-  data->>'site',
-  (data->>'rapid')::INT,
-  (data->>'blitz')::INT,
-  (data->>'bullet')::INT
+  site,
+  username,
+  name,
+  image_url,
+  rapid::INT,
+  blitz::INT,
+  bullet::INT
 FROM
   pg_temp.coach_scraper_export
 ON CONFLICT
   (site, username)
 DO UPDATE SET
+  name = EXCLUDED.name,
+  image_url = EXCLUDED.image_url,
   rapid = EXCLUDED.rapid,
   blitz = EXCLUDED.blitz,
   bullet = EXCLUDED.bullet;
diff --git a/sql/init.sql b/sql/init.sql
index 8d19d93..ccefcde 100644
--- a/sql/init.sql
+++ b/sql/init.sql
@@ -1,9 +1,13 @@
 CREATE SCHEMA IF NOT EXISTS coach_scraper;
 
-CREATE TABLE IF NOT EXISTS coach_scraper.export
+DROP TABLE IF EXISTS coach_scraper.export;
+
+CREATE TABLE coach_scraper.export
   ( id SERIAL PRIMARY KEY
   , site VARCHAR(16) NOT NULL
   , username VARCHAR(255) NOT NULL
+  , name VARCHAR(255)
+  , image_url TEXT
   , rapid INT
   , blitz INT
   , bullet INT
@@ -15,22 +19,3 @@
 ON
   coach_scraper.export
 USING
   BTREE (site, username);
-
-DO $$
-  BEGIN
-    IF NOT EXISTS (
-      SELECT 1
-      FROM information_schema.constraint_column_usage
-      WHERE table_schema = 'coach_scraper'
-      AND table_name = 'export'
-      AND constraint_name = 'site_username_unique'
-    ) THEN
-      EXECUTE 'ALTER TABLE
-        coach_scraper.export
-        ADD CONSTRAINT
-        site_username_unique
-        UNIQUE USING INDEX
-        site_username_unique';
-    END IF;
-  END;
-$$ LANGUAGE plpgsql;
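As an optional end-to-end sanity check after `sql/init.sql` and `sql/export.sql` have run, something like the following can confirm rows landed per site. This is not part of the patch; it assumes `psycopg2` is installed (not a project dependency) and reuses the README's `@scraper` socket, so adjust the DSN to your setup:

```python
import psycopg2  # Assumed available; not a project dependency.

# Hypothetical connection settings; swap in whatever your local Postgres uses.
conn = psycopg2.connect("host=@scraper dbname=postgres user=postgres")
with conn, conn.cursor() as cur:
    cur.execute("SELECT site, COUNT(*) FROM coach_scraper.export GROUP BY site;")
    for site, count in cur.fetchall():
        print(f"{site}: {count} coaches")
conn.close()
```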