From e050d13aa7082e068d454e48d3f4eb7fa661eb24 Mon Sep 17 00:00:00 2001 From: Joshua Potter Date: Thu, 30 Nov 2023 17:12:16 -0700 Subject: [PATCH] Add class for wrapping around exports. --- app/__main__.py | 26 ++++++++++++++---- app/chesscom.py | 27 +++++++++--------- app/exporter.py | 34 +++++++++++++++++++++++ app/lichess.py | 32 +++++++++++++++++----- app/repo.py | 61 +++++++++++++++++++++++++++++++++++++++++ app/scraper.py | 73 ++++--------------------------------------------- 6 files changed, 160 insertions(+), 93 deletions(-) create mode 100644 app/exporter.py create mode 100644 app/repo.py diff --git a/app/__main__.py b/app/__main__.py index f6f9a19..ec45335 100644 --- a/app/__main__.py +++ b/app/__main__.py @@ -1,16 +1,23 @@ import aiohttp import argparse import asyncio +import json -from app.chesscom import Scraper as ChesscomScraper -from app.lichess import Scraper as LichessScraper -from app.scraper import Site +from app.chesscom import ( + Exporter as ChesscomExporter, + Scraper as ChesscomScraper, +) +from app.lichess import ( + Exporter as LichessExporter, + Scraper as LichessScraper, +) +from app.repo import Site async def run(): parser = argparse.ArgumentParser( prog="coach-scraper", - description="HTML scraping of chess.com coaches.", + description="Scraping/exporting of chess coaches.", ) parser.add_argument("-u", "--user-agent", required=True) parser.add_argument( @@ -29,10 +36,19 @@ async def run(): ) as session: if args.site == Site.CHESSCOM.value: scraper = ChesscomScraper(session) + exporter_cls = ChesscomExporter elif args.site == Site.LICHESS.value: scraper = LichessScraper(session) + exporter_cls = LichessExporter - await scraper.scrape() + dump = {} + + usernames = await scraper.scrape() + for username in usernames: + dump[username] = exporter_cls(username).export() + + with open(scraper.path_site_file("export.json"), "w") as f: + json.dump(dump, f, indent=2) def main(): diff --git a/app/chesscom.py b/app/chesscom.py index 9b24b95..2cec3d3 100644 --- a/app/chesscom.py +++ b/app/chesscom.py @@ -4,9 +4,11 @@ import json import os import os.path -from app.scraper import AnsiColor, BaseScraper, Export, Site +from app.repo import AnsiColor, Site +from app.exporter import BaseExporter +from app.scraper import BaseScraper from bs4 import BeautifulSoup -from typing import List +from typing import List, Union # The number of coach listing pages we will at most iterate through. 
This number @@ -169,22 +171,19 @@ class Scraper(BaseScraper): return True - def _load_stats_json(self, stats: dict) -> Export: - """Extract relevant fields from a `stats.json` file.""" - export: Export = {} - for stat in stats.get("stats", []): - if stat["key"] == "rapid": - export["fide_rapid"] = stat["stats"]["rating"] - return export - async def export(self, username: str) -> Export: - """Transform coach-specific data into uniform format.""" - export: Export = {} +class Exporter(BaseExporter): + def __init__(self, username: str): + super().__init__(site=Site.CHESSCOM.value, username=username) + self.stats_json = {} try: with open(self.path_coach_file(username, "stats.json"), "r") as f: - export.update(self._load_stats_json(json.load(f))) + for s in json.load(f).get("stats", []): + if "key" in s and "stats" in s: + self.stats_json[s["key"]] = s["stats"] except FileNotFoundError: pass - return export + def export_fide_rapid(self) -> Union[int, None]: + return self.stats_json.get("rapid", {}).get("rating") diff --git a/app/exporter.py b/app/exporter.py new file mode 100644 index 0000000..0dbbb1a --- /dev/null +++ b/app/exporter.py @@ -0,0 +1,34 @@ +from app.repo import AnsiColor, Repo +from typing import Union +from typing_extensions import TypedDict + + +class Export(TypedDict, total=False): + fide_rapid: int + + +class BaseExporter(Repo): + def __init__(self, site: str, username: str): + super().__init__(site) + self.username = username + + def export_fide_rapid(self) -> Union[int, None]: + raise NotImplementedError() + + def export(self) -> Export: + """Transform coach-specific data into uniform format.""" + export: Export = {} + + fide_rapid = self.export_fide_rapid() + if fide_rapid: + export["fide_rapid"] = fide_rapid + + self.log( + [ + (AnsiColor.INFO, "[INFO]"), + (None, ": Exported "), + (AnsiColor.DATA, self.username), + ] + ) + + return export diff --git a/app/lichess.py b/app/lichess.py index 39eebf8..a7d1ec5 100644 --- a/app/lichess.py +++ b/app/lichess.py @@ -3,7 +3,9 @@ import asyncio import os import os.path -from app.scraper import AnsiColor, BaseScraper, Export, Site +from app.repo import AnsiColor, Site +from app.scraper import BaseScraper +from app.exporter import BaseExporter from bs4 import BeautifulSoup from typing import List @@ -111,7 +113,14 @@ class Scraper(BaseScraper): """ filepath = self.path_coach_file(username, f"{username}.html") if os.path.isfile(filepath): - return False + self.log( + [ + (AnsiColor.INFO, "[INFO]"), + (None, ": Skipping download for coach "), + (AnsiColor.DATA, username), + ] + ) + return response, _unused_status = await self.request( url=f"https://lichess.org/coach/{username}" @@ -120,9 +129,18 @@ class Scraper(BaseScraper): with open(filepath, "w") as f: f.write(response) - return True + self.log( + [ + (AnsiColor.INFO, "[INFO]"), + (None, ": Downloaded data for coach "), + (AnsiColor.DATA, username), + ] + ) - async def export(self, username: str) -> Export: - """Transform coach-specific data into uniform format.""" - export: Export = {} - return export + +class Exporter(BaseExporter): + def __init__(self, username: str): + super().__init__(site=Site.LICHESS.value, username=username) + + def export_fide_rapid(self): + return None diff --git a/app/repo.py b/app/repo.py new file mode 100644 index 0000000..abb01ef --- /dev/null +++ b/app/repo.py @@ -0,0 +1,61 @@ +import enum +import os + +from typing import List, Tuple, Union + + +class AnsiColor(enum.Enum): + ERROR = "\033[0;31m" + INFO = "\033[0;34m" + DATA = "\033[0;36m" + RESET = 
"\033[0m" + + +class Site(enum.Enum): + CHESSCOM = "chesscom" + LICHESS = "lichess" + + +class Repo: + """Shared filesystem-related functionality.""" + + def __init__(self, site: str): + self.site = site + + def path_site_dir(self): + """The root directory for all site-related files.""" + return os.path.join("data", self.site) + + def path_site_file(self, filename: str): + """Path to a top-level site-related file.""" + return os.path.join(self.path_site_dir(), filename) + + def path_coaches_dir(self): + """The root directory for all coach-related downloads.""" + return os.path.join(self.path_site_dir(), "coaches") + + def path_coach_dir(self, username: str): + """The root directory for a specific coach's downloads.""" + return os.path.join(self.path_coaches_dir(), username) + + def path_coach_file(self, username: str, filename: str): + """Path to a coach-specific file download.""" + return os.path.join(self.path_coach_dir(username), filename) + + def path_pages_dir(self): + """The root directory for all username listing files.""" + return os.path.join(self.path_site_dir(), "pages") + + def path_page_file(self, page_no: int): + """The root directory for usernames scraped from a single page.""" + return os.path.join(self.path_pages_dir(), f"{page_no}.txt") + + def log(self, msgs: List[Tuple[Union[AnsiColor, None], str]]): + transformed = [] + for k, v in msgs: + if k is None: + transformed.append(v) + else: + transformed.append(f"{k.value}{v}{AnsiColor.RESET.value}") + + print("".join(transformed)) diff --git a/app/scraper.py b/app/scraper.py index 7d7727b..c19af41 100644 --- a/app/scraper.py +++ b/app/scraper.py @@ -1,38 +1,20 @@ import aiohttp -import enum -import json import os +from app.repo import Repo from typing import List, Tuple, Union -from typing_extensions import TypedDict -class Site(enum.Enum): - CHESSCOM = "chesscom" - LICHESS = "lichess" - - -class AnsiColor(enum.Enum): - ERROR = "\033[0;31m" - INFO = "\033[0;34m" - DATA = "\033[0;36m" - RESET = "\033[0m" - - -class Export(TypedDict, total=False): - fide_rapid: Union[int, None] - - -class BaseScraper: +class BaseScraper(Repo): def __init__(self, site: str, session: aiohttp.ClientSession): - """Initialize a new web scraper and exporter. + """Initialize a new web scraper. @param site: The site we are making requests out to. @param session: The `aiohttp.ClientSession` context our requests are made from. """ - self.site = site + super().__init__(site) self.session = session async def download_usernames(self) -> List[str]: @@ -43,10 +25,6 @@ class BaseScraper: """For each coach, download coach-specific data.""" raise NotImplementedError() - async def export(self, username: str) -> Export: - """Transform coach-specific data into uniform format.""" - raise NotImplementedError() - async def request(self, url: str) -> Tuple[Union[str, None], int]: """Make network requests using the internal session. @@ -61,7 +39,7 @@ class BaseScraper: return await response.text(), 200 return None, response.status - async def scrape(self): + async def scrape(self) -> List[str]: """Main entrypoint for scraping and exporting downloaded content. 
A `Scraper` is structured to operate in the following stages:
@@ -77,43 +55,4 @@
             os.makedirs(self.path_coach_dir(username), exist_ok=True)
 
             await self.download_profile(username)
-            export = await self.export(username)
-            with open(self.path_coach_file(username, "export.json"), "w") as f:
-                json.dump(export, f)
-            self.log(
-                [
-                    (AnsiColor.INFO, "[INFO]"),
-                    (None, ": Finished exporting "),
-                    (AnsiColor.DATA, username),
-                ]
-            )
-
-    def path_coaches_dir(self):
-        """The root directory for all coach-related downloads."""
-        return os.path.join("data", self.site, "coaches")
-
-    def path_coach_dir(self, username: str):
-        """The root directory for a specific coach's downloads."""
-        return os.path.join(self.path_coaches_dir(), username)
-
-    def path_coach_file(self, username: str, filename: str):
-        """Path to a coach-specific file download."""
-        return os.path.join(self.path_coach_dir(username), filename)
-
-    def path_pages_dir(self):
-        """The root directory for all username listing files."""
-        return os.path.join("data", self.site, "pages")
-
-    def path_page_file(self, page_no: int):
-        """The root directory for usernames scraped from a single page."""
-        return os.path.join(self.path_pages_dir(), f"{page_no}.txt")
-
-    def log(self, msgs: List[Tuple[Union[AnsiColor, None], str]]):
-        transformed = []
-        for k, v in msgs:
-            if k is None:
-                transformed.append(v)
-            else:
-                transformed.append(f"{k.value}{v}{AnsiColor.RESET.value}")
-
-        print("".join(transformed))
+        return usernames
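
For reviewers, a note on how these pieces compose: `Repo` (app/repo.py)
owns the data/<site>/ filesystem layout and the colorized logging,
`BaseScraper` and `BaseExporter` both inherit from it, and
`BaseExporter.export()` is the template method each site module fills in
through the `export_fide_rapid` hook. A minimal sketch of wiring up
another site follows; the "example" site name and the `ExampleExporter`
class are hypothetical, for illustration only.

    # Hypothetical third site, illustrating the intended extension point.
    # Only the chesscom/lichess variants are defined in this patch.
    from typing import Union

    from app.exporter import BaseExporter


    class ExampleExporter(BaseExporter):
        def __init__(self, username: str):
            # `site` selects the data/<site>/... tree managed by `Repo`.
            super().__init__(site="example", username=username)

        def export_fide_rapid(self) -> Union[int, None]:
            # Returning None makes `BaseExporter.export()` omit the
            # "fide_rapid" key (the Export TypedDict uses total=False).
            return None


    # export() logs "[INFO]: Exported <username>" and returns the uniform
    # dict that __main__.py aggregates into data/<site>/export.json.
    print(ExampleExporter("some-coach").export())  # {}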