Generalize in anticipation of merging the lichess scraper. (#1)

* Add a general `Scraper` class. * Set up main as primary entrypoint. * Abstract original scraper into scraper class. * Add better logging and cleaner bash commands. * Ensure exporting works.
2023-11-30 15:15:15 -07:00 · 2023-11-30 15:15:15 -07:00 · 10801b560c
parent 3cc31f8f24
commit 10801b560c
6 changed files with 337 additions and 181 deletions
--- a/README.md
+++ b/README.md
@ -29,15 +29,12 @@ data

 If you have nix available, run:
 ```bash
-$> nix build
-$> result/bin/app --user-agent <your-email>
+$> nix run . -- --user-agent <your-email> -s chesscom
 ```
 If not, ensure you have [poetry](https://python-poetry.org/) on your machine and
 instead run the following:
 ```bash
-$> poetry install
-$> source $(poetry env info --path)/bin/activate
-$> python3 -m app
+$> poetry run python3 -m app -u <your-email> -s chesscom
 ```

 ## Development
--- a/app/main.py
+++ b/app/main.py
@ -1,4 +1,39 @@
-from app import scraper
+import aiohttp
+import argparse
+import asyncio
+
+from app.chesscom import Scraper as ChesscomScraper
+from app.scraper import Site
+
+
+async def run():
+    parser = argparse.ArgumentParser(
+        prog="coach-scraper",
+        description="HTML scraping of chess.com coaches.",
+    )
+    parser.add_argument("-u", "--user-agent", required=True)
+    parser.add_argument(
+        "-s",
+        "--site",
+        required=True,
+        choices=[
+            Site.CHESSCOM.value,
+        ],
+    )
+    args = parser.parse_args()
+
+    async with aiohttp.ClientSession(
+        headers={"User-Agent": f"BoardWise coach-scraper ({args.user_agent})"}
+    ) as session:
+        if args.site == Site.CHESSCOM.value:
+            scraper = ChesscomScraper(session)
+
+        await scraper.scrape()
+
+
+def main():
+    asyncio.run(run())
+

 if __name__ == "__main__":
-    scraper.run()
+    main()
--- a/app/chesscom.py
+++ b/app/chesscom.py
@ -0,0 +1,193 @@
+import aiohttp
+import asyncio
+import json
+import os
+import os.path
+
+from app.scraper import AnsiColor, BaseScraper, Export, Site
+from bs4 import BeautifulSoup
+from typing import List
+
+
+# The number of coach listing pages we will at most iterate through. This number
+# was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and
+# traversing to the last page.
+MAX_PAGES = 64
+
+# How long to wait between batches of network requests.
+SLEEP_SECS = 3
+
+
+class Scraper(BaseScraper):
+    def __init__(self, session: aiohttp.ClientSession):
+        super().__init__(site=Site.CHESSCOM.value, session=session)
+
+    async def download_usernames(self) -> List[str]:
+        """Scan through chess.com/coaches for all coaches' usernames.
+
+        @return
+            The complete list of scraped usernames across every coach listing
+            page.
+        """
+        usernames = []
+        for page_no in range(1, MAX_PAGES + 1):
+            filepath = self.path_page_file(page_no)
+            try:
+                with open(filepath, "r") as f:
+                    self.log(
+                        [
+                            (AnsiColor.INFO, "[INFO]"),
+                            (None, ": Reading file "),
+                            (AnsiColor.DATA, filepath),
+                        ]
+                    )
+                    usernames.extend([line.strip() for line in f.readlines()])
+            except FileNotFoundError:
+                page_usernames = await self._scrape_page(page_no)
+                if not page_usernames:
+                    self.log(
+                        [
+                            (AnsiColor.ERROR, "[ERROR]"),
+                            (None, ": Could not scrape page "),
+                            (AnsiColor.DATA, str(page_no)),
+                        ]
+                    )
+                    continue
+                with open(filepath, "w") as f:
+                    for username in page_usernames:
+                        f.write(f"{username}\n")
+                usernames.extend(page_usernames)
+                self.log(
+                    [
+                        (AnsiColor.INFO, "[INFO]"),
+                        (None, ": Downloaded page "),
+                        (AnsiColor.DATA, filepath),
+                    ]
+                )
+                await asyncio.sleep(SLEEP_SECS)
+
+        return usernames
+
+    async def _scrape_page(self, page_no: int) -> List[str]:
+        """Scan through chess.com/coaches/?page=<n> for all coaches' usernames.
+
+        @param page_no
+            The page consisting of at most 25 coaches (at the time of writing)
+            whose usernames are to be scraped.
+        @return
+            The list of scraped usernames on the specified coach listing page.
+        """
+        url = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
+        response, status_code = await self.request(url)
+        if response is None:
+            self.log(
+                [
+                    (AnsiColor.ERROR, "[ERROR]"),
+                    (None, ": Received status "),
+                    (AnsiColor.DATA, f"{status_code} "),
+                    (None, "when downloading page "),
+                    (AnsiColor.DATA, str(page_no)),
+                ]
+            )
+            return
+
+        usernames = []
+        soup = BeautifulSoup(response, "html.parser")
+        members = soup.find_all("a", class_="members-categories-username")
+        for member in members:
+            href = member.get("href")
+            username = href[len("https://www.chess.com/member/") :]
+            usernames.append(username)
+
+        return usernames
+
+    async def download_profile(self, username: str):
+        """For each coach, download coach-specific data.
+
+        This sends three parallel requests for:
+        * the coach's profile,
+        * the coach's recent activity,
+        * the coach's stats.
+
+        @param username
+            The coach username corresponding to the downloaded files.
+        """
+        used_network = await asyncio.gather(
+            self._download_profile_file(
+                url=f"https://www.chess.com/member/{username}",
+                username=username,
+                filename=self.path_coach_file(username, f"{username}.html"),
+            ),
+            self._download_profile_file(
+                url=f"https://www.chess.com/callback/member/activity/{username}?page=1",
+                username=username,
+                filename=self.path_coach_file(username, "activity.json"),
+            ),
+            self._download_profile_file(
+                url=f"https://www.chess.com/callback/member/stats/{username}",
+                username=username,
+                filename=self.path_coach_file(username, "stats.json"),
+            ),
+        )
+        if any(used_network):
+            self.log(
+                [
+                    (AnsiColor.INFO, "[INFO]"),
+                    (None, ": Downloaded data for coach "),
+                    (AnsiColor.DATA, username),
+                ]
+            )
+            await asyncio.sleep(SLEEP_SECS)
+        else:
+            self.log(
+                [
+                    (AnsiColor.INFO, "[INFO]"),
+                    (None, ": Skipping download for coach "),
+                    (AnsiColor.DATA, username),
+                ]
+            )
+
+    async def _download_profile_file(self, url: str, username: str, filename: str):
+        """Writes the contents of url into the specified file.
+
+        @param url
+            The URL of the file to download.
+        @param username
+            The coach username corresponding to the downloaded file.
+        @param filename
+            The output file to write the downloaded content to.
+        @return:
+            True if we make a network request. False otherwise.
+        """
+        if os.path.isfile(filename):
+            return False
+
+        response, _unused_status = await self.request(url)
+        if response is not None:
+            with open(filename, "w") as f:
+                f.write(response)
+
+        return True
+
+    def _load_stats_json(self, stats: dict) -> Export:
+        """Extract relevant fields from a `stats.json` file."""
+        export: Export = {}
+        for stat in stats.get("stats", []):
+            if stat["key"] == "rapid":
+                export["fide_rapid"] = stat["stats"]["rating"]
+        return export
+
+    async def export(self, username: str) -> Export:
+        """Transform coach-specific data into uniform format."""
+        stat_export: Export = {}
+        try:
+            with open(self.path_coach_file(username, "stats.json"), "r") as f:
+                stat_export = self._load_stats_json(json.load(f))
+        except FileNotFoundError:
+            pass
+
+        export: Export = {
+            "fide_rapid": None,
+        }
+        export.update(stat_export)
+        return export
--- a/app/scraper.py
+++ b/app/scraper.py
@ -1,200 +1,119 @@
 import aiohttp
-import argparse
-import asyncio
+import enum
+import json
 import os
-import os.path

-from bs4 import BeautifulSoup
+from typing import List, Tuple, Union
+from typing_extensions import TypedDict


-# The root directory containing downloaded files for a coach.
-DATA_COACH_DIR = "data/coach/{username}"
-
-# Where a part of coach-related data is stored.
-DATA_COACH_FILE = "data/coach/{username}/{filename}"
-
-# Where a part of all discovered coach usernames is stored.
-DATA_COACH_LIST = "data/pages/{page_no}.txt"
-
-# The "User-Agent" value set in every request to chess.com.
-USER_AGENT = "BoardWise chesscom-scraper ({user_agent})"
-
-# How long to wait between a batch of network requests.
-SLEEP_SECS = 3
+class Site(enum.Enum):
+    CHESSCOM = "chesscom"
+    LICHESS = "lichess"


-def ANSI_COLOR(s: str):
-    """Print colored output to the console."""
-    return f"\033[0;34m{s}\033[0m"  # Blue
+class AnsiColor(enum.Enum):
+    ERROR = "\033[0;31m"
+    INFO = "\033[0;34m"
+    DATA = "\033[0;36m"
+    RESET = "\033[0m"


-async def chesscom_request(session: aiohttp.ClientSession, url: str):
-    """Convenience function for network requests to chess.com.
+class Export(TypedDict, total=False):
+    fide_rapid: Union[int, None]

-    @param session
+
+class BaseScraper:
+    def __init__(self, site: str, session: aiohttp.ClientSession):
+        """Initialize a new web scraper and exporter.
+
+        @param site:
+            The site we are making requests out to.
+        @param session:
            The `aiohttp.ClientSession` context our requests are made from.
-    @param url
-        The URL to send a request to.
-    @return
-        The text response returned by the server at @url.
        """
-    async with session.get(url) as response:
+        self.site = site
+        self.session = session
+
+    async def download_usernames(self) -> List[str]:
+        """Collect all coach usernames from the specified site."""
+        raise NotImplementedError()
+
+    async def download_profile(self, username: str):
+        """For each coach, download coach-specific data."""
+        raise NotImplementedError()
+
+    async def export(self, username: str) -> Export:
+        """Transform coach-specific data into uniform format."""
+        raise NotImplementedError()
+
+    async def request(self, url: str) -> Tuple[Union[str, None], int]:
+        """Make network requests using the internal session.
+
+        @param url
+            The URL to make a GET request to.
+        @return
+            Tuple containing the response body (if the request was successful)
+            and status code.
+        """
+        async with self.session.get(url) as response:
            if response.status == 200:
-            return await response.text()
-        print(f"Encountered {response.status} when retrieving {url}.")
+                return await response.text(), 200
+        return None, response.status

+    async def scrape(self):
+        """Main entrypoint for scraping and exporting downloaded content.

-async def _scrape_page_coach_usernames(session: aiohttp.ClientSession, page_no: int):
-    """Scan through chess.com/coaches/?page=<n> for all coaches' usernames.
+        A `Scraper` is structured to operate in the following stages:

-    @param session
-        The `aiohttp.ClientSession` context our requests are made from.
-    @param page_no
-        The page consisting of at most 25 coaches (at the time of writing)
-        whose usernames are to be scraped.
-    @return
-        The list of scraped usernames on the specified coach listing page.
+        1. Collect all coach usernames from the specified site.
+        2. For each coach, download coach-specific data.
+        3. Transform this data and export into uniform format.
        """
-    url = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
-    response = await chesscom_request(session, url)
-    if response is None:
-        return
+        os.makedirs(self.path_coaches_dir(), exist_ok=True)
+        os.makedirs(self.path_pages_dir(), exist_ok=True)
+        usernames = await self.download_usernames()
+        for username in usernames:
+            os.makedirs(self.path_coach_dir(username), exist_ok=True)
+            await self.download_profile(username)

-    usernames = []
-    soup = BeautifulSoup(response, "html.parser")
-    members = soup.find_all("a", class_="members-categories-username")
-    for member in members:
-        href = member.get("href")
-        username = href[len("https://www.chess.com/member/") :]
-        usernames.append(username)
-
-    return usernames
-
-
-async def _scrape_all_coach_usernames(
-    session: aiohttp.ClientSession, max_pages: int = 64
-):
-    """Scan through chess.com/coaches for all coaches' usernames.
-
-    @param session
-        The `aiohttp.ClientSession` context our requests are made from.
-    @param max_pages
-        The number of pages we will at most iterate through. This number was
-        determined by going to chess.com/coaches?sortBy=alphabetical&page=1
-        and traversing to the last page.
-    @return
-        The complete list of scraped usernames across every coach listing page.
-    """
-    usernames = []
-    for page_no in range(1, max_pages + 1):
-        filepath = DATA_COACH_LIST.format(page_no=page_no)
-        try:
-            with open(filepath, "r") as f:
-                usernames.extend(f.readlines())
-            print(f"Skipping {ANSI_COLOR(filepath)}")
-        except FileNotFoundError:
-            page_usernames = await _scrape_page_coach_usernames(session, page_no)
-            if not page_usernames:
-                print(f"Could not write {ANSI_COLOR(filepath)}")
-                continue
-            with open(filepath, "w") as f:
-                for username in page_usernames:
-                    f.write(f"{username}\n")
-            usernames.extend(page_usernames)
-            print(f"Downloaded {ANSI_COLOR(filepath)}")
-            await asyncio.sleep(SLEEP_SECS)
-
-    return usernames
-
-
-async def _download_coach_file(
-    session: aiohttp.ClientSession, url: str, username: str, filename: str
-):
-    """Writes the contents of @url into `DATA_COACH_FILE`.
-
-    @param session
-        The `aiohttp.ClientSession` context our requests are made from.
-    @param url
-        The URL of the file to download.
-    @param username
-        The coach username corresponding to the downloaded file.
-    @param filename
-        The output file to write the downloaded content to.
-    @return:
-        True if we make a network request. False otherwise.
-    """
-    filepath = DATA_COACH_FILE.format(username=username, filename=filename)
-    if os.path.isfile(filepath):
-        return False
-
-    response = await chesscom_request(session, url)
-    if response is not None:
-        with open(filepath, "w") as f:
-            f.write(response)
-    return True
-
-
-async def _download_coach_data(session: aiohttp.ClientSession, username: str):
-    """Download coach-related data to the `DATA_COACH_DIR` directory.
-
-    This sends three parallel requests for:
-    * the coach's profile,
-    * the coach's recent activity,
-    * the coach's stats.
-
-    @param session
-        The `aiohttp.ClientSession` context our requests are made from.
-    @param username
-        The coach username corresponding to the downloaded files.
-    """
-    used_network = await asyncio.gather(
-        _download_coach_file(
-            session,
-            url=f"https://www.chess.com/member/{username}",
-            username=username,
-            filename=f"{username}.html",
-        ),
-        _download_coach_file(
-            session,
-            url=f"https://www.chess.com/callback/member/activity/{username}?page=1",
-            username=username,
-            filename="activity.json",
-        ),
-        _download_coach_file(
-            session,
-            url=f"https://www.chess.com/callback/member/stats/{username}",
-            username=username,
-            filename="stats.json",
-        ),
+            export = await self.export(username)
+            with open(self.path_coach_file(username, "export.json"), "w") as f:
+                json.dump(export, f)
+            self.log(
+                [
+                    (AnsiColor.INFO, "[INFO]"),
+                    (None, ": Finished exporting "),
+                    (AnsiColor.DATA, username),
+                ]
            )
-    if any(used_network):
-        print(f"Downloaded {ANSI_COLOR(username)}")
-        await asyncio.sleep(SLEEP_SECS)
+
+    def path_coaches_dir(self):
+        """The root directory for all coach-related downloads."""
+        return os.path.join("data", self.site, "coaches")
+
+    def path_coach_dir(self, username: str):
+        """The root directory for a specific coach's downloads."""
+        return os.path.join(self.path_coaches_dir(), username)
+
+    def path_coach_file(self, username: str, filename: str):
+        """Path to a coach-specific file download."""
+        return os.path.join(self.path_coach_dir(username), filename)
+
+    def path_pages_dir(self):
+        """The root directory for all username listing files."""
+        return os.path.join("data", self.site, "pages")
+
+    def path_page_file(self, page_no: int):
+        """Path to the file listing usernames scraped from a single page."""
+        return os.path.join(self.path_pages_dir(), f"{page_no}.txt")
+
+    def log(self, msgs: List[Tuple[Union[AnsiColor, None], str]]):
+        transformed = []
+        for k, v in msgs:
+            if k is None:
+                transformed.append(v)
            else:
-        print(f"Skipping {ANSI_COLOR(username)}")
+                transformed.append(f"{k.value}{v}{AnsiColor.RESET.value}")

-
-async def _scrape():
-    parser = argparse.ArgumentParser(
-        prog="chesscom-scraper",
-        description="HTML scraping of chess.com coaches.",
-    )
-    parser.add_argument("-u", "--user-agent", required=True)
-    args = parser.parse_args()
-
-    os.makedirs("data/pages", exist_ok=True)
-    os.makedirs("data/coach", exist_ok=True)
-
-    async with aiohttp.ClientSession(
-        headers={"User-Agent": USER_AGENT.format(user_agent=args.user_agent)}
-    ) as session:
-        # Retrieve all coaches on the platform.
-        usernames = await _scrape_all_coach_usernames(session)
-        # For each coach, download relevant data.
-        for username in [u.strip() for u in usernames]:
-            os.makedirs(DATA_COACH_DIR.format(username=username), exist_ok=True)
-            await _download_coach_data(session, username)
-
-
-def run():
-    asyncio.run(_scrape())
+        print("".join(transformed))
--- a/poetry.lock
+++ b/poetry.lock
@ -345,6 +345,17 @@ files = [
    {file = "types_html5lib-1.1.11.15-py3-none-any.whl", hash = "sha256:16fe936d99b9f7fc210e2e21a2aed1b6bbbc554ad8242a6ef75f6f2bddb27e58"},
 ]

+[[package]]
+name = "typing-extensions"
+version = "4.8.0"
+description = "Backported and Experimental Type Hints for Python 3.8+"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "typing_extensions-4.8.0-py3-none-any.whl", hash = "sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0"},
+    {file = "typing_extensions-4.8.0.tar.gz", hash = "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef"},
+]
+
 [[package]]
 name = "yarl"
 version = "1.9.3"
@ -451,4 +462,4 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "987c0a45c65fc281154469d795a5dc2828af5fa55226a1688466b71bf4327e3e"
+content-hash = "04db01ae29bbc78abf48f0ae23d60db56da274aea1b281c7aeaca0e705162114"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -12,10 +12,11 @@ aiohttp = "^3.8.6"

 [tool.poetry.group.dev.dependencies]
 types-beautifulsoup4 = "^4.12.0.7"
+typing-extensions = "^4.8.0"

 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"

 [tool.poetry.scripts]
-app = "app.scraper:run"
+app = "app.__main__:main"