Add a scraper for lichess. (#2)

pull/3/head v0.1.2
Joshua Potter 2023-11-30 15:36:44 -07:00 committed by GitHub
parent 10801b560c
commit bc2ffeae9d
4 changed files with 153 additions and 20 deletions

README.md

@@ -1,25 +1,29 @@
-# chesscom-scraper
+# coach-scraper
 **Caution! Be careful running this script.**
-We intentionally delay each batch of requests by 3 seconds. Make sure any
-adjustments to this script appropriately rate-limit.
+We intentionally delay each batch of requests. Make sure any adjustments to this
+script appropriately rate-limit.
 ## Overview
-This is a simple web scraper for [chess.com](https://www.chess.com/coaches)
-coaches. The program searches for all listed coaches as well as specific
-information about each of them (their profile, recent activity, and stats). The
-result will be found in a newly created `data` directory with the following
-structure:
+This is a simple web scraper for coaches listed on:
+* [chess.com](https://www.chess.com/coaches)
+* [lichess.org](https://www.lichess.org/coach)
+The program searches for coach usernames as well as specific information about
+each of them (their profile, recent activity, and stats). The result will be
+found in a newly created `data` directory with the following structure:
 ```
 data
-├── coach
-│   ├── <username>
-│   │   ├── <username>.html
-│   │   ├── activity.json
-│   │   └── stats.json
-│   ├── ...
+└── <site>
+│   ├── coaches
+│   │   ├── <username>
+│   │   │   ├── <username>.html
+│   │   │   ├── export.json
+│   │   │   └── ...
+│   │   ├── ...
+└── pages
+    ├── <n>.txt
+    ├── ...
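Once a run completes, the exports can be consumed straight from this tree. Below is a minimal sketch that assumes only the directory layout shown above; the iteration and printing are illustrative and not part of the project.

```python
# Minimal sketch: read every data/<site>/coaches/<username>/export.json
# produced by a scrape. Only the directory layout above is assumed.
import json
from pathlib import Path

for export_path in sorted(Path("data").glob("*/coaches/*/export.json")):
    site, username = export_path.parts[1], export_path.parts[3]
    with export_path.open() as f:
        export = json.load(f)
    print(f"{site}/{username}: {export}")
```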


@@ -3,6 +3,7 @@ import argparse
 import asyncio
 from app.chesscom import Scraper as ChesscomScraper
+from app.lichess import Scraper as LichessScraper
 from app.scraper import Site
@@ -18,6 +19,7 @@ async def run():
         required=True,
         choices=[
             Site.CHESSCOM.value,
+            Site.LICHESS.value,
         ],
     )
     args = parser.parse_args()
@@ -27,6 +29,8 @@ async def run():
     ) as session:
         if args.site == Site.CHESSCOM.value:
             scraper = ChesscomScraper(session)
+        elif args.site == Site.LICHESS.value:
+            scraper = LichessScraper(session)
         await scraper.scrape()

app/chesscom.py

@@ -179,15 +179,12 @@ class Scraper(BaseScraper):
     async def export(self, username: str) -> Export:
         """Transform coach-specific data into uniform format."""
-        stat_export: Export = {}
+        export: Export = {}
         try:
             with open(self.path_coach_file(username, "stats.json"), "r") as f:
-                stat_export = self._load_stats_json(json.load(f))
+                export.update(self._load_stats_json(json.load(f)))
         except FileNotFoundError:
             pass
-        export: Export = {
-            "fide_rapid": None,
-        }
-        export.update(stat_export)
         return export
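Note that with this change `"fide_rapid"` is no longer guaranteed to appear in the returned export; it is only present when `stats.json` supplied it. A hedged sketch of defensive access follows (the helper is hypothetical; `Export` is the type imported in the diffs above).

```python
# Hypothetical helper, not part of the diff: read "fide_rapid" defensively now
# that the key may be absent from an export.
from app.scraper import Export


def fide_rapid_of(export: Export):
    return export.get("fide_rapid")  # None when stats.json had no rating
```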

app/lichess.py Normal file

@@ -0,0 +1,128 @@
import aiohttp
import asyncio
import os
import os.path

from app.scraper import AnsiColor, BaseScraper, Export, Site
from bs4 import BeautifulSoup
from typing import List

# The number of pages we will at most iterate through. This number was
# determined by going to https://lichess.org/coach/all/all/alphabetical
# and traversing to the last page.
MAX_PAGES = 162

# How long to wait between each network request.
SLEEP_SECS = 5


class Scraper(BaseScraper):
    def __init__(self, session: aiohttp.ClientSession):
        super().__init__(site=Site.LICHESS.value, session=session)

    async def download_usernames(self) -> List[str]:
        """Scan through lichess.org/coach for all coaches' usernames.

        @return
            The complete list of scraped usernames across every coach listing
            page.
        """
        usernames = []
        for page_no in range(1, MAX_PAGES + 1):
            filepath = self.path_page_file(page_no)
            try:
                with open(filepath, "r") as f:
                    self.log(
                        [
                            (AnsiColor.INFO, "[INFO]"),
                            (None, ": Reading file "),
                            (AnsiColor.DATA, filepath),
                        ]
                    )
                    usernames.extend([line.strip() for line in f.readlines()])
            except FileNotFoundError:
                page_usernames = await self._scrape_page(page_no)
                if not page_usernames:
                    self.log(
                        [
                            (AnsiColor.ERROR, "[ERROR]"),
                            (None, ": Could not scrape page "),
                            (AnsiColor.DATA, str(page_no)),
                        ]
                    )
                    continue
                with open(filepath, "w") as f:
                    for username in page_usernames:
                        f.write(f"{username}\n")
                usernames.extend(page_usernames)
                self.log(
                    [
                        (AnsiColor.INFO, "[INFO]"),
                        (None, ": Downloaded page "),
                        (AnsiColor.DATA, filepath),
                    ]
                )
                await asyncio.sleep(SLEEP_SECS)

        return usernames

    async def _scrape_page(self, page_no: int):
        """Scan through lichess.org/coach/.../?page=<n> for all coaches'
        usernames.

        @param page_no
            The page consisting of at most 10 coaches (at the time of writing)
            whose usernames are to be scraped.
        @return
            The list of scraped usernames on the specified coach listing page.
        """
        url = f"https://lichess.org/coach/all/all/alphabetical?page={page_no}"
        response, status_code = await self.request(url)
        if response is None:
            self.log(
                [
                    (AnsiColor.ERROR, "[ERROR]"),
                    (None, ": Received status "),
                    (AnsiColor.DATA, f"{status_code} "),
                    (None, "when downloading page "),
                    (AnsiColor.DATA, str(page_no)),
                ]
            )
            return

        usernames = []
        soup = BeautifulSoup(response, "html.parser")
        members = soup.find_all("article", class_="coach-widget")
        for member in members:
            anchor = member.find("a", class_="overlay")
            if anchor:
                href = anchor.get("href")
                username = href[len("/coach/") :]
                usernames.append(username)

        return usernames

    async def download_profile(self, username: str):
        """For each coach, download coach-specific data.

        @param username
            The coach username corresponding to the downloaded files.
        """
        filepath = self.path_coach_file(username, f"{username}.html")
        if os.path.isfile(filepath):
            return False

        response, _unused_status = await self.request(
            url=f"https://lichess.org/coach/{username}"
        )
        if response is not None:
            with open(filepath, "w") as f:
                f.write(response)

        return True

    async def export(self, username: str) -> Export:
        """Transform coach-specific data into uniform format."""
        export: Export = {}
        return export
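For completeness, here is a minimal driver sketch wiring this scraper up the same way the entry-point diff above does. The bare `aiohttp.ClientSession()` is an assumption, since the real entry point may configure the session (headers, timeouts, etc.).

```python
# Minimal sketch: run the lichess scraper end to end. Assumes BaseScraper
# provides the async scrape() method awaited in the entry-point diff above;
# the unconfigured ClientSession is an assumption.
import asyncio

import aiohttp

from app.lichess import Scraper as LichessScraper


async def run():
    async with aiohttp.ClientSession() as session:
        scraper = LichessScraper(session)
        await scraper.scrape()


if __name__ == "__main__":
    asyncio.run(run())
```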