From bc2ffeae9dc8cfb8c38ad0af4d6d596461a38451 Mon Sep 17 00:00:00 2001
From: Joshua Potter
Date: Thu, 30 Nov 2023 15:36:44 -0700
Subject: [PATCH] Add a scraper for lichess. (#2)

---
 README.md       |  32 ++++++------
 app/__main__.py |   4 ++
 app/chesscom.py |   9 ++--
 app/lichess.py  | 128 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 153 insertions(+), 20 deletions(-)
 create mode 100644 app/lichess.py

diff --git a/README.md b/README.md
index faf8f74..03ed6e4 100644
--- a/README.md
+++ b/README.md
@@ -1,25 +1,29 @@
-# chesscom-scraper
+# coach-scraper
 
 **Caution! Be careful running this script.**
 
-We intentionally delay each batch of requests by 3 seconds. Make sure any
-adjustments to this script appropriately rate-limit.
+We intentionally delay each batch of requests. Make sure any adjustments to this
+script appropriately rate-limit.
 
 ## Overview
 
-This is a simple web scraper for [chess.com](https://www.chess.com/coaches)
-coaches. The program searches for all listed coaches as well as specific
-information about each of them (their profile, recent activity, and stats). The
-result will be found in a newly created `data` directory with the following
-structure:
+This is a simple web scraper for coaches listed on:
+
+* [chess.com](https://www.chess.com/coaches)
+* [lichess.org](https://www.lichess.org/coach)
+
+The program searches for coach usernames as well as specific information about
+each of them (their profile, recent activity, and stats). The result will be
+found in a newly created `data` directory with the following structure:
 ```
 data
-├── coach
-│   ├── <username>
-│   │   ├── <username>.html
-│   │   ├── activity.json
-│   │   └── stats.json
-│   ├── ...
+└── <site>
+│   ├── coaches
+│   │   ├── <username>
+│   │   │   ├── <username>.html
+│   │   │   ├── export.json
+│   │   │   └── ...
+│   │   ├── ...
 └── pages
     ├── <page_no>.txt
     ├── ...
diff --git a/app/__main__.py b/app/__main__.py
index f2ae0c9..f6f9a19 100644
--- a/app/__main__.py
+++ b/app/__main__.py
@@ -3,6 +3,7 @@ import argparse
 import asyncio
 
 from app.chesscom import Scraper as ChesscomScraper
+from app.lichess import Scraper as LichessScraper
 from app.scraper import Site
 
 
@@ -18,6 +19,7 @@ async def run():
         required=True,
         choices=[
             Site.CHESSCOM.value,
+            Site.LICHESS.value,
         ],
     )
     args = parser.parse_args()
@@ -27,6 +29,8 @@
     ) as session:
         if args.site == Site.CHESSCOM.value:
             scraper = ChesscomScraper(session)
+        elif args.site == Site.LICHESS.value:
+            scraper = LichessScraper(session)
 
         await scraper.scrape()
 
diff --git a/app/chesscom.py b/app/chesscom.py
index a2da421..9b24b95 100644
--- a/app/chesscom.py
+++ b/app/chesscom.py
@@ -179,15 +179,12 @@ class Scraper(BaseScraper):
 
     async def export(self, username: str) -> Export:
         """Transform coach-specific data into uniform format."""
-        stat_export: Export = {}
+        export: Export = {}
+
         try:
             with open(self.path_coach_file(username, "stats.json"), "r") as f:
-                stat_export = self._load_stats_json(json.load(f))
+                export.update(self._load_stats_json(json.load(f)))
         except FileNotFoundError:
             pass
 
-        export: Export = {
-            "fide_rapid": None,
-        }
-        export.update(stat_export)
         return export
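The `export` hunk above replaces the old `{"fide_rapid": None}` default with a dictionary that simply stays empty when `stats.json` was never downloaded. A minimal standalone sketch of that merge-with-fallback pattern follows; the `load_stats` helper and its key filter are hypothetical stand-ins for the repository's `_load_stats_json`, which this patch does not show:

```python
import json
from typing import Any, Dict


def load_stats(raw: Dict[str, Any]) -> Dict[str, Any]:
    # Hypothetical stand-in for Scraper._load_stats_json: pick out the
    # uniform-export keys from the raw stats payload.
    return {k: v for k, v in raw.items() if k.startswith("fide_")}


def export_coach(stats_path: str) -> Dict[str, Any]:
    export: Dict[str, Any] = {}
    try:
        with open(stats_path, "r") as f:
            export.update(load_stats(json.load(f)))
    except FileNotFoundError:
        pass  # No stats were scraped; leave the export empty.
    return export


# A coach with no downloaded stats now yields {} rather than
# {"fide_rapid": None}.
print(export_coach("missing/stats.json"))
```

One consequence of the change: downstream consumers should presumably treat a missing key, rather than an explicit `None` value, as "no data".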
diff --git a/app/lichess.py b/app/lichess.py
new file mode 100644
index 0000000..39eebf8
--- /dev/null
+++ b/app/lichess.py
@@ -0,0 +1,128 @@
+import aiohttp
+import asyncio
+import os
+import os.path
+
+from app.scraper import AnsiColor, BaseScraper, Export, Site
+from bs4 import BeautifulSoup
+from typing import List
+
+
+# The maximum number of pages we will iterate through. This number was
+# determined by going to https://lichess.org/coach/all/all/alphabetical
+# and traversing to the last page.
+MAX_PAGES = 162
+
+# How long to wait between each network request.
+SLEEP_SECS = 5
+
+
+class Scraper(BaseScraper):
+    def __init__(self, session: aiohttp.ClientSession):
+        super().__init__(site=Site.LICHESS.value, session=session)
+
+    async def download_usernames(self) -> List[str]:
+        """Scan through lichess.org/coach for all coaches' usernames.
+
+        @return
+            The complete list of scraped usernames across every coach listing
+            page.
+        """
+        usernames = []
+        for page_no in range(1, MAX_PAGES + 1):
+            filepath = self.path_page_file(page_no)
+            try:
+                with open(filepath, "r") as f:
+                    self.log(
+                        [
+                            (AnsiColor.INFO, "[INFO]"),
+                            (None, ": Reading file "),
+                            (AnsiColor.DATA, filepath),
+                        ]
+                    )
+                    usernames.extend([line.strip() for line in f.readlines()])
+            except FileNotFoundError:
+                page_usernames = await self._scrape_page(page_no)
+                if not page_usernames:
+                    self.log(
+                        [
+                            (AnsiColor.ERROR, "[ERROR]"),
+                            (None, ": Could not scrape page "),
+                            (AnsiColor.DATA, str(page_no)),
+                        ]
+                    )
+                    continue
+                with open(filepath, "w") as f:
+                    for username in page_usernames:
+                        f.write(f"{username}\n")
+                usernames.extend(page_usernames)
+                self.log(
+                    [
+                        (AnsiColor.INFO, "[INFO]"),
+                        (None, ": Downloaded page "),
+                        (AnsiColor.DATA, filepath),
+                    ]
+                )
+                await asyncio.sleep(SLEEP_SECS)
+
+        return usernames
+
+    async def _scrape_page(self, page_no: int) -> List[str]:
+        """Scan through lichess.org/coach/.../?page=<n> for all coaches'
+        usernames.
+
+        @param page_no
+            The page consisting of at most 10 coaches (at the time of writing)
+            whose usernames are to be scraped.
+        @return
+            The list of scraped usernames on the specified coach listing page.
+        """
+        url = f"https://lichess.org/coach/all/all/alphabetical?page={page_no}"
+        response, status_code = await self.request(url)
+        if response is None:
+            self.log(
+                [
+                    (AnsiColor.ERROR, "[ERROR]"),
+                    (None, ": Received status "),
+                    (AnsiColor.DATA, f"{status_code} "),
+                    (None, "when downloading page "),
+                    (AnsiColor.DATA, str(page_no)),
+                ]
+            )
+            return []
+
+        usernames = []
+        soup = BeautifulSoup(response, "html.parser")
+        members = soup.find_all("article", class_="coach-widget")
+        for member in members:
+            anchor = member.find("a", class_="overlay")
+            if anchor:
+                href = anchor.get("href")
+                username = href[len("/coach/") :]
+                usernames.append(username)
+
+        return usernames
+
+    async def download_profile(self, username: str):
+        """For each coach, download coach-specific data.
+
+        @param username
+            The coach username corresponding to the downloaded files.
+        """
+        filepath = self.path_coach_file(username, f"{username}.html")
+        if os.path.isfile(filepath):
+            return False
+
+        response, _unused_status = await self.request(
+            url=f"https://lichess.org/coach/{username}"
+        )
+        if response is not None:
+            with open(filepath, "w") as f:
+                f.write(response)
+
+        return True
+
+    async def export(self, username: str) -> Export:
+        """Transform coach-specific data into uniform format."""
+        export: Export = {}
+        return export
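The username extraction in `_scrape_page` can be exercised in isolation against static markup. The snippet below is a minimal sketch, not a real lichess page: the HTML is a hypothetical stand-in built only from the two selectors the scraper itself uses (`article.coach-widget` and its `a.overlay` anchor), and the coach names are invented.

```python
from bs4 import BeautifulSoup

# Hypothetical minimal markup mirroring the structure _scrape_page expects;
# a real listing page carries far more content per coach.
LISTING_HTML = """
<article class="coach-widget">
  <a class="overlay" href="/coach/alice"></a>
</article>
<article class="coach-widget">
  <a class="overlay" href="/coach/bob"></a>
</article>
"""

soup = BeautifulSoup(LISTING_HTML, "html.parser")
usernames = []
for member in soup.find_all("article", class_="coach-widget"):
    anchor = member.find("a", class_="overlay")
    if anchor:
        # Strip the "/coach/" prefix, exactly as the scraper does.
        usernames.append(anchor.get("href")[len("/coach/"):])

print(usernames)  # ['alice', 'bob']
```

Because the parsing step is pure (string in, list out), a check like this can run offline, without touching lichess.org or the rate-limited request path.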