From bc2ffeae9dc8cfb8c38ad0af4d6d596461a38451 Mon Sep 17 00:00:00 2001
From: Joshua Potter
Date: Thu, 30 Nov 2023 15:36:44 -0700
Subject: [PATCH] Add a scraper for lichess. (#2)

---
 README.md       |  32 ++++++------
 app/__main__.py |   4 ++
 app/chesscom.py |   9 ++--
 app/lichess.py  | 128 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 153 insertions(+), 20 deletions(-)
 create mode 100644 app/lichess.py

diff --git a/README.md b/README.md
index faf8f74..03ed6e4 100644
--- a/README.md
+++ b/README.md
@@ -1,25 +1,29 @@
-# chesscom-scraper
+# coach-scraper
 
 **Caution! Be careful running this script.**
 
-We intentionally delay each batch of requests by 3 seconds. Make sure any
-adjustments to this script appropriately rate-limit.
+We intentionally delay each batch of requests. Make sure any adjustments to this
+script appropriately rate-limit.
 
 ## Overview
 
-This is a simple web scraper for [chess.com](https://www.chess.com/coaches)
-coaches. The program searches for all listed coaches as well as specific
-information about each of them (their profile, recent activity, and stats). The
-result will be found in a newly created `data` directory with the following
-structure:
+This is a simple web scraper for coaches listed on:
+
+* [chess.com](https://www.chess.com/coaches)
+* [lichess.org](https://www.lichess.org/coach)
+
+The program searches for coach usernames as well as specific information about
+each of them (their profile, recent activity, and stats). The result will be
+found in a newly created `data` directory with the following structure:
 ```
 data
-├── coach
-│   ├── <username>
-│   │   ├── <username>.html
-│   │   ├── activity.json
-│   │   └── stats.json
-│   ├── ...
+└── <site>
+│   ├── coaches
+│   │   ├── <username>
+│   │   │   ├── <username>.html
+│   │   │   ├── export.json
+│   │   │   └── ...
+│   │   ├── ...
 └── pages
     ├── <page_no>.txt
     ├── ...
diff --git a/app/__main__.py b/app/__main__.py
index f2ae0c9..f6f9a19 100644
--- a/app/__main__.py
+++ b/app/__main__.py
@@ -3,6 +3,7 @@ import argparse
 import asyncio
 
 from app.chesscom import Scraper as ChesscomScraper
+from app.lichess import Scraper as LichessScraper
 from app.scraper import Site
 
 
@@ -18,6 +19,7 @@ async def run():
         required=True,
         choices=[
             Site.CHESSCOM.value,
+            Site.LICHESS.value,
         ],
     )
     args = parser.parse_args()
@@ -27,6 +29,8 @@
     ) as session:
         if args.site == Site.CHESSCOM.value:
             scraper = ChesscomScraper(session)
+        elif args.site == Site.LICHESS.value:
+            scraper = LichessScraper(session)
 
         await scraper.scrape()
 
diff --git a/app/chesscom.py b/app/chesscom.py
index a2da421..9b24b95 100644
--- a/app/chesscom.py
+++ b/app/chesscom.py
@@ -179,15 +179,12 @@ class Scraper(BaseScraper):
 
     async def export(self, username: str) -> Export:
         """Transform coach-specific data into uniform format."""
-        stat_export: Export = {}
+        export: Export = {}
+
         try:
             with open(self.path_coach_file(username, "stats.json"), "r") as f:
-                stat_export = self._load_stats_json(json.load(f))
+                export.update(self._load_stats_json(json.load(f)))
         except FileNotFoundError:
             pass
 
-        export: Export = {
-            "fide_rapid": None,
-        }
-        export.update(stat_export)
         return export
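The `export` hunk above replaces the old `{"fide_rapid": None}` default with a dictionary that simply stays empty when `stats.json` was never downloaded. A minimal standalone sketch of that merge-with-fallback pattern follows; the `load_stats` helper and its key filter are hypothetical stand-ins for the repository's `_load_stats_json`, which this patch does not show:

```python
import json
from typing import Any, Dict


def load_stats(raw: Dict[str, Any]) -> Dict[str, Any]:
    # Hypothetical stand-in for Scraper._load_stats_json: pick out the
    # uniform-export keys from the raw stats payload.
    return {k: v for k, v in raw.items() if k.startswith("fide_")}


def export_coach(stats_path: str) -> Dict[str, Any]:
    export: Dict[str, Any] = {}
    try:
        with open(stats_path, "r") as f:
            export.update(load_stats(json.load(f)))
    except FileNotFoundError:
        pass  # No stats were scraped; leave the export empty.
    return export


# A coach with no downloaded stats now yields {} rather than
# {"fide_rapid": None}.
print(export_coach("missing/stats.json"))
```

One consequence of the change: downstream consumers should presumably treat a missing key, rather than an explicit `None` value, as "no data".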
diff --git a/app/lichess.py b/app/lichess.py
new file mode 100644
index 0000000..39eebf8
--- /dev/null
+++ b/app/lichess.py
@@ -0,0 +1,128 @@
+import aiohttp
+import asyncio
+import os
+import os.path
+
+from app.scraper import AnsiColor, BaseScraper, Export, Site
+from bs4 import BeautifulSoup
+from typing import List
+
+
+# The maximum number of pages we will iterate through. This number was
+# determined by going to https://lichess.org/coach/all/all/alphabetical
+# and traversing to the last page.
+MAX_PAGES = 162
+
+# How long to wait between each network request.
+SLEEP_SECS = 5
+
+
+class Scraper(BaseScraper):
+    def __init__(self, session: aiohttp.ClientSession):
+        super().__init__(site=Site.LICHESS.value, session=session)
+
+    async def download_usernames(self) -> List[str]:
+        """Scan through lichess.org/coach for all coaches' usernames.
+
+        @return
+            The complete list of scraped usernames across every coach listing
+            page.
+        """
+        usernames = []
+        for page_no in range(1, MAX_PAGES + 1):
+            filepath = self.path_page_file(page_no)
+            try:
+                with open(filepath, "r") as f:
+                    self.log(
+                        [
+                            (AnsiColor.INFO, "[INFO]"),
+                            (None, ": Reading file "),
+                            (AnsiColor.DATA, filepath),
+                        ]
+                    )
+                    usernames.extend([line.strip() for line in f.readlines()])
+            except FileNotFoundError:
+                page_usernames = await self._scrape_page(page_no)
+                if not page_usernames:
+                    self.log(
+                        [
+                            (AnsiColor.ERROR, "[ERROR]"),
+                            (None, ": Could not scrape page "),
+                            (AnsiColor.DATA, str(page_no)),
+                        ]
+                    )
+                    continue
+                with open(filepath, "w") as f:
+                    for username in page_usernames:
+                        f.write(f"{username}\n")
+                usernames.extend(page_usernames)
+                self.log(
+                    [
+                        (AnsiColor.INFO, "[INFO]"),
+                        (None, ": Downloaded page "),
+                        (AnsiColor.DATA, filepath),
+                    ]
+                )
+                await asyncio.sleep(SLEEP_SECS)
+
+        return usernames
+
+    async def _scrape_page(self, page_no: int) -> List[str]:
+        """Scan through lichess.org/coach/.../?page=<n> for all coaches'
+        usernames.
+
+        @param page_no
+            The page consisting of at most 10 coaches (at the time of writing)
+            whose usernames are to be scraped.
+        @return
+            The list of scraped usernames on the specified coach listing page.
+        """
+        url = f"https://lichess.org/coach/all/all/alphabetical?page={page_no}"
+        response, status_code = await self.request(url)
+        if response is None:
+            self.log(
+                [
+                    (AnsiColor.ERROR, "[ERROR]"),
+                    (None, ": Received status "),
+                    (AnsiColor.DATA, f"{status_code} "),
+                    (None, "when downloading page "),
+                    (AnsiColor.DATA, str(page_no)),
+                ]
+            )
+            return []
+
+        usernames = []
+        soup = BeautifulSoup(response, "html.parser")
+        members = soup.find_all("article", class_="coach-widget")
+        for member in members:
+            anchor = member.find("a", class_="overlay")
+            if anchor:
+                href = anchor.get("href")
+                username = href[len("/coach/") :]
+                usernames.append(username)
+
+        return usernames
+
+    async def download_profile(self, username: str):
+        """For each coach, download coach-specific data.
+
+        @param username
+            The coach username corresponding to the downloaded files.
+        """
+        filepath = self.path_coach_file(username, f"{username}.html")
+        if os.path.isfile(filepath):
+            return False
+
+        response, _unused_status = await self.request(
+            url=f"https://lichess.org/coach/{username}"
+        )
+        if response is not None:
+            with open(filepath, "w") as f:
+                f.write(response)
+
+        return True
+
+    async def export(self, username: str) -> Export:
+        """Transform coach-specific data into uniform format."""
+        export: Export = {}
+        return export
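The username extraction in `_scrape_page` can be exercised in isolation against static markup. The snippet below is a minimal sketch, not a real lichess page: the HTML is a hypothetical stand-in built only from the two selectors the scraper itself uses (`article.coach-widget` and its `a.overlay` anchor), and the coach names are invented.

```python
from bs4 import BeautifulSoup

# Hypothetical minimal markup mirroring the structure _scrape_page expects;
# a real listing page carries far more content per coach.
LISTING_HTML = """
<article class="coach-widget">
  <a class="overlay" href="/coach/alice"></a>
</article>
<article class="coach-widget">
  <a class="overlay" href="/coach/bob"></a>
</article>
"""

soup = BeautifulSoup(LISTING_HTML, "html.parser")
usernames = []
for member in soup.find_all("article", class_="coach-widget"):
    anchor = member.find("a", class_="overlay")
    if anchor:
        # Strip the "/coach/" prefix, exactly as the scraper does.
        usernames.append(anchor.get("href")[len("/coach/"):])

print(usernames)  # ['alice', 'bob']
```

Because the parsing step is pure (string in, list out), a check like this can run offline, without touching lichess.org or the rate-limited request path.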