From 10801b560ca65c4395d8be06b6c7d3e5ed83e171 Mon Sep 17 00:00:00 2001 From: Joshua Potter Date: Thu, 30 Nov 2023 15:15:15 -0700 Subject: [PATCH] Generalize in anticipation of merging the lichess scraper. (#1) * Add a general `Scraper` class. * Setup main as primary entrypoint. * Abstract original scraper into scraper class. * Add better logging and cleaner bash commands. * Ensure exporting works. --- README.md | 7 +- app/__main__.py | 39 ++++++- app/chesscom.py | 193 +++++++++++++++++++++++++++++++++++ app/scraper.py | 263 +++++++++++++++++------------------------------- poetry.lock | 13 ++- pyproject.toml | 3 +- 6 files changed, 337 insertions(+), 181 deletions(-) create mode 100644 app/chesscom.py diff --git a/README.md b/README.md index eb1a7cf..faf8f74 100644 --- a/README.md +++ b/README.md @@ -29,15 +29,12 @@ data If you have nix available, run: ```bash -$> nix build -$> result/bin/app --user-agent +$> nix run . -- --user-agent -s chesscom ``` If not, ensure you have [poetry](https://python-poetry.org/) on your machine and instead run the following: ```bash -$> poetry install -$> source $(poetry env info --path)/bin/activate -$> python3 -m app +$> poetry run python3 -m app -u -s chesscom ``` ## Development diff --git a/app/__main__.py b/app/__main__.py index 06e23b7..f2ae0c9 100644 --- a/app/__main__.py +++ b/app/__main__.py @@ -1,4 +1,39 @@ -from app import scraper +import aiohttp +import argparse +import asyncio + +from app.chesscom import Scraper as ChesscomScraper +from app.scraper import Site + + +async def run(): + parser = argparse.ArgumentParser( + prog="coach-scraper", + description="HTML scraping of chess.com coaches.", + ) + parser.add_argument("-u", "--user-agent", required=True) + parser.add_argument( + "-s", + "--site", + required=True, + choices=[ + Site.CHESSCOM.value, + ], + ) + args = parser.parse_args() + + async with aiohttp.ClientSession( + headers={"User-Agent": f"BoardWise coach-scraper ({args.user_agent})"} + ) as session: + if args.site == Site.CHESSCOM.value: + scraper = ChesscomScraper(session) + + await scraper.scrape() + + +def main(): + asyncio.run(run()) + if __name__ == "__main__": - scraper.run() + main() diff --git a/app/chesscom.py b/app/chesscom.py new file mode 100644 index 0000000..a2da421 --- /dev/null +++ b/app/chesscom.py @@ -0,0 +1,193 @@ +import aiohttp +import asyncio +import json +import os +import os.path + +from app.scraper import AnsiColor, BaseScraper, Export, Site +from bs4 import BeautifulSoup +from typing import List + + +# The number of coach listing pages we will at most iterate through. This number +# was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and +# traversing to the last page. +MAX_PAGES = 64 + +# How long to wait between a batch of network requests. +SLEEP_SECS = 3 + + +class Scraper(BaseScraper): + def __init__(self, session: aiohttp.ClientSession): + super().__init__(site=Site.CHESSCOM.value, session=session) + + async def download_usernames(self) -> List[str]: + """Scan through chess.com/coaches for all coaches' usernames. + + @return + The complete list of scraped usernames across every coach listing + page. 
+ """ + usernames = [] + for page_no in range(1, MAX_PAGES + 1): + filepath = self.path_page_file(page_no) + try: + with open(filepath, "r") as f: + self.log( + [ + (AnsiColor.INFO, "[INFO]"), + (None, ": Reading file "), + (AnsiColor.DATA, filepath), + ] + ) + usernames.extend([line.strip() for line in f.readlines()]) + except FileNotFoundError: + page_usernames = await self._scrape_page(page_no) + if not page_usernames: + self.log( + [ + (AnsiColor.ERROR, "[ERROR]"), + (None, ": Could not scrape page "), + (AnsiColor.DATA, str(page_no)), + ] + ) + continue + with open(filepath, "w") as f: + for username in page_usernames: + f.write(f"{username}\n") + usernames.extend(page_usernames) + self.log( + [ + (AnsiColor.INFO, "[INFO]"), + (None, ": Downloaded page "), + (AnsiColor.DATA, filepath), + ] + ) + await asyncio.sleep(SLEEP_SECS) + + return usernames + + async def _scrape_page(self, page_no: int) -> List[str]: + """Scan through chess.com/coaches/?page= for all coaches' usernames. + + @param page_no + The page consisting of at most 25 coaches (at the time of writing) + whose usernames are to be scraped. + @return + The list of scraped usernames on the specified coach listing page. + """ + url = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}" + response, status_code = await self.request(url) + if response is None: + self.log( + [ + (AnsiColor.ERROR, "[ERROR]"), + (None, ": Received status "), + (AnsiColor.DATA, f"{status_code} "), + (None, "when downloading page "), + (AnsiColor.DATA, str(page_no)), + ] + ) + return + + usernames = [] + soup = BeautifulSoup(response, "html.parser") + members = soup.find_all("a", class_="members-categories-username") + for member in members: + href = member.get("href") + username = href[len("https://www.chess.com/member/") :] + usernames.append(username) + + return usernames + + async def download_profile(self, username: str): + """For each coach, download coach-specific data. + + This sends three parallel requests for: + * the coach's profile, + * the coach's recent activity, + * the coach's stats. + + @param username + The coach username corresponding to the downloaded files. + """ + used_network = await asyncio.gather( + self._download_profile_file( + url=f"https://www.chess.com/member/{username}", + username=username, + filename=self.path_coach_file(username, f"{username}.html"), + ), + self._download_profile_file( + url=f"https://www.chess.com/callback/member/activity/{username}?page=1", + username=username, + filename=self.path_coach_file(username, "activity.json"), + ), + self._download_profile_file( + url=f"https://www.chess.com/callback/member/stats/{username}", + username=username, + filename=self.path_coach_file(username, "stats.json"), + ), + ) + if any(used_network): + self.log( + [ + (AnsiColor.INFO, "[INFO]"), + (None, ": Downloaded data for coach "), + (AnsiColor.DATA, username), + ] + ) + await asyncio.sleep(SLEEP_SECS) + else: + self.log( + [ + (AnsiColor.INFO, "[INFO]"), + (None, ": Skipping download for coach "), + (AnsiColor.DATA, username), + ] + ) + + async def _download_profile_file(self, url: str, username: str, filename: str): + """Writes the contents of url into the specified file. + + @param url + The URL of the file to download. + @param username + The coach username corresponding to the downloaded file. + @param filename + The output file to write the downloaded content to. + @return: + True if we make a network request. False otherwise. 
+ """ + if os.path.isfile(filename): + return False + + response, _unused_status = await self.request(url) + if response is not None: + with open(filename, "w") as f: + f.write(response) + + return True + + def _load_stats_json(self, stats: dict) -> Export: + """Extract relevant fields from a `stats.json` file.""" + export: Export = {} + for stat in stats.get("stats", []): + if stat["key"] == "rapid": + export["fide_rapid"] = stat["stats"]["rating"] + return export + + async def export(self, username: str) -> Export: + """Transform coach-specific data into uniform format.""" + stat_export: Export = {} + try: + with open(self.path_coach_file(username, "stats.json"), "r") as f: + stat_export = self._load_stats_json(json.load(f)) + except FileNotFoundError: + pass + + export: Export = { + "fide_rapid": None, + } + export.update(stat_export) + return export diff --git a/app/scraper.py b/app/scraper.py index 0df552f..7d7727b 100644 --- a/app/scraper.py +++ b/app/scraper.py @@ -1,200 +1,119 @@ import aiohttp -import argparse -import asyncio +import enum +import json import os -import os.path -from bs4 import BeautifulSoup +from typing import List, Tuple, Union +from typing_extensions import TypedDict -# The root directory containing downloaded files for a coach. -DATA_COACH_DIR = "data/coach/{username}" - -# Where a part of coach-related data is stored. -DATA_COACH_FILE = "data/coach/{username}/{filename}" - -# Where a part of all discovered coach usernames is stored. -DATA_COACH_LIST = "data/pages/{page_no}.txt" - -# The "User-Agent" value set in every request to chess.com. -USER_AGENT = "BoardWise chesscom-scraper ({user_agent})" - -# How long to wait between a batch of network requests. -SLEEP_SECS = 3 +class Site(enum.Enum): + CHESSCOM = "chesscom" + LICHESS = "lichess" -def ANSI_COLOR(s: str): - """Print colored output to the console.""" - return f"\033[0;34m{s}\033[0m" # Blue +class AnsiColor(enum.Enum): + ERROR = "\033[0;31m" + INFO = "\033[0;34m" + DATA = "\033[0;36m" + RESET = "\033[0m" -async def chesscom_request(session: aiohttp.ClientSession, url: str): - """Convenience function for network requests to chess.com. - - @param session - The `aiohttp.ClientSession` context our requests are made from. - @param url - The URL to send a request to. - @return - The text response returned by the server at @url. - """ - async with session.get(url) as response: - if response.status == 200: - return await response.text() - print(f"Encountered {response.status} when retrieving {url}.") +class Export(TypedDict, total=False): + fide_rapid: Union[int, None] -async def _scrape_page_coach_usernames(session: aiohttp.ClientSession, page_no: int): - """Scan through chess.com/coaches/?page= for all coaches' usernames. +class BaseScraper: + def __init__(self, site: str, session: aiohttp.ClientSession): + """Initialize a new web scraper and exporter. - @param session - The `aiohttp.ClientSession` context our requests are made from. - @param page_no - The page consisting of at most 25 coaches (at the time of writing) - whose usernames are to be scraped. - @return - The list of scraped usernames on the specified coach listing page. - """ - url = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}" - response = await chesscom_request(session, url) - if response is None: - return + @param site: + The site we are making requests out to. + @param session: + The `aiohttp.ClientSession` context our requests are made from. 
+        """
+        self.site = site
+        self.session = session
 
-    usernames = []
+    async def download_usernames(self) -> List[str]:
+        """Collect all coach usernames from the specified site."""
+        raise NotImplementedError()
-    soup = BeautifulSoup(response, "html.parser")
-    members = soup.find_all("a", class_="members-categories-username")
-    for member in members:
-        href = member.get("href")
-        username = href[len("https://www.chess.com/member/") :]
-        usernames.append(username)
+    async def download_profile(self, username: str):
+        """For each coach, download coach-specific data."""
+        raise NotImplementedError()
-    return usernames
+    async def export(self, username: str) -> Export:
+        """Transform coach-specific data into uniform format."""
+        raise NotImplementedError()
 
-async def _scrape_all_coach_usernames(
-    session: aiohttp.ClientSession, max_pages: int = 64
-):
-    """Scan through chess.com/coaches for all coaches' usernames.
+    async def request(self, url: str) -> Tuple[Union[str, None], int]:
+        """Make network requests using the internal session.
 
-    @param session
-        The `aiohttp.ClientSession` context our requests are made from.
-    @param max_pages
-        The number of pages we will at most iterate through. This number was
-        determined by going to chess.com/coaches?sortBy=alphabetical&page=1
-        and traversing to the last page.
-    @return
-        The complete list of scraped usernames across every coach listing page.
-    """
-    usernames = []
-    for page_no in range(1, max_pages + 1):
-        filepath = DATA_COACH_LIST.format(page_no=page_no)
-        try:
-            with open(filepath, "r") as f:
-                usernames.extend(f.readlines())
-                print(f"Skipping {ANSI_COLOR(filepath)}")
-        except FileNotFoundError:
-            page_usernames = await _scrape_page_coach_usernames(session, page_no)
-            if not page_usernames:
-                print(f"Could not write {ANSI_COLOR(filepath)}")
-                continue
-            with open(filepath, "w") as f:
-                for username in page_usernames:
-                    f.write(f"{username}\n")
-            usernames.extend(page_usernames)
-            print(f"Downloaded {ANSI_COLOR(filepath)}")
-            await asyncio.sleep(SLEEP_SECS)
+        @param url
+            The URL to make a GET request to.
+        @return
+            Tuple containing the response body (if the request was successful)
+            and status code.
+        """
+        async with self.session.get(url) as response:
+            if response.status == 200:
+                return await response.text(), 200
+            return None, response.status
 
-    return usernames
+    async def scrape(self):
+        """Main entrypoint for scraping and exporting downloaded content.
+        A `Scraper` is structured to operate in the following stages:
 
-async def _download_coach_file(
-    session: aiohttp.ClientSession, url: str, username: str, filename: str
-):
-    """Writes the contents of @url into `DATA_COACH_FILE`.
+        1. Collect all coach usernames from the specified site.
+        2. For each coach, download coach-specific data.
+        3. Transform this data and export into uniform format.
+        """
+        os.makedirs(self.path_coaches_dir(), exist_ok=True)
+        os.makedirs(self.path_pages_dir(), exist_ok=True)
+        usernames = await self.download_usernames()
+        for username in usernames:
+            os.makedirs(self.path_coach_dir(username), exist_ok=True)
+            await self.download_profile(username)
 
-    @param session
-        The `aiohttp.ClientSession` context our requests are made from.
-    @param url
-        The URL of the file to download.
-    @param username
-        The coach username corresponding to the downloaded file.
-    @param filename
-        The output file to write the downloaded content to.
-    @return:
-        True if we make a network request. False otherwise.
-    """
-    filepath = DATA_COACH_FILE.format(username=username, filename=filename)
-    if os.path.isfile(filepath):
-        return False
+        export = await self.export(username)
+        with open(self.path_coach_file(username, "export.json"), "w") as f:
+            json.dump(export, f)
+        self.log(
+            [
+                (AnsiColor.INFO, "[INFO]"),
+                (None, ": Finished exporting "),
+                (AnsiColor.DATA, username),
+            ]
+        )
 
-    response = await chesscom_request(session, url)
-    if response is not None:
-        with open(filepath, "w") as f:
-            f.write(response)
-    return True
+    def path_coaches_dir(self):
+        """The root directory for all coach-related downloads."""
+        return os.path.join("data", self.site, "coaches")
 
+    def path_coach_dir(self, username: str):
+        """The root directory for a specific coach's downloads."""
+        return os.path.join(self.path_coaches_dir(), username)
 
-async def _download_coach_data(session: aiohttp.ClientSession, username: str):
-    """Download coach-related data to the `DATA_COACH_DIR` directory.
+    def path_coach_file(self, username: str, filename: str):
+        """Path to a coach-specific file download."""
+        return os.path.join(self.path_coach_dir(username), filename)
 
-    This sends three parallel requests for:
-    * the coach's profile,
-    * the coach's recent activity,
-    * the coach's stats.
+    def path_pages_dir(self):
+        """The root directory for all username listing files."""
+        return os.path.join("data", self.site, "pages")
 
-    @param session
-        The `aiohttp.ClientSession` context our requests are made from.
-    @param username
-        The coach username corresponding to the downloaded files.
-    """
-    used_network = await asyncio.gather(
-        _download_coach_file(
-            session,
-            url=f"https://www.chess.com/member/{username}",
-            username=username,
-            filename=f"{username}.html",
-        ),
-        _download_coach_file(
-            session,
-            url=f"https://www.chess.com/callback/member/activity/{username}?page=1",
-            username=username,
-            filename="activity.json",
-        ),
-        _download_coach_file(
-            session,
-            url=f"https://www.chess.com/callback/member/stats/{username}",
-            username=username,
-            filename="stats.json",
-        ),
-    )
-    if any(used_network):
-        print(f"Downloaded {ANSI_COLOR(username)}")
-        await asyncio.sleep(SLEEP_SECS)
-    else:
-        print(f"Skipping {ANSI_COLOR(username)}")
+    def path_page_file(self, page_no: int):
+        """Path to the file of usernames scraped from a single listing page."""
+        return os.path.join(self.path_pages_dir(), f"{page_no}.txt")
 
+    def log(self, msgs: List[Tuple[Union[AnsiColor, None], str]]):
+        transformed = []
+        for k, v in msgs:
+            if k is None:
+                transformed.append(v)
+            else:
+                transformed.append(f"{k.value}{v}{AnsiColor.RESET.value}")
 
-async def _scrape():
-    parser = argparse.ArgumentParser(
-        prog="chesscom-scraper",
-        description="HTML scraping of chess.com coaches.",
-    )
-    parser.add_argument("-u", "--user-agent", required=True)
-    args = parser.parse_args()
-
-    os.makedirs("data/pages", exist_ok=True)
-    os.makedirs("data/coach", exist_ok=True)
-
-    async with aiohttp.ClientSession(
-        headers={"User-Agent": USER_AGENT.format(user_agent=args.user_agent)}
-    ) as session:
-        # Retrieve all coaches on the platform.
-        usernames = await _scrape_all_coach_usernames(session)
-        # For each coach, download relevant data.
- for username in [u.strip() for u in usernames]: - os.makedirs(DATA_COACH_DIR.format(username=username), exist_ok=True) - await _download_coach_data(session, username) - - -def run(): - asyncio.run(_scrape()) + print("".join(transformed)) diff --git a/poetry.lock b/poetry.lock index 90111c3..c5f6cee 100644 --- a/poetry.lock +++ b/poetry.lock @@ -345,6 +345,17 @@ files = [ {file = "types_html5lib-1.1.11.15-py3-none-any.whl", hash = "sha256:16fe936d99b9f7fc210e2e21a2aed1b6bbbc554ad8242a6ef75f6f2bddb27e58"}, ] +[[package]] +name = "typing-extensions" +version = "4.8.0" +description = "Backported and Experimental Type Hints for Python 3.8+" +optional = false +python-versions = ">=3.8" +files = [ + {file = "typing_extensions-4.8.0-py3-none-any.whl", hash = "sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0"}, + {file = "typing_extensions-4.8.0.tar.gz", hash = "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef"}, +] + [[package]] name = "yarl" version = "1.9.3" @@ -451,4 +462,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "987c0a45c65fc281154469d795a5dc2828af5fa55226a1688466b71bf4327e3e" +content-hash = "04db01ae29bbc78abf48f0ae23d60db56da274aea1b281c7aeaca0e705162114" diff --git a/pyproject.toml b/pyproject.toml index 5e13dbf..1329906 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,10 +12,11 @@ aiohttp = "^3.8.6" [tool.poetry.group.dev.dependencies] types-beautifulsoup4 = "^4.12.0.7" +typing-extensions = "^4.8.0" [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" [tool.poetry.scripts] -app = "app.scraper:run" +app = "app.__main__:main"
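
For reference, the lichess scraper anticipated in the title is expected to slot in by subclassing `BaseScraper` and registering its `Site` value in `app/__main__.py`. Below is a minimal sketch of that shape; the module path `app/lichess.py` and everything in the class body are hypothetical — only `BaseScraper`, `Export`, and `Site` come from this patch.

```python
import aiohttp

from typing import List

from app.scraper import BaseScraper, Export, Site


class Scraper(BaseScraper):
    """Hypothetical lichess scraper built on the generalized BaseScraper."""

    def __init__(self, session: aiohttp.ClientSession):
        # Keying on Site.LICHESS routes downloads to the data/lichess/... tree.
        super().__init__(site=Site.LICHESS.value, session=session)

    async def download_usernames(self) -> List[str]:
        # Would walk the lichess coach listing pages and collect usernames,
        # mirroring the chess.com implementation.
        raise NotImplementedError()

    async def download_profile(self, username: str):
        # Would download per-coach pages into self.path_coach_file(...).
        raise NotImplementedError()

    async def export(self, username: str) -> Export:
        # Would map the downloaded data onto the shared Export shape.
        return {"fide_rapid": None}
```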