diff --git a/app/__main__.py b/app/__main__.py index f6f9a19..ec45335 100644 --- a/app/__main__.py +++ b/app/__main__.py @@ -1,16 +1,23 @@ import aiohttp import argparse import asyncio +import json -from app.chesscom import Scraper as ChesscomScraper -from app.lichess import Scraper as LichessScraper -from app.scraper import Site +from app.chesscom import ( + Exporter as ChesscomExporter, + Scraper as ChesscomScraper, +) +from app.lichess import ( + Exporter as LichessExporter, + Scraper as LichessScraper, +) +from app.repo import Site async def run(): parser = argparse.ArgumentParser( prog="coach-scraper", - description="HTML scraping of chess.com coaches.", + description="Scraping/exporting of chess coaches.", ) parser.add_argument("-u", "--user-agent", required=True) parser.add_argument( @@ -29,10 +36,19 @@ async def run(): ) as session: if args.site == Site.CHESSCOM.value: scraper = ChesscomScraper(session) + exporter_cls = ChesscomExporter elif args.site == Site.LICHESS.value: scraper = LichessScraper(session) + exporter_cls = LichessExporter - await scraper.scrape() + dump = {} + + usernames = await scraper.scrape() + for username in usernames: + dump[username] = exporter_cls(username).export() + + with open(scraper.path_site_file("export.json"), "w") as f: + json.dump(dump, f, indent=2) def main(): diff --git a/app/chesscom.py b/app/chesscom.py index 9b24b95..2cec3d3 100644 --- a/app/chesscom.py +++ b/app/chesscom.py @@ -4,9 +4,11 @@ import json import os import os.path -from app.scraper import AnsiColor, BaseScraper, Export, Site +from app.repo import AnsiColor, Site +from app.exporter import BaseExporter +from app.scraper import BaseScraper from bs4 import BeautifulSoup -from typing import List +from typing import List, Union # The number of coach listing pages we will at most iterate through. This number @@ -169,22 +171,19 @@ class Scraper(BaseScraper): return True - def _load_stats_json(self, stats: dict) -> Export: - """Extract relevant fields from a `stats.json` file.""" - export: Export = {} - for stat in stats.get("stats", []): - if stat["key"] == "rapid": - export["fide_rapid"] = stat["stats"]["rating"] - return export - async def export(self, username: str) -> Export: - """Transform coach-specific data into uniform format.""" - export: Export = {} +class Exporter(BaseExporter): + def __init__(self, username: str): + super().__init__(site=Site.CHESSCOM.value, username=username) + self.stats_json = {} try: with open(self.path_coach_file(username, "stats.json"), "r") as f: - export.update(self._load_stats_json(json.load(f))) + for s in json.load(f).get("stats", []): + if "key" in s and "stats" in s: + self.stats_json[s["key"]] = s["stats"] except FileNotFoundError: pass - return export + def export_fide_rapid(self) -> Union[int, None]: + return self.stats_json.get("rapid", {}).get("rating") diff --git a/app/exporter.py b/app/exporter.py new file mode 100644 index 0000000..0dbbb1a --- /dev/null +++ b/app/exporter.py @@ -0,0 +1,34 @@ +from app.repo import AnsiColor, Repo +from typing import Union +from typing_extensions import TypedDict + + +class Export(TypedDict, total=False): + fide_rapid: int + + +class BaseExporter(Repo): + def __init__(self, site: str, username: str): + super().__init__(site) + self.username = username + + def export_fide_rapid(self) -> Union[int, None]: + raise NotImplementedError() + + def export(self) -> Export: + """Transform coach-specific data into uniform format.""" + export: Export = {} + + fide_rapid = self.export_fide_rapid() + if fide_rapid: + export["fide_rapid"] = fide_rapid + + self.log( + [ + (AnsiColor.INFO, "[INFO]"), + (None, ": Exported "), + (AnsiColor.DATA, self.username), + ] + ) + + return export diff --git a/app/lichess.py b/app/lichess.py index 39eebf8..a7d1ec5 100644 --- a/app/lichess.py +++ b/app/lichess.py @@ -3,7 +3,9 @@ import asyncio import os import os.path -from app.scraper import AnsiColor, BaseScraper, Export, Site +from app.repo import AnsiColor, Site +from app.scraper import BaseScraper +from app.exporter import BaseExporter from bs4 import BeautifulSoup from typing import List @@ -111,7 +113,14 @@ class Scraper(BaseScraper): """ filepath = self.path_coach_file(username, f"{username}.html") if os.path.isfile(filepath): - return False + self.log( + [ + (AnsiColor.INFO, "[INFO]"), + (None, ": Skipping download for coach "), + (AnsiColor.DATA, username), + ] + ) + return response, _unused_status = await self.request( url=f"https://lichess.org/coach/{username}" @@ -120,9 +129,18 @@ class Scraper(BaseScraper): with open(filepath, "w") as f: f.write(response) - return True + self.log( + [ + (AnsiColor.INFO, "[INFO]"), + (None, ": Downloaded data for coach "), + (AnsiColor.DATA, username), + ] + ) - async def export(self, username: str) -> Export: - """Transform coach-specific data into uniform format.""" - export: Export = {} - return export + +class Exporter(BaseExporter): + def __init__(self, username: str): + super().__init__(site=Site.LICHESS.value, username=username) + + def export_fide_rapid(self): + return None diff --git a/app/repo.py b/app/repo.py new file mode 100644 index 0000000..abb01ef --- /dev/null +++ b/app/repo.py @@ -0,0 +1,61 @@ +import enum +import os + +from typing import List, Tuple, Union + + +class AnsiColor(enum.Enum): + ERROR = "\033[0;31m" + INFO = "\033[0;34m" + DATA = "\033[0;36m" + RESET = "\033[0m" + + +class Site(enum.Enum): + CHESSCOM = "chesscom" + LICHESS = "lichess" + + +class Repo: + """Shared filesystem-related functionality.""" + + def __init__(self, site: str): + self.site = site + + def path_site_dir(self): + """The root directory for all site-related files.""" + return os.path.join("data", self.site) + + def path_site_file(self, filename: str): + """Path to a top-level site-related file.""" + return os.path.join(self.path_site_dir(), filename) + + def path_coaches_dir(self): + """The root directory for all coach-related downloads.""" + return os.path.join(self.path_site_dir(), "coaches") + + def path_coach_dir(self, username: str): + """The root directory for a specific coach's downloads.""" + return os.path.join(self.path_coaches_dir(), username) + + def path_coach_file(self, username: str, filename: str): + """Path to a coach-specific file download.""" + return os.path.join(self.path_coach_dir(username), filename) + + def path_pages_dir(self): + """The root directory for all username listing files.""" + return os.path.join(self.path_site_dir(), "pages") + + def path_page_file(self, page_no: int): + """The root directory for usernames scraped from a single page.""" + return os.path.join(self.path_pages_dir(), f"{page_no}.txt") + + def log(self, msgs: List[Tuple[Union[AnsiColor, None], str]]): + transformed = [] + for k, v in msgs: + if k is None: + transformed.append(v) + else: + transformed.append(f"{k.value}{v}{AnsiColor.RESET.value}") + + print("".join(transformed)) diff --git a/app/scraper.py b/app/scraper.py index 7d7727b..c19af41 100644 --- a/app/scraper.py +++ b/app/scraper.py @@ -1,38 +1,20 @@ import aiohttp -import enum -import json import os +from app.repo import Repo from typing import List, Tuple, Union -from typing_extensions import TypedDict -class Site(enum.Enum): - CHESSCOM = "chesscom" - LICHESS = "lichess" - - -class AnsiColor(enum.Enum): - ERROR = "\033[0;31m" - INFO = "\033[0;34m" - DATA = "\033[0;36m" - RESET = "\033[0m" - - -class Export(TypedDict, total=False): - fide_rapid: Union[int, None] - - -class BaseScraper: +class BaseScraper(Repo): def __init__(self, site: str, session: aiohttp.ClientSession): - """Initialize a new web scraper and exporter. + """Initialize a new web scraper. @param site: The site we are making requests out to. @param session: The `aiohttp.ClientSession` context our requests are made from. """ - self.site = site + super().__init__(site) self.session = session async def download_usernames(self) -> List[str]: @@ -43,10 +25,6 @@ class BaseScraper: """For each coach, download coach-specific data.""" raise NotImplementedError() - async def export(self, username: str) -> Export: - """Transform coach-specific data into uniform format.""" - raise NotImplementedError() - async def request(self, url: str) -> Tuple[Union[str, None], int]: """Make network requests using the internal session. @@ -61,7 +39,7 @@ class BaseScraper: return await response.text(), 200 return None, response.status - async def scrape(self): + async def scrape(self) -> List[str]: """Main entrypoint for scraping and exporting downloaded content. A `Scraper` is structured to operates in the following stages: @@ -77,43 +55,4 @@ class BaseScraper: os.makedirs(self.path_coach_dir(username), exist_ok=True) await self.download_profile(username) - export = await self.export(username) - with open(self.path_coach_file(username, "export.json"), "w") as f: - json.dump(export, f) - self.log( - [ - (AnsiColor.INFO, "[INFO]"), - (None, ": Finished exporting "), - (AnsiColor.DATA, username), - ] - ) - - def path_coaches_dir(self): - """The root directory for all coach-related downloads.""" - return os.path.join("data", self.site, "coaches") - - def path_coach_dir(self, username: str): - """The root directory for a specific coach's downloads.""" - return os.path.join(self.path_coaches_dir(), username) - - def path_coach_file(self, username: str, filename: str): - """Path to a coach-specific file download.""" - return os.path.join(self.path_coach_dir(username), filename) - - def path_pages_dir(self): - """The root directory for all username listing files.""" - return os.path.join("data", self.site, "pages") - - def path_page_file(self, page_no: int): - """The root directory for usernames scraped from a single page.""" - return os.path.join(self.path_pages_dir(), f"{page_no}.txt") - - def log(self, msgs: List[Tuple[Union[AnsiColor, None], str]]): - transformed = [] - for k, v in msgs: - if k is None: - transformed.append(v) - else: - transformed.append(f"{k.value}{v}{AnsiColor.RESET.value}") - - print("".join(transformed)) + return usernames