import asyncio import enum import os.path from typing import Any, List, Tuple, Union import aiohttp from app.database import Row, upsert_row class Site(enum.Enum): CHESSCOM = "chesscom" LICHESS = "lichess" class Fetcher: """Download and cache files from the specified site. Each implementation of this class is responsible for rate-limiting requests. """ def __init__(self, site: Site, session: aiohttp.ClientSession): self.site = site self.session = session self.has_made_request = False os.makedirs(self.path_coaches_dir(), exist_ok=True) os.makedirs(self.path_pages_dir(), exist_ok=True) def path_site_dir(self): return os.path.join("data", self.site.value) def path_site_file(self, filename: str): return os.path.join(self.path_site_dir(), filename) def path_coaches_dir(self): return os.path.join(self.path_site_dir(), "coaches") def path_coach_dir(self, username: str): return os.path.join(self.path_coaches_dir(), username) def path_coach_file(self, username: str, filename: str): return os.path.join(self.path_coach_dir(username), filename) def path_pages_dir(self): return os.path.join(self.path_site_dir(), "pages") def path_page_file(self, page_no: int): return os.path.join(self.path_pages_dir(), f"{page_no}.txt") async def fetch(self, url: str) -> Tuple[Union[str, None], int]: """Make network requests using the internal session. @param url The URL to make a GET request to. @return Tuple containing the response body (if the request was successful) and status code. """ self.has_made_request = True async with self.session.get(url) as response: if response.status == 200: return await response.text(), 200 return None, response.status async def scrape_usernames(self, page_no: int) -> Union[List[str], None]: """Source the specified site for all coach usernames. All pages should be downloaded at `self.path_page_file()`. Any cached file should be a plain `.txt` file containing one username per-line. @param page_no: How many times this function was invoked (1-indexed). Useful to paginate responses back out to the `Pipeline` this `Downloader` is embedded in. @return: A list of usernames. Should return an empty list if no more usernames are found. Can return `None` to indicate the specified page should be skipped. """ raise NotImplementedError() async def _download_user_files(self, username: str) -> None: os.makedirs(self.path_coach_dir(username), exist_ok=True) await self.download_user_files(username) async def download_user_files(self, username: str) -> None: """Source the specified site for all user-specific files. What files are downloaded depends on the `Downloader` implementation. All files should be downloaded at `self.path_coach_file()`. """ raise NotImplementedError() def _insert(row: Row, key: str, value: Any): if value is not None: row[key] = value class Extractor: def __init__(self, fetcher: Fetcher, username: str): self.fetcher = fetcher self.username = username def get_name(self) -> Union[str, None]: raise NotImplementedError() def get_image_url(self) -> Union[str, None]: raise NotImplementedError() def get_rapid(self) -> Union[int, None]: raise NotImplementedError() def get_blitz(self) -> Union[int, None]: raise NotImplementedError() def get_bullet(self) -> Union[int, None]: raise NotImplementedError() def extract(self) -> Row: """Extract a table row from the coach-specific downloads.""" row: Row = {} _insert(row, "site", self.fetcher.site) _insert(row, "username", self.username) _insert(row, "name", self.get_name()) _insert(row, "image_url", self.get_image_url()) _insert(row, "rapid", self.get_rapid()) _insert(row, "blitz", self.get_blitz()) _insert(row, "bullet", self.get_bullet()) return row async def task_worker(name, queue): while True: conn, extractor = await queue.get() upsert_row(conn, extractor.extract()) queue.task_done() class Pipeline: """Site specific download and extraction pipeline. Performs downloads serially but processes data extraction from downloaded files concurrently. """ def __init__(self, worker_count): self.worker_count = worker_count def get_fetcher(self, session: aiohttp.ClientSession) -> Fetcher: raise NotImplementedError() def get_extractor(self, fetcher: Fetcher, username: str) -> Extractor: raise NotImplementedError() async def process(self, conn, session: aiohttp.ClientSession): fetcher = self.get_fetcher(session) queue = asyncio.Queue() # Create a batch of workers to process the jobs put into the queue. workers = [] for i in range(self.worker_count): worker = asyncio.create_task(task_worker(f"worker-{i}", queue)) workers.append(worker) # Begin downloading all coach usernames and files. The workers will # run concurrently to extract all the relvant information and write page_no = 1 usernames = [None] while len(usernames): usernames = await fetcher.scrape_usernames(page_no) page_no += 1 if usernames is None: usernames = [None] continue for username in usernames: await fetcher._download_user_files(username) extractor = self.get_extractor(fetcher, username) queue.put_nowait((conn, extractor)) # Wait until the queue is fully processed. await queue.join() # We can now turn down the workers. for worker in workers: worker.cancel() await asyncio.gather(*workers, return_exceptions=True)