coach-scraper/coach_scraper/pipeline.py

import asyncio
import logging
import os.path
from typing import Any, List, Tuple

import aiohttp
from lingua import LanguageDetector

from coach_scraper.database import Row, RowKey, upsert_row
from coach_scraper.locale import Locale
from coach_scraper.types import Site, Title


class Fetcher:
    """Download and cache files from the specified site.

    Each implementation of this class is responsible for rate-limiting requests.
    """

    def __init__(self, site: Site, session: aiohttp.ClientSession):
        self.site = site
        self.session = session
        self.has_made_request = False

        os.makedirs(self.path_coaches_dir(), exist_ok=True)
        os.makedirs(self.path_pages_dir(), exist_ok=True)

    def path_site_dir(self):
        return os.path.join("data", self.site.value)

    def path_site_file(self, filename: str):
        return os.path.join(self.path_site_dir(), filename)

    def path_coaches_dir(self):
        return os.path.join(self.path_site_dir(), "coaches")

    def path_coach_dir(self, username: str):
        return os.path.join(self.path_coaches_dir(), username)

    def path_coach_file(self, username: str, filename: str):
        return os.path.join(self.path_coach_dir(username), filename)

    def path_pages_dir(self):
        return os.path.join(self.path_site_dir(), "pages")

    def path_page_file(self, page_no: int):
        return os.path.join(self.path_pages_dir(), f"{page_no}.txt")

    async def fetch(self, url: str) -> Tuple[str | None, int]:
        """Make network requests using the internal session.

        @param url
            The URL to make a GET request to.
        @return
            Tuple containing the response body (if the request was successful)
            and status code.
        """
        self.has_made_request = True
        async with self.session.get(url) as response:
            if response.status == 200:
                return await response.text(), 200
        logging.error(f"Could not fetch URL {url}. Status code: {response.status}")
        return None, response.status

    async def scrape_usernames(self, page_no: int) -> List[str] | None:
        """Source the specified site for all coach usernames.

        All pages should be downloaded at `self.path_page_file()`. Any cached
        file should be a plain `.txt` file containing one username per-line.

        @param page_no:
            How many times this function was invoked (1-indexed). Useful to
            paginate responses back out to the `Pipeline` this `Downloader`
            is embedded in.
        @return:
            A list of usernames. Should return an empty list if no more
            usernames are found. Can return `None` to indicate the specified
            page should be skipped.
        """
        raise NotImplementedError()

    async def _download_user_files(self, username: str) -> None:
        os.makedirs(self.path_coach_dir(username), exist_ok=True)
        await self.download_user_files(username)

    async def download_user_files(self, username: str) -> None:
        """Source the specified site for all user-specific files.

        What files are downloaded depends on the `Downloader` implementation.
        All files should be downloaded at `self.path_coach_file()`.
        """
        raise NotImplementedError()


def _insert(row: Row, key: RowKey, value: Any):
    if value is not None:
        row[key] = value


class Extractor:
    def __init__(self, fetcher: Fetcher, detector: LanguageDetector, username: str):
        self.fetcher = fetcher
        self.detector = detector
        self.username = username

    def get_name(self) -> str | None:
        raise NotImplementedError()

    def get_image_url(self) -> str | None:
        raise NotImplementedError()

    def get_title(self) -> Title | None:
        raise NotImplementedError()

    def get_languages(self) -> List[Locale] | None:
        raise NotImplementedError()

    def get_rapid(self) -> int | None:
        raise NotImplementedError()

    def get_blitz(self) -> int | None:
        raise NotImplementedError()

    def get_bullet(self) -> int | None:
        raise NotImplementedError()

    def extract(self) -> Row:
        """Extract a table row from the coach-specific downloads."""
        row: Row = {}

        _insert(row, "site", self.fetcher.site)
        _insert(row, "username", self.username)

        _insert(row, "name", self.get_name())
        _insert(row, "image_url", self.get_image_url())
        _insert(row, "title", self.get_title())
        _insert(row, "languages", self.get_languages())
        _insert(row, "rapid", self.get_rapid())
        _insert(row, "blitz", self.get_blitz())
        _insert(row, "bullet", self.get_bullet())

        return row


async def task_worker(name, queue):
    while True:
        conn, extractor = await queue.get()
        upsert_row(conn, extractor.extract())
        queue.task_done()


class Pipeline:
    """Site specific download and extraction pipeline.

    Performs downloads serially but processes data extraction from downloaded
    files concurrently.
    """

    def __init__(self, worker_count):
        self.worker_count = worker_count

    def get_fetcher(self, session: aiohttp.ClientSession) -> Fetcher:
        raise NotImplementedError()

    def get_extractor(
        self, fetcher: Fetcher, detector: LanguageDetector, username: str
    ) -> Extractor:
        raise NotImplementedError()

    async def process(
        self, conn, detector: LanguageDetector, session: aiohttp.ClientSession
    ):
        fetcher = self.get_fetcher(session)

        queue: asyncio.Queue = asyncio.Queue()

        # Create a batch of workers to process the jobs put into the queue.
        workers = []
        for i in range(self.worker_count):
            worker = asyncio.create_task(task_worker(f"worker-{i}", queue))
            workers.append(worker)

        # Begin downloading all coach usernames and files. The workers will
        # run concurrently to extract all the relvant information and write
        page_no = 1
        usernames: List[str] | None = [""]
        while usernames is None or len(usernames):
            usernames = await fetcher.scrape_usernames(page_no)
            page_no += 1
            for username in usernames or []:
                await fetcher._download_user_files(username)
                extractor = self.get_extractor(fetcher, detector, username)
                queue.put_nowait((conn, extractor))

        # Wait until the queue is fully processed.
        await queue.join()

        # We can now turn down the workers.
        for worker in workers:
            worker.cancel()
        await asyncio.gather(*workers, return_exceptions=True)