import asyncio
import logging
import os.path
from typing import Any, List, Tuple

import aiohttp
from lingua import LanguageDetector

from coach_scraper.database import Row, RowKey, upsert_row
from coach_scraper.locale import Locale
from coach_scraper.types import Site, Title


class Fetcher:
    """Download and cache files from the specified site.

    Each implementation of this class is responsible for rate-limiting requests.
    """

    def __init__(self, site: Site, session: aiohttp.ClientSession):
        self.site = site
        self.session = session
        self.has_made_request = False

        os.makedirs(self.path_coaches_dir(), exist_ok=True)
        os.makedirs(self.path_pages_dir(), exist_ok=True)

    def path_site_dir(self) -> str:
        return os.path.join("data", self.site.value)

    def path_site_file(self, filename: str) -> str:
        return os.path.join(self.path_site_dir(), filename)

    def path_coaches_dir(self) -> str:
        return os.path.join(self.path_site_dir(), "coaches")

    def path_coach_dir(self, username: str) -> str:
        return os.path.join(self.path_coaches_dir(), username)

    def path_coach_file(self, username: str, filename: str) -> str:
        return os.path.join(self.path_coach_dir(username), filename)

    def path_pages_dir(self) -> str:
        return os.path.join(self.path_site_dir(), "pages")

    def path_page_file(self, page_no: int) -> str:
        return os.path.join(self.path_pages_dir(), f"{page_no}.txt")

    async def fetch(self, url: str) -> Tuple[str | None, int]:
        """Make network requests using the internal session.

        @param url
            The URL to make a GET request to.
        @return
            Tuple containing the response body (if the request was successful)
            and status code.
        """
        self.has_made_request = True
        async with self.session.get(url) as response:
            if response.status == 200:
                return await response.text(), 200
            logging.error(f"Could not fetch URL {url}. Status code: {response.status}")
            return None, response.status

    async def scrape_usernames(self, page_no: int) -> List[str] | None:
        """Scrape the specified site for all coach usernames.

        All pages should be downloaded at `self.path_page_file()`. Any cached
        file should be a plain `.txt` file containing one username per line.

        @param page_no:
            How many times this function was invoked (1-indexed). Useful to
            paginate responses back out to the `Pipeline` this `Fetcher`
            is embedded in.
        @return:
            A list of usernames. Should return an empty list if no more
            usernames are found. Can return `None` to indicate the specified
            page should be skipped.
        """
        raise NotImplementedError()

    async def _download_user_files(self, username: str) -> None:
        os.makedirs(self.path_coach_dir(username), exist_ok=True)
        await self.download_user_files(username)

    async def download_user_files(self, username: str) -> None:
        """Download all user-specific files from the specified site.

        What files are downloaded depends on the `Fetcher` implementation.
        All files should be downloaded at `self.path_coach_file()`.
        """
        raise NotImplementedError()
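

# --- Illustrative sketch (not part of the original module) ------------------
# A minimal site-specific `Fetcher`, showing one way to satisfy the two
# contracts documented above: rate-limiting every network request and caching
# downloads under `path_page_file()` / `path_coach_file()`. The URLs and the
# one-username-per-line response format are hypothetical stand-ins, not any
# real site's API.
class ExampleFetcher(Fetcher):
    DELAY_SECONDS = 2

    async def _throttled_fetch(self, url: str) -> str | None:
        # Crude rate limit: pause before every request after the first.
        if self.has_made_request:
            await asyncio.sleep(self.DELAY_SECONDS)
        text, _status = await self.fetch(url)
        return text

    async def scrape_usernames(self, page_no: int) -> List[str] | None:
        # Serve from the page cache when available.
        path = self.path_page_file(page_no)
        if os.path.isfile(path):
            with open(path) as f:
                return [line.strip() for line in f if line.strip()]
        # Hypothetical URL; assumes the response lists one username per line.
        text = await self._throttled_fetch(f"https://example.com/coaches?page={page_no}")
        if text is None:
            return None  # Have the caller skip this page.
        usernames = [line.strip() for line in text.splitlines() if line.strip()]
        with open(path, "w") as f:
            f.write("\n".join(usernames))
        return usernames

    async def download_user_files(self, username: str) -> None:
        # `_download_user_files` has already created the coach directory.
        path = self.path_coach_file(username, "profile.html")
        if os.path.isfile(path):
            return
        text = await self._throttled_fetch(f"https://example.com/coaches/{username}")
        if text is not None:
            with open(path, "w") as f:
                f.write(text)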


def _insert(row: Row, key: RowKey, value: Any):
    # Only write the column when extraction produced an actual value.
    if value is not None:
        row[key] = value


class Extractor:
    """Parse a coach's downloaded files into a database row."""

    def __init__(self, fetcher: Fetcher, detector: LanguageDetector, username: str):
        self.fetcher = fetcher
        self.detector = detector
        self.username = username

    def get_name(self) -> str | None:
        raise NotImplementedError()

    def get_image_url(self) -> str | None:
        raise NotImplementedError()

    def get_title(self) -> Title | None:
        raise NotImplementedError()

    def get_languages(self) -> List[Locale] | None:
        raise NotImplementedError()

    def get_rapid(self) -> int | None:
        raise NotImplementedError()

    def get_blitz(self) -> int | None:
        raise NotImplementedError()

    def get_bullet(self) -> int | None:
        raise NotImplementedError()

    def extract(self) -> Row:
        """Extract a table row from the coach-specific downloads."""
        row: Row = {}

        _insert(row, "site", self.fetcher.site)
        _insert(row, "username", self.username)

        _insert(row, "name", self.get_name())
        _insert(row, "image_url", self.get_image_url())
        _insert(row, "title", self.get_title())
        _insert(row, "languages", self.get_languages())
        _insert(row, "rapid", self.get_rapid())
        _insert(row, "blitz", self.get_blitz())
        _insert(row, "bullet", self.get_bullet())

        return row
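

# --- Illustrative sketch (not part of the original module) ------------------
# A matching `Extractor` for the hypothetical site above. Each getter may
# return `None`, in which case `_insert` simply omits that column from the
# row. `locale_from_language` is a hypothetical helper; only
# `detector.detect_language_of` is real lingua API.
class ExampleExtractor(Extractor):
    def __init__(self, fetcher: Fetcher, detector: LanguageDetector, username: str):
        super().__init__(fetcher, detector, username)
        path = fetcher.path_coach_file(username, "profile.html")
        self.profile = open(path).read() if os.path.isfile(path) else None

    def get_languages(self) -> List[Locale] | None:
        if self.profile is None:
            return None
        # A real implementation would first extract the bio text from the
        # cached HTML rather than detecting on the raw page.
        language = self.detector.detect_language_of(self.profile)
        if language is None:
            return None
        # Mapping a lingua `Language` onto this project's `Locale` is
        # site-specific; a hypothetical helper stands in here.
        return [locale_from_language(language)]

    # get_name, get_image_url, get_title, get_rapid, get_blitz, and
    # get_bullet would parse the same cached files in the same fashion.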


async def task_worker(name, queue):
    """Pull `(conn, extractor)` jobs off the queue and persist the rows."""
    while True:
        conn, extractor = await queue.get()
        try:
            upsert_row(conn, extractor.extract())
        except Exception:
            logging.exception(f"{name}: failed to extract row.")
        finally:
            # Always mark the job done, even on failure, so `queue.join()`
            # in `Pipeline.process` cannot hang.
            queue.task_done()


class Pipeline:
    """Site-specific download and extraction pipeline.

    Performs downloads serially but processes data extraction from downloaded
    files concurrently.
    """

    def __init__(self, worker_count: int):
        self.worker_count = worker_count

    def get_fetcher(self, session: aiohttp.ClientSession) -> Fetcher:
        raise NotImplementedError()

    def get_extractor(
        self, fetcher: Fetcher, detector: LanguageDetector, username: str
    ) -> Extractor:
        raise NotImplementedError()

    async def process(
        self, conn, detector: LanguageDetector, session: aiohttp.ClientSession
    ):
        fetcher = self.get_fetcher(session)

        queue: asyncio.Queue = asyncio.Queue()

        # Create a batch of workers to process the jobs put into the queue.
        workers = []
        for i in range(self.worker_count):
            worker = asyncio.create_task(task_worker(f"worker-{i}", queue))
            workers.append(worker)

        # Begin downloading all coach usernames and files. The workers will
        # run concurrently to extract all the relevant information and write
        # it to the database.
        page_no = 1
        # Sentinel value: any non-empty list gets the loop started.
        usernames: List[str] | None = [""]
        while usernames is None or len(usernames):
            usernames = await fetcher.scrape_usernames(page_no)
            page_no += 1
            for username in usernames or []:
                await fetcher._download_user_files(username)
                extractor = self.get_extractor(fetcher, detector, username)
                queue.put_nowait((conn, extractor))

        # Wait until the queue is fully processed.
        await queue.join()

        # We can now shut down the workers.
        for worker in workers:
            worker.cancel()
        await asyncio.gather(*workers, return_exceptions=True)
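

# --- Illustrative usage sketch (not part of the original module) ------------
# Wiring everything together. `ExamplePipeline` builds on the hypothetical
# classes sketched above; `LanguageDetectorBuilder` is real lingua API, but
# how `conn` is opened depends on `coach_scraper.database` and is left as an
# assumption.
class ExamplePipeline(Pipeline):
    def get_fetcher(self, session: aiohttp.ClientSession) -> Fetcher:
        # `Site` members are project-specific; substitute the right one.
        return ExampleFetcher(site=list(Site)[0], session=session)

    def get_extractor(
        self, fetcher: Fetcher, detector: LanguageDetector, username: str
    ) -> Extractor:
        return ExampleExtractor(fetcher, detector, username)


async def main():
    from lingua import LanguageDetectorBuilder

    detector = LanguageDetectorBuilder.from_all_languages().build()
    conn = ...  # Open a database connection however `upsert_row` expects.
    async with aiohttp.ClientSession() as session:
        await ExamplePipeline(worker_count=10).process(conn, detector, session)


# To run: asyncio.run(main())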