import asyncio
import enum
import os
import os.path
from typing import Any, List, Tuple, Union

import aiohttp

from app.database import Row, upsert_row


class Site(enum.Enum):
    CHESSCOM = "chesscom"
    LICHESS = "lichess"


class Fetcher:
    """Download and cache files from the specified site.

    Each implementation of this class is responsible for rate-limiting requests.
    """

    def __init__(self, site: Site, session: aiohttp.ClientSession):
        self.site = site
        self.session = session
        self.has_made_request = False

        os.makedirs(self.path_coaches_dir(), exist_ok=True)
        os.makedirs(self.path_pages_dir(), exist_ok=True)

    def path_site_dir(self):
        return os.path.join("data", self.site.value)

    def path_site_file(self, filename: str):
        return os.path.join(self.path_site_dir(), filename)

    def path_coaches_dir(self):
        return os.path.join(self.path_site_dir(), "coaches")

    def path_coach_dir(self, username: str):
        return os.path.join(self.path_coaches_dir(), username)

    def path_coach_file(self, username: str, filename: str):
        return os.path.join(self.path_coach_dir(username), filename)

    def path_pages_dir(self):
        return os.path.join(self.path_site_dir(), "pages")

    def path_page_file(self, page_no: int):
        return os.path.join(self.path_pages_dir(), f"{page_no}.txt")

    async def fetch(self, url: str) -> Tuple[Union[str, None], int]:
        """Make a GET request using the internal session.

        @param url
            The URL to make a GET request to.
        @return
            Tuple containing the response body (if the request was successful)
            and status code.
        """
        self.has_made_request = True
        async with self.session.get(url) as response:
            if response.status == 200:
                return await response.text(), 200
            return None, response.status

    async def scrape_usernames(self, page_no: int) -> Union[List[str], None]:
        """Scrape the specified site for all coach usernames.

        All pages should be downloaded to `self.path_page_file()`. Any cached
        file should be a plain `.txt` file containing one username per line.

        @param page_no:
            How many times this function has been invoked (1-indexed). Useful
            for paginating responses back out to the `Pipeline` this `Fetcher`
            is embedded in.
        @return:
            A list of usernames. Should return an empty list if no more
            usernames are found. Can return `None` to indicate the specified
            page should be skipped.
        """
        raise NotImplementedError()

    async def _download_user_files(self, username: str) -> None:
        # Ensure the coach's directory exists before any files are written.
        os.makedirs(self.path_coach_dir(username), exist_ok=True)
        await self.download_user_files(username)

    async def download_user_files(self, username: str) -> None:
        """Scrape the specified site for all user-specific files.

        Which files are downloaded depends on the `Fetcher` implementation.
        All files should be downloaded to `self.path_coach_file()`.
        """
        raise NotImplementedError()
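

# A minimal sketch of what a concrete `Fetcher` might look like (illustrative
# only, not part of the original module): the endpoint URLs and plain-text
# response format below are hypothetical placeholders.
class ExampleFetcher(Fetcher):
    def __init__(self, session: aiohttp.ClientSession):
        super().__init__(Site.LICHESS, session)

    async def scrape_usernames(self, page_no: int) -> Union[List[str], None]:
        # Serve from the page cache when possible to avoid re-fetching.
        path = self.path_page_file(page_no)
        if os.path.exists(path):
            with open(path) as f:
                return f.read().splitlines()

        text, status = await self.fetch(
            f"https://example.com/coaches?page={page_no}"
        )
        if text is None:
            # A 404 is taken to mean pagination is exhausted; any other
            # failure skips just this page.
            return [] if status == 404 else None

        with open(path, "w") as f:
            f.write(text)
        return text.splitlines()

    async def download_user_files(self, username: str) -> None:
        text, _ = await self.fetch(f"https://example.com/coach/{username}")
        if text is not None:
            with open(self.path_coach_file(username, "profile.html"), "w") as f:
                f.write(text)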


def _insert(row: Row, key: str, value: Any):
    # Only set the column when a value was actually extracted.
    if value is not None:
        row[key] = value


class Extractor:
    """Extract structured coach data from site-specific downloads."""

    def __init__(self, fetcher: Fetcher, username: str):
        self.fetcher = fetcher
        self.username = username

    def get_name(self) -> Union[str, None]:
        raise NotImplementedError()

    def get_image_url(self) -> Union[str, None]:
        raise NotImplementedError()

    def get_rapid(self) -> Union[int, None]:
        raise NotImplementedError()

    def get_blitz(self) -> Union[int, None]:
        raise NotImplementedError()

    def get_bullet(self) -> Union[int, None]:
        raise NotImplementedError()

    def extract(self) -> Row:
        """Extract a table row from the coach-specific downloads."""
        row: Row = {}

        _insert(row, "site", self.fetcher.site)
        _insert(row, "username", self.username)

        _insert(row, "name", self.get_name())
        _insert(row, "image_url", self.get_image_url())
        _insert(row, "rapid", self.get_rapid())
        _insert(row, "blitz", self.get_blitz())
        _insert(row, "bullet", self.get_bullet())

        return row
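

# A minimal sketch of a concrete `Extractor` to pair with `ExampleFetcher`
# above (illustrative only). A real implementation would parse the cached
# downloads, e.g. the hypothetical `profile.html`; returning `None` simply
# omits that column from the row.
class ExampleExtractor(Extractor):
    def get_name(self) -> Union[str, None]:
        return None

    def get_image_url(self) -> Union[str, None]:
        return None

    def get_rapid(self) -> Union[int, None]:
        return None

    def get_blitz(self) -> Union[int, None]:
        return None

    def get_bullet(self) -> Union[int, None]:
        return None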


async def task_worker(name, queue):
    """Consume (conn, extractor) jobs from the queue and upsert the results."""
    while True:
        conn, extractor = await queue.get()
        upsert_row(conn, extractor.extract())
        queue.task_done()


class Pipeline:
    """Site-specific download and extraction pipeline.

    Performs downloads serially but processes data extraction from downloaded
    files concurrently.
    """

    def __init__(self, worker_count):
        self.worker_count = worker_count

    def get_fetcher(self, session: aiohttp.ClientSession) -> Fetcher:
        raise NotImplementedError()

    def get_extractor(self, fetcher: Fetcher, username: str) -> Extractor:
        raise NotImplementedError()

    async def process(self, conn, session: aiohttp.ClientSession):
        fetcher = self.get_fetcher(session)

        queue = asyncio.Queue()

        # Create a batch of workers to process the jobs put into the queue.
        workers = []
        for i in range(self.worker_count):
            worker = asyncio.create_task(task_worker(f"worker-{i}", queue))
            workers.append(worker)

        # Begin downloading all coach usernames and files. The workers run
        # concurrently to extract all the relevant information and write it
        # to the database.
        page_no = 1
        usernames = [None]
        while usernames:
            usernames = await fetcher.scrape_usernames(page_no)
            page_no += 1
            if usernames is None:
                # `None` means "skip this page"; reset the sentinel so the
                # loop keeps paginating.
                usernames = [None]
                continue
            for username in usernames:
                await fetcher._download_user_files(username)
                extractor = self.get_extractor(fetcher, username)
                queue.put_nowait((conn, extractor))

        # Wait until the queue is fully processed.
        await queue.join()

        # We can now wind down the workers.
        for worker in workers:
            worker.cancel()
        await asyncio.gather(*workers, return_exceptions=True)
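

# A sketch of how the pieces might be wired together (illustrative only;
# `ExampleFetcher` and `ExampleExtractor` are the hypothetical sketches
# defined above, and obtaining the database connection is assumed to be
# handled elsewhere, e.g. by `app.database`).
class ExamplePipeline(Pipeline):
    def get_fetcher(self, session: aiohttp.ClientSession) -> Fetcher:
        return ExampleFetcher(session)

    def get_extractor(self, fetcher: Fetcher, username: str) -> Extractor:
        return ExampleExtractor(fetcher, username)


async def main(conn):
    # Share one HTTP session so aiohttp can pool connections across requests.
    async with aiohttp.ClientSession() as session:
        await ExamplePipeline(worker_count=4).process(conn, session)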