import asyncio
import logging
import os.path
from typing import Any, List, Tuple

import aiohttp
from lingua import LanguageDetector

from coach_scraper.database import Row, RowKey, upsert_row
from coach_scraper.locale import Locale
from coach_scraper.types import Site, Title


class Fetcher:
    """Download and cache files from the specified site.

    Each implementation of this class is responsible for rate-limiting requests.
    """

    def __init__(self, site: Site, session: aiohttp.ClientSession):
        self.site = site
        self.session = session
        self.has_made_request = False

        os.makedirs(self.path_coaches_dir(), exist_ok=True)
        os.makedirs(self.path_pages_dir(), exist_ok=True)

    def path_site_dir(self) -> str:
        return os.path.join("data", self.site.value)

    def path_site_file(self, filename: str) -> str:
        return os.path.join(self.path_site_dir(), filename)

    def path_coaches_dir(self) -> str:
        return os.path.join(self.path_site_dir(), "coaches")

    def path_coach_dir(self, username: str) -> str:
        return os.path.join(self.path_coaches_dir(), username)

    def path_coach_file(self, username: str, filename: str) -> str:
        return os.path.join(self.path_coach_dir(username), filename)

    def path_pages_dir(self) -> str:
        return os.path.join(self.path_site_dir(), "pages")

    def path_page_file(self, page_no: int) -> str:
        return os.path.join(self.path_pages_dir(), f"{page_no}.txt")

    async def fetch(self, url: str) -> Tuple[str | None, int]:
        """Make network requests using the internal session.

        @param url
            The URL to make a GET request to.
        @return
            Tuple containing the response body (if the request was successful)
            and status code.
        """
        self.has_made_request = True
        async with self.session.get(url) as response:
            if response.status == 200:
                return await response.text(), 200
            logging.error(f"Could not fetch URL {url}. Status code: {response.status}")
            return None, response.status

    async def scrape_usernames(self, page_no: int) -> List[str] | None:
        """Scrape the specified site for all coach usernames.

        All pages should be downloaded at `self.path_page_file()`. Any cached
        file should be a plain `.txt` file containing one username per line.

        @param page_no:
            How many times this function was invoked (1-indexed). Useful to
            paginate responses back out to the `Pipeline` this `Fetcher`
            is embedded in.
        @return:
            A list of usernames. Should return an empty list if no more
            usernames are found. Can return `None` to indicate the specified
            page should be skipped.
        """
        raise NotImplementedError()

    async def _download_user_files(self, username: str) -> None:
        os.makedirs(self.path_coach_dir(username), exist_ok=True)
        await self.download_user_files(username)

    async def download_user_files(self, username: str) -> None:
        """Download all user-specific files from the specified site.

        What files are downloaded depends on the `Fetcher` implementation.
        All files should be downloaded at `self.path_coach_file()`.
        """
        raise NotImplementedError()
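

# --- Illustrative sketch (not part of the original module) ------------------
# A minimal site-specific `Fetcher`, showing one way to satisfy the two
# contracts documented above: rate-limiting every network request and caching
# downloads under `path_page_file()` / `path_coach_file()`. The URLs and the
# one-username-per-line response format are hypothetical stand-ins, not any
# real site's API.
class ExampleFetcher(Fetcher):
    DELAY_SECONDS = 2

    async def _throttled_fetch(self, url: str) -> str | None:
        # Crude rate limit: pause before every request after the first.
        if self.has_made_request:
            await asyncio.sleep(self.DELAY_SECONDS)
        text, _status = await self.fetch(url)
        return text

    async def scrape_usernames(self, page_no: int) -> List[str] | None:
        # Serve from the page cache when available.
        path = self.path_page_file(page_no)
        if os.path.isfile(path):
            with open(path) as f:
                return [line.strip() for line in f if line.strip()]
        # Hypothetical URL; assumes the response lists one username per line.
        text = await self._throttled_fetch(f"https://example.com/coaches?page={page_no}")
        if text is None:
            return None  # Have the caller skip this page.
        usernames = [line.strip() for line in text.splitlines() if line.strip()]
        with open(path, "w") as f:
            f.write("\n".join(usernames))
        return usernames

    async def download_user_files(self, username: str) -> None:
        # `_download_user_files` has already created the coach directory.
        path = self.path_coach_file(username, "profile.html")
        if os.path.isfile(path):
            return
        text = await self._throttled_fetch(f"https://example.com/coaches/{username}")
        if text is not None:
            with open(path, "w") as f:
                f.write(text)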


def _insert(row: Row, key: RowKey, value: Any):
    # Only write the column when extraction produced an actual value.
    if value is not None:
        row[key] = value


class Extractor:
    """Parse a coach's downloaded files into a database row."""

    def __init__(self, fetcher: Fetcher, detector: LanguageDetector, username: str):
        self.fetcher = fetcher
        self.detector = detector
        self.username = username

    def get_name(self) -> str | None:
        raise NotImplementedError()

    def get_image_url(self) -> str | None:
        raise NotImplementedError()

    def get_title(self) -> Title | None:
        raise NotImplementedError()

    def get_languages(self) -> List[Locale] | None:
        raise NotImplementedError()

    def get_rapid(self) -> int | None:
        raise NotImplementedError()

    def get_blitz(self) -> int | None:
        raise NotImplementedError()

    def get_bullet(self) -> int | None:
        raise NotImplementedError()

    def extract(self) -> Row:
        """Extract a table row from the coach-specific downloads."""
        row: Row = {}

        _insert(row, "site", self.fetcher.site)
        _insert(row, "username", self.username)

        _insert(row, "name", self.get_name())
        _insert(row, "image_url", self.get_image_url())
        _insert(row, "title", self.get_title())
        _insert(row, "languages", self.get_languages())
        _insert(row, "rapid", self.get_rapid())
        _insert(row, "blitz", self.get_blitz())
        _insert(row, "bullet", self.get_bullet())

        return row
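

# --- Illustrative sketch (not part of the original module) ------------------
# A matching `Extractor` for the hypothetical site above. Each getter may
# return `None`, in which case `_insert` simply omits that column from the
# row. `locale_from_language` is a hypothetical helper; only
# `detector.detect_language_of` is real lingua API.
class ExampleExtractor(Extractor):
    def __init__(self, fetcher: Fetcher, detector: LanguageDetector, username: str):
        super().__init__(fetcher, detector, username)
        path = fetcher.path_coach_file(username, "profile.html")
        self.profile = open(path).read() if os.path.isfile(path) else None

    def get_languages(self) -> List[Locale] | None:
        if self.profile is None:
            return None
        # A real implementation would first extract the bio text from the
        # cached HTML rather than detecting on the raw page.
        language = self.detector.detect_language_of(self.profile)
        if language is None:
            return None
        # Mapping a lingua `Language` onto this project's `Locale` is
        # site-specific; a hypothetical helper stands in here.
        return [locale_from_language(language)]

    # get_name, get_image_url, get_title, get_rapid, get_blitz, and
    # get_bullet would parse the same cached files in the same fashion.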


async def task_worker(name, queue):
    """Pull `(conn, extractor)` jobs off the queue and persist the rows."""
    while True:
        conn, extractor = await queue.get()
        try:
            upsert_row(conn, extractor.extract())
        except Exception:
            logging.exception(f"{name}: failed to extract row.")
        finally:
            # Always mark the job done, even on failure, so `queue.join()`
            # in `Pipeline.process` cannot hang.
            queue.task_done()


class Pipeline:
    """Site-specific download and extraction pipeline.

    Performs downloads serially but processes data extraction from downloaded
    files concurrently.
    """

    def __init__(self, worker_count: int):
        self.worker_count = worker_count

    def get_fetcher(self, session: aiohttp.ClientSession) -> Fetcher:
        raise NotImplementedError()

    def get_extractor(
        self, fetcher: Fetcher, detector: LanguageDetector, username: str
    ) -> Extractor:
        raise NotImplementedError()

    async def process(
        self, conn, detector: LanguageDetector, session: aiohttp.ClientSession
    ):
        fetcher = self.get_fetcher(session)

        queue: asyncio.Queue = asyncio.Queue()

        # Create a batch of workers to process the jobs put into the queue.
        workers = []
        for i in range(self.worker_count):
            worker = asyncio.create_task(task_worker(f"worker-{i}", queue))
            workers.append(worker)

        # Begin downloading all coach usernames and files. The workers will
        # run concurrently to extract all the relevant information and write
        # it to the database.
        page_no = 1
        # Sentinel value: any non-empty list gets the loop started.
        usernames: List[str] | None = [""]
        while usernames is None or len(usernames):
            usernames = await fetcher.scrape_usernames(page_no)
            page_no += 1
            for username in usernames or []:
                await fetcher._download_user_files(username)
                extractor = self.get_extractor(fetcher, detector, username)
                queue.put_nowait((conn, extractor))

        # Wait until the queue is fully processed.
        await queue.join()

        # We can now shut down the workers.
        for worker in workers:
            worker.cancel()
        await asyncio.gather(*workers, return_exceptions=True)
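

# --- Illustrative usage sketch (not part of the original module) ------------
# Wiring everything together. `ExamplePipeline` builds on the hypothetical
# classes sketched above; `LanguageDetectorBuilder` is real lingua API, but
# how `conn` is opened depends on `coach_scraper.database` and is left as an
# assumption.
class ExamplePipeline(Pipeline):
    def get_fetcher(self, session: aiohttp.ClientSession) -> Fetcher:
        # `Site` members are project-specific; substitute the right one.
        return ExampleFetcher(site=list(Site)[0], session=session)

    def get_extractor(
        self, fetcher: Fetcher, detector: LanguageDetector, username: str
    ) -> Extractor:
        return ExampleExtractor(fetcher, detector, username)


async def main():
    from lingua import LanguageDetectorBuilder

    detector = LanguageDetectorBuilder.from_all_languages().build()
    conn = ...  # Open a database connection however `upsert_row` expects.
    async with aiohttp.ClientSession() as session:
        await ExamplePipeline(worker_count=10).process(conn, detector, session)


# To run: asyncio.run(main())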