2023-11-27 20:09:40 +00:00
|
|
|
import aiohttp
|
2023-11-30 22:15:15 +00:00
|
|
|
import enum
|
|
|
|
import json
|
2023-11-27 20:09:40 +00:00
|
|
|
import os
|
2023-11-30 22:15:15 +00:00
|
|
|
|
|
|
|
from typing import List, Tuple, Union
|
|
|
|
from typing_extensions import TypedDict
|
|
|
|
|
|
|
|
|
|
|
|
class Site(enum.Enum):
    """Identifiers for the sites that can be scraped.

    Each member's value is the short lowercase name of the site.
    """

    # chess.com
    CHESSCOM = "chesscom"

    # lichess
    LICHESS = "lichess"
|
|
|
|
|
|
|
|
|
|
|
|
class AnsiColor(enum.Enum):
    """ANSI escape sequences used to colorize terminal output.

    Emit a member's value before a piece of text to switch the terminal to
    that color, and emit `RESET` afterwards to restore the default rendering.
    """

    # SGR 31: red foreground.
    ERROR = "\033[0;31m"

    # SGR 34: blue foreground.
    INFO = "\033[0;34m"

    # SGR 36: cyan foreground.
    DATA = "\033[0;36m"

    # SGR 0: clear all attributes.
    RESET = "\033[0m"
|
|
|
|
|
|
|
|
|
|
|
|
class Export(TypedDict, total=False):
    """Shape of the per-coach dictionary produced by an exporter.

    Declared with `total=False`, so every key listed here may be omitted.
    """

    # Presumably the coach's FIDE rapid rating, `None` when it is unknown;
    # NOTE(review): confirm against the site-specific exporters.
    fide_rapid: Union[int, None]
|
|
|
|
|
|
|
|
|
|
|
|
class BaseScraper:
    """Base class for site-specific coach scrapers.

    Subclasses are expected to implement `download_usernames`,
    `download_profile` and `export`. This class supplies the GET helper, the
    on-disk layout beneath `data/<site>/`, console logging, and the `scrape`
    driver that runs the stages in order.
    """

    def __init__(
        self,
        site: Union[str, enum.Enum],
        session: aiohttp.ClientSession,
    ):
        """Initialize a new web scraper and exporter.

        @param site:
            The site we are making requests out to. Either the site's name as
            a string or an enum member (e.g. `Site.LICHESS`) whose value is
            that name.
        @param session:
            The `aiohttp.ClientSession` context our requests are made from.
        """
        # Enum members are reduced to their value so that `self.site` can be
        # handed straight to `os.path.join` by the path helpers below.
        self.site = site.value if isinstance(site, enum.Enum) else site
        self.session = session

    async def download_usernames(self) -> List[str]:
        """Collect all coach usernames from the specified site.

        Must be overridden; the base implementation always raises
        `NotImplementedError`.
        """
        raise NotImplementedError()

    async def download_profile(self, username: str):
        """For each coach, download coach-specific data.

        Must be overridden; the base implementation always raises
        `NotImplementedError`.

        @param username:
            The coach whose data should be downloaded.
        """
        raise NotImplementedError()

    async def export(self, username: str) -> Export:
        """Transform coach-specific data into uniform format.

        Must be overridden; the base implementation always raises
        `NotImplementedError`.

        @param username:
            The coach whose downloaded data should be transformed.
        """
        raise NotImplementedError()

    async def request(self, url: str) -> Tuple[Union[str, None], int]:
        """Make network requests using the internal session.

        @param url
            The URL to make a GET request to.
        @return
            Tuple containing the response body (if the request was successful)
            and status code. Only a 200 status counts as successful; for any
            other status the body is `None`.
        """
        async with self.session.get(url) as response:
            if response.status == 200:
                return await response.text(), 200
            return None, response.status

    async def scrape(self):
        """Main entrypoint for scraping and exporting downloaded content.

        A `Scraper` is structured to operate in the following stages:

        1. Collect all coach usernames from the specified site.
        2. For each coach, download coach-specific data.
        3. Transform this data and export into uniform format.

        The exported data for each coach is written as JSON to `export.json`
        inside that coach's directory.
        """
        os.makedirs(self.path_coaches_dir(), exist_ok=True)
        os.makedirs(self.path_pages_dir(), exist_ok=True)

        usernames = await self.download_usernames()
        for username in usernames:
            # NOTE(review): usernames are used verbatim as directory names;
            # confirm that subclasses only return path-safe values.
            os.makedirs(self.path_coach_dir(username), exist_ok=True)
            await self.download_profile(username)

            export = await self.export(username)
            export_path = self.path_coach_file(username, "export.json")
            # Explicit encoding keeps the file independent of the locale.
            with open(export_path, "w", encoding="utf-8") as f:
                json.dump(export, f)

            self.log(
                [
                    (AnsiColor.INFO, "[INFO]"),
                    (None, ": Finished exporting "),
                    (AnsiColor.DATA, username),
                ]
            )

    def path_coaches_dir(self):
        """The root directory for all coach-related downloads."""
        return os.path.join("data", self.site, "coaches")

    def path_coach_dir(self, username: str):
        """The root directory for a specific coach's downloads."""
        return os.path.join(self.path_coaches_dir(), username)

    def path_coach_file(self, username: str, filename: str):
        """Path to a coach-specific file download."""
        return os.path.join(self.path_coach_dir(username), filename)

    def path_pages_dir(self):
        """The root directory for all username listing files."""
        return os.path.join("data", self.site, "pages")

    def path_page_file(self, page_no: int):
        """Path to the `<page_no>.txt` file inside the pages directory."""
        return os.path.join(self.path_pages_dir(), f"{page_no}.txt")

    def log(self, msgs: List[Tuple[Union[AnsiColor, None], str]]):
        """Print one line assembled from optionally colored fragments.

        @param msgs:
            Ordered `(color, text)` pairs. Text paired with `None` is printed
            unchanged; text paired with an `AnsiColor` is prefixed with that
            color's escape sequence and followed by `AnsiColor.RESET`.
        """
        rendered = []
        for color, text in msgs:
            if color is not None:
                text = f"{color.value}{text}{AnsiColor.RESET.value}"
            rendered.append(text)

        print("".join(rendered))
|