Add class for wrapping around exports.

pull/3/head
Joshua Potter 2023-11-30 17:12:16 -07:00
parent bc2ffeae9d
commit e050d13aa7
6 changed files with 160 additions and 93 deletions

@@ -1,16 +1,23 @@
 import aiohttp
 import argparse
 import asyncio
 import json

-from app.chesscom import Scraper as ChesscomScraper
-from app.lichess import Scraper as LichessScraper
-from app.scraper import Site
+from app.chesscom import (
+    Exporter as ChesscomExporter,
+    Scraper as ChesscomScraper,
+)
+from app.lichess import (
+    Exporter as LichessExporter,
+    Scraper as LichessScraper,
+)
+from app.repo import Site


 async def run():
     parser = argparse.ArgumentParser(
         prog="coach-scraper",
-        description="HTML scraping of chess.com coaches.",
+        description="Scraping/exporting of chess coaches.",
     )
     parser.add_argument("-u", "--user-agent", required=True)
     parser.add_argument(
@@ -29,10 +36,19 @@ async def run():
     ) as session:
         if args.site == Site.CHESSCOM.value:
             scraper = ChesscomScraper(session)
+            exporter_cls = ChesscomExporter
         elif args.site == Site.LICHESS.value:
             scraper = LichessScraper(session)
+            exporter_cls = LichessExporter

-        await scraper.scrape()
+        dump = {}
+        usernames = await scraper.scrape()
+        for username in usernames:
+            dump[username] = exporter_cls(username).export()
+        with open(scraper.path_site_file("export.json"), "w") as f:
+            json.dump(dump, f, indent=2)


 def main():

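The runner now splits work into two phases: `scrape()` downloads and returns the coach usernames, and an `Exporter` per username turns the downloaded files into one uniform record. A minimal sketch of the same wiring for a single site, assuming the package layout from this commit (the hard-coded User-Agent is a placeholder for whatever the `--user-agent` flag normally supplies):

```python
import asyncio
import json

import aiohttp

from app.chesscom import Exporter as ChesscomExporter, Scraper as ChesscomScraper


async def demo():
    async with aiohttp.ClientSession(
        headers={"User-Agent": "coach-scraper-demo"}  # placeholder value
    ) as session:
        scraper = ChesscomScraper(session)
        usernames = await scraper.scrape()
        # One uniform Export dict per scraped coach.
        dump = {u: ChesscomExporter(u).export() for u in usernames}
        with open(scraper.path_site_file("export.json"), "w") as f:
            json.dump(dump, f, indent=2)


if __name__ == "__main__":
    asyncio.run(demo())
```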
app/chesscom.py

@@ -4,9 +4,11 @@ import json
 import os
 import os.path

-from app.scraper import AnsiColor, BaseScraper, Export, Site
+from app.repo import AnsiColor, Site
+from app.exporter import BaseExporter
+from app.scraper import BaseScraper
 from bs4 import BeautifulSoup
-from typing import List
+from typing import List, Union

 # The number of coach listing pages we will at most iterate through. This number
@@ -169,22 +171,19 @@ class Scraper(BaseScraper):
         return True

-    def _load_stats_json(self, stats: dict) -> Export:
-        """Extract relevant fields from a `stats.json` file."""
-        export: Export = {}
-        for stat in stats.get("stats", []):
-            if stat["key"] == "rapid":
-                export["fide_rapid"] = stat["stats"]["rating"]
-        return export
-
-    async def export(self, username: str) -> Export:
-        """Transform coach-specific data into uniform format."""
-        export: Export = {}
+
+class Exporter(BaseExporter):
+    def __init__(self, username: str):
+        super().__init__(site=Site.CHESSCOM.value, username=username)
+
+        self.stats_json = {}
         try:
             with open(self.path_coach_file(username, "stats.json"), "r") as f:
-                export.update(self._load_stats_json(json.load(f)))
+                for s in json.load(f).get("stats", []):
+                    if "key" in s and "stats" in s:
+                        self.stats_json[s["key"]] = s["stats"]
         except FileNotFoundError:
             pass
-        return export
+
+    def export_fide_rapid(self) -> Union[int, None]:
+        return self.stats_json.get("rapid", {}).get("rating")

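To see what the re-keying in `Exporter.__init__` buys, consider a hypothetical, abbreviated `stats.json` payload (field values invented for illustration); after indexing the `stats` list by `key`, the rapid rating is two dictionary lookups away:

```python
import json

raw = json.loads("""
{
  "stats": [
    {"key": "rapid", "stats": {"rating": 1873}},
    {"key": "blitz", "stats": {"rating": 2011}}
  ]
}
""")

# Mirrors Exporter.__init__: index each entry's "stats" dict by its "key".
stats_json = {
    s["key"]: s["stats"] for s in raw.get("stats", []) if "key" in s and "stats" in s
}

# Mirrors export_fide_rapid(): missing keys degrade to None instead of raising.
assert stats_json.get("rapid", {}).get("rating") == 1873
assert stats_json.get("bullet", {}).get("rating") is None
```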
app/exporter.py (new file, 34 lines)

@@ -0,0 +1,34 @@
+from app.repo import AnsiColor, Repo
+from typing import Union
+from typing_extensions import TypedDict
+
+
+class Export(TypedDict, total=False):
+    fide_rapid: int
+
+
+class BaseExporter(Repo):
+    def __init__(self, site: str, username: str):
+        super().__init__(site)
+        self.username = username
+
+    def export_fide_rapid(self) -> Union[int, None]:
+        raise NotImplementedError()
+
+    def export(self) -> Export:
+        """Transform coach-specific data into uniform format."""
+        export: Export = {}
+
+        fide_rapid = self.export_fide_rapid()
+        if fide_rapid:
+            export["fide_rapid"] = fide_rapid
+
+        self.log(
+            [
+                (AnsiColor.INFO, "[INFO]"),
+                (None, ": Exported "),
+                (AnsiColor.DATA, self.username),
+            ]
+        )
+
+        return export

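`BaseExporter` is a template: subclasses supply per-site `export_*` hooks, and `export()` assembles whatever they return into the `Export` TypedDict. A hypothetical subclass (site tag and rating are made up):

```python
from typing import Union

from app.exporter import BaseExporter


class ExampleExporter(BaseExporter):
    def __init__(self, username: str):
        super().__init__(site="example", username=username)  # made-up site tag

    def export_fide_rapid(self) -> Union[int, None]:
        return 1650  # invented rating


print(ExampleExporter("some_coach").export())  # {'fide_rapid': 1650}
```

Note the truthiness guard in `export()`: a hook returning `None` keeps its field out of the dict entirely, though it also means a legitimate rating of `0` would be dropped; `if fide_rapid is not None:` would be the stricter check.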
app/lichess.py

@@ -3,7 +3,9 @@ import asyncio
 import os
 import os.path

-from app.scraper import AnsiColor, BaseScraper, Export, Site
+from app.repo import AnsiColor, Site
+from app.scraper import BaseScraper
+from app.exporter import BaseExporter
 from bs4 import BeautifulSoup
 from typing import List
@@ -111,7 +113,14 @@ class Scraper(BaseScraper):
         """
         filepath = self.path_coach_file(username, f"{username}.html")
         if os.path.isfile(filepath):
-            return False
+            self.log(
+                [
+                    (AnsiColor.INFO, "[INFO]"),
+                    (None, ": Skipping download for coach "),
+                    (AnsiColor.DATA, username),
+                ]
+            )
+            return

         response, _unused_status = await self.request(
             url=f"https://lichess.org/coach/{username}"
@@ -120,9 +129,18 @@
         with open(filepath, "w") as f:
             f.write(response)

-        return True
+        self.log(
+            [
+                (AnsiColor.INFO, "[INFO]"),
+                (None, ": Downloaded data for coach "),
+                (AnsiColor.DATA, username),
+            ]
+        )

-    async def export(self, username: str) -> Export:
-        """Transform coach-specific data into uniform format."""
-        export: Export = {}
-        return export
+
+class Exporter(BaseExporter):
+    def __init__(self, username: str):
+        super().__init__(site=Site.LICHESS.value, username=username)
+
+    def export_fide_rapid(self):
+        return None

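Since the Lichess `export_fide_rapid` hook returns `None` for now, exporting a Lichess coach produces an empty record; the truthiness guard in `BaseExporter.export()` filters the field out. A quick illustration (hypothetical username; nothing needs to exist on disk, as this exporter reads no files in `__init__`):

```python
from app.lichess import Exporter as LichessExporter

print(LichessExporter("some_coach").export())
# Logs a colored "[INFO]: Exported some_coach" line, then prints: {}
```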
app/repo.py (new file, 61 lines)

@@ -0,0 +1,61 @@
+import enum
+import os
+
+from typing import List, Tuple, Union
+
+
+class AnsiColor(enum.Enum):
+    ERROR = "\033[0;31m"
+    INFO = "\033[0;34m"
+    DATA = "\033[0;36m"
+    RESET = "\033[0m"
+
+
+class Site(enum.Enum):
+    CHESSCOM = "chesscom"
+    LICHESS = "lichess"
+
+
+class Repo:
+    """Shared filesystem-related functionality."""
+
+    def __init__(self, site: str):
+        self.site = site
+
+    def path_site_dir(self):
+        """The root directory for all site-related files."""
+        return os.path.join("data", self.site)
+
+    def path_site_file(self, filename: str):
+        """Path to a top-level site-related file."""
+        return os.path.join(self.path_site_dir(), filename)
+
+    def path_coaches_dir(self):
+        """The root directory for all coach-related downloads."""
+        return os.path.join(self.path_site_dir(), "coaches")
+
+    def path_coach_dir(self, username: str):
+        """The root directory for a specific coach's downloads."""
+        return os.path.join(self.path_coaches_dir(), username)
+
+    def path_coach_file(self, username: str, filename: str):
+        """Path to a coach-specific file download."""
+        return os.path.join(self.path_coach_dir(username), filename)
+
+    def path_pages_dir(self):
+        """The root directory for all username listing files."""
+        return os.path.join(self.path_site_dir(), "pages")
+
+    def path_page_file(self, page_no: int):
+        """The root directory for usernames scraped from a single page."""
+        return os.path.join(self.path_pages_dir(), f"{page_no}.txt")
+
+    def log(self, msgs: List[Tuple[Union[AnsiColor, None], str]]):
+        transformed = []
+        for k, v in msgs:
+            if k is None:
+                transformed.append(v)
+            else:
+                transformed.append(f"{k.value}{v}{AnsiColor.RESET.value}")
+        print("".join(transformed))

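All filesystem layout now lives in `Repo`, so the directory scheme both scrapers share can be exercised in isolation. A sketch (hypothetical username; paths shown as produced on a POSIX system):

```python
from app.repo import AnsiColor, Repo, Site

repo = Repo(Site.LICHESS.value)

print(repo.path_site_file("export.json"))                # data/lichess/export.json
print(repo.path_coach_dir("some_coach"))                 # data/lichess/coaches/some_coach
print(repo.path_coach_file("some_coach", "stats.json"))  # data/lichess/coaches/some_coach/stats.json
print(repo.path_page_file(1))                            # data/lichess/pages/1.txt

# log() joins colored and plain segments into one line, resetting after each.
repo.log([(AnsiColor.INFO, "[INFO]"), (None, ": layout check")])
```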
app/scraper.py

@@ -1,38 +1,20 @@
 import aiohttp
-import enum
-import json
 import os

+from app.repo import Repo
 from typing import List, Tuple, Union
-from typing_extensions import TypedDict
-
-
-class Site(enum.Enum):
-    CHESSCOM = "chesscom"
-    LICHESS = "lichess"
-
-
-class AnsiColor(enum.Enum):
-    ERROR = "\033[0;31m"
-    INFO = "\033[0;34m"
-    DATA = "\033[0;36m"
-    RESET = "\033[0m"
-
-
-class Export(TypedDict, total=False):
-    fide_rapid: Union[int, None]
-
-
-class BaseScraper:
+class BaseScraper(Repo):
     def __init__(self, site: str, session: aiohttp.ClientSession):
-        """Initialize a new web scraper and exporter.
+        """Initialize a new web scraper.

         @param site:
             The site we are making requests out to.
         @param session:
             The `aiohttp.ClientSession` context our requests are made from.
         """
-        self.site = site
+        super().__init__(site)
         self.session = session

     async def download_usernames(self) -> List[str]:
@@ -43,10 +25,6 @@ class BaseScraper:
         """For each coach, download coach-specific data."""
         raise NotImplementedError()

-    async def export(self, username: str) -> Export:
-        """Transform coach-specific data into uniform format."""
-        raise NotImplementedError()
-
     async def request(self, url: str) -> Tuple[Union[str, None], int]:
         """Make network requests using the internal session.
@@ -61,7 +39,7 @@ class BaseScraper:
                 return await response.text(), 200
             return None, response.status

-    async def scrape(self):
+    async def scrape(self) -> List[str]:
         """Main entrypoint for scraping and exporting downloaded content.

         A `Scraper` is structured to operate in the following stages:
@@ -77,43 +55,4 @@ class BaseScraper:
             os.makedirs(self.path_coach_dir(username), exist_ok=True)
             await self.download_profile(username)
-            export = await self.export(username)
-            with open(self.path_coach_file(username, "export.json"), "w") as f:
-                json.dump(export, f)
-
-            self.log(
-                [
-                    (AnsiColor.INFO, "[INFO]"),
-                    (None, ": Finished exporting "),
-                    (AnsiColor.DATA, username),
-                ]
-            )
-
-    def path_coaches_dir(self):
-        """The root directory for all coach-related downloads."""
-        return os.path.join("data", self.site, "coaches")
-
-    def path_coach_dir(self, username: str):
-        """The root directory for a specific coach's downloads."""
-        return os.path.join(self.path_coaches_dir(), username)
-
-    def path_coach_file(self, username: str, filename: str):
-        """Path to a coach-specific file download."""
-        return os.path.join(self.path_coach_dir(username), filename)
-
-    def path_pages_dir(self):
-        """The root directory for all username listing files."""
-        return os.path.join("data", self.site, "pages")
-
-    def path_page_file(self, page_no: int):
-        """The root directory for usernames scraped from a single page."""
-        return os.path.join(self.path_pages_dir(), f"{page_no}.txt")
-
-    def log(self, msgs: List[Tuple[Union[AnsiColor, None], str]]):
-        transformed = []
-        for k, v in msgs:
-            if k is None:
-                transformed.append(v)
-            else:
-                transformed.append(f"{k.value}{v}{AnsiColor.RESET.value}")
-        print("".join(transformed))
+
+        return usernames
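With exporting gone, `BaseScraper` is a pure download template: subclasses implement `download_usernames` and `download_profile`, and `scrape()` drives them before handing the username list back for the export phase. A hypothetical minimal subclass (site tag and usernames invented):

```python
from typing import List

import aiohttp

from app.scraper import BaseScraper


class ExampleScraper(BaseScraper):
    def __init__(self, session: aiohttp.ClientSession):
        super().__init__(site="example", session=session)  # made-up site tag

    async def download_usernames(self) -> List[str]:
        # A real implementation pages through listings via self.request(...)
        # and caches each page at self.path_page_file(...).
        return ["coach_a", "coach_b"]

    async def download_profile(self, username: str):
        # A real implementation fetches and writes coach files under
        # self.path_coach_dir(username).
        pass
```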