commit e050d13aa7 (pull/3/head)
Author: Joshua Potter
Date:   2023-11-30 17:12:16 -07:00
Parent: bc2ffeae9d

    Add class for wrapping around exports.

6 files changed, 160 insertions(+), 93 deletions(-)


@@ -1,16 +1,23 @@
 import aiohttp
 import argparse
 import asyncio
+import json
 
-from app.chesscom import Scraper as ChesscomScraper
-from app.lichess import Scraper as LichessScraper
-from app.scraper import Site
+from app.chesscom import (
+    Exporter as ChesscomExporter,
+    Scraper as ChesscomScraper,
+)
+from app.lichess import (
+    Exporter as LichessExporter,
+    Scraper as LichessScraper,
+)
+from app.repo import Site
 
 
 async def run():
     parser = argparse.ArgumentParser(
         prog="coach-scraper",
-        description="HTML scraping of chess.com coaches.",
+        description="Scraping/exporting of chess coaches.",
     )
     parser.add_argument("-u", "--user-agent", required=True)
     parser.add_argument(
@@ -29,10 +36,19 @@ async def run():
     ) as session:
         if args.site == Site.CHESSCOM.value:
             scraper = ChesscomScraper(session)
+            exporter_cls = ChesscomExporter
         elif args.site == Site.LICHESS.value:
             scraper = LichessScraper(session)
+            exporter_cls = LichessExporter
 
-        await scraper.scrape()
+        dump = {}
+        usernames = await scraper.scrape()
+        for username in usernames:
+            dump[username] = exporter_cls(username).export()
+
+        with open(scraper.path_site_file("export.json"), "w") as f:
+            json.dump(dump, f, indent=2)
 
 
 def main():

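For context: exporting now happens once per site run, aggregating every coach into a single JSON file instead of writing one export per coach. A minimal sketch of the dispatch pattern above, with a stand-in exporter (the FakeExporter class, username, and rating are illustrative, not part of the commit):

    import json

    class FakeExporter:
        """Stand-in for ChesscomExporter/LichessExporter."""

        def __init__(self, username: str):
            self.username = username

        def export(self) -> dict:
            return {"fide_rapid": 1850}

    # Chosen per args.site, as in run() above.
    exporter_cls = FakeExporter
    dump = {u: exporter_cls(u).export() for u in ["some-coach"]}
    print(json.dumps(dump, indent=2))  # shape of data/<site>/export.json
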
app/chesscom.py

@@ -4,9 +4,11 @@ import json
 import os
 import os.path
 
-from app.scraper import AnsiColor, BaseScraper, Export, Site
+from app.repo import AnsiColor, Site
+from app.exporter import BaseExporter
+from app.scraper import BaseScraper
 from bs4 import BeautifulSoup
-from typing import List
+from typing import List, Union
 
 
 # The number of coach listing pages we will at most iterate through. This number
@@ -169,22 +171,19 @@ class Scraper(BaseScraper):
         return True
 
-    def _load_stats_json(self, stats: dict) -> Export:
-        """Extract relevant fields from a `stats.json` file."""
-        export: Export = {}
-        for stat in stats.get("stats", []):
-            if stat["key"] == "rapid":
-                export["fide_rapid"] = stat["stats"]["rating"]
-        return export
-
-    async def export(self, username: str) -> Export:
-        """Transform coach-specific data into uniform format."""
-        export: Export = {}
+
+class Exporter(BaseExporter):
+    def __init__(self, username: str):
+        super().__init__(site=Site.CHESSCOM.value, username=username)
+
+        self.stats_json = {}
         try:
             with open(self.path_coach_file(username, "stats.json"), "r") as f:
-                export.update(self._load_stats_json(json.load(f)))
+                for s in json.load(f).get("stats", []):
+                    if "key" in s and "stats" in s:
+                        self.stats_json[s["key"]] = s["stats"]
         except FileNotFoundError:
             pass
-        return export
+
+    def export_fide_rapid(self) -> Union[int, None]:
+        return self.stats_json.get("rapid", {}).get("rating")

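The new chess.com Exporter pre-indexes the downloaded stats.json by stat key so that rating lookups become plain dictionary reads. A standalone sketch of that re-keying, using a made-up stats.json payload (the real file is downloaded from chess.com):

    # Hypothetical payload mirroring the stats.json structure parsed above.
    stats = {
        "stats": [
            {"key": "rapid", "stats": {"rating": 1850}},
            {"key": "blitz", "stats": {"rating": 1700}},
        ]
    }

    stats_json = {}
    for s in stats.get("stats", []):
        if "key" in s and "stats" in s:
            stats_json[s["key"]] = s["stats"]

    # Mirrors export_fide_rapid: returns None when "rapid" is absent.
    print(stats_json.get("rapid", {}).get("rating"))  # 1850
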
app/exporter.py (new file, 34 lines)

@@ -0,0 +1,34 @@
+from app.repo import AnsiColor, Repo
+from typing import Union
+from typing_extensions import TypedDict
+
+
+class Export(TypedDict, total=False):
+    fide_rapid: int
+
+
+class BaseExporter(Repo):
+    def __init__(self, site: str, username: str):
+        super().__init__(site)
+        self.username = username
+
+    def export_fide_rapid(self) -> Union[int, None]:
+        raise NotImplementedError()
+
+    def export(self) -> Export:
+        """Transform coach-specific data into uniform format."""
+        export: Export = {}
+
+        fide_rapid = self.export_fide_rapid()
+        if fide_rapid:
+            export["fide_rapid"] = fide_rapid
+
+        self.log(
+            [
+                (AnsiColor.INFO, "[INFO]"),
+                (None, ": Exported "),
+                (AnsiColor.DATA, self.username),
+            ]
+        )
+
+        return export

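BaseExporter is the new extension point: subclasses override export_fide_rapid and export() assembles the uniform Export dict. A minimal sketch of a subclass, assuming the module layout above (StaticExporter and its rating are illustrative):

    from typing import Union

    from app.exporter import BaseExporter

    class StaticExporter(BaseExporter):
        """Hypothetical exporter that always reports the same rating."""

        def __init__(self, username: str):
            super().__init__(site="chesscom", username=username)

        def export_fide_rapid(self) -> Union[int, None]:
            return 2000  # a real subclass derives this from downloaded data

    print(StaticExporter("some-coach").export())  # {'fide_rapid': 2000}

Note that export() guards with a truthiness check, so a rating of 0 would be dropped from the output just like None.
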
app/lichess.py

@@ -3,7 +3,9 @@ import asyncio
 import os
 import os.path
 
-from app.scraper import AnsiColor, BaseScraper, Export, Site
+from app.repo import AnsiColor, Site
+from app.scraper import BaseScraper
+from app.exporter import BaseExporter
 from bs4 import BeautifulSoup
 from typing import List
@@ -111,7 +113,14 @@ class Scraper(BaseScraper):
         """
         filepath = self.path_coach_file(username, f"{username}.html")
         if os.path.isfile(filepath):
-            return False
+            self.log(
+                [
+                    (AnsiColor.INFO, "[INFO]"),
+                    (None, ": Skipping download for coach "),
+                    (AnsiColor.DATA, username),
+                ]
+            )
+            return
 
         response, _unused_status = await self.request(
             url=f"https://lichess.org/coach/{username}"
@@ -120,9 +129,18 @@ class Scraper(BaseScraper):
         with open(filepath, "w") as f:
             f.write(response)
 
-        return True
-
-    async def export(self, username: str) -> Export:
-        """Transform coach-specific data into uniform format."""
-        export: Export = {}
-        return export
+        self.log(
+            [
+                (AnsiColor.INFO, "[INFO]"),
+                (None, ": Downloaded data for coach "),
+                (AnsiColor.DATA, username),
+            ]
+        )
+
+
+class Exporter(BaseExporter):
+    def __init__(self, username: str):
+        super().__init__(site=Site.LICHESS.value, username=username)
+
+    def export_fide_rapid(self):
+        return None

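download_profile now logs and returns early when the coach page already exists on disk, so repeated runs skip completed work. A toy sketch of the same skip-if-present pattern, with illustrative paths and plain print in place of Repo.log:

    import os
    import os.path

    def download_stub(filepath: str, content: str) -> None:
        # Skip work that a previous run already produced.
        if os.path.isfile(filepath):
            print(f"[INFO]: Skipping download for {filepath}")
            return
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath, "w") as f:
            f.write(content)
        print(f"[INFO]: Downloaded data to {filepath}")

    path = "data/lichess/coaches/some-coach/some-coach.html"
    download_stub(path, "<html></html>")  # downloads
    download_stub(path, "<html></html>")  # skipped on the second call
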
app/repo.py (new file, 61 lines)

@@ -0,0 +1,61 @@
+import enum
+import os
+
+from typing import List, Tuple, Union
+
+
+class AnsiColor(enum.Enum):
+    ERROR = "\033[0;31m"
+    INFO = "\033[0;34m"
+    DATA = "\033[0;36m"
+    RESET = "\033[0m"
+
+
+class Site(enum.Enum):
+    CHESSCOM = "chesscom"
+    LICHESS = "lichess"
+
+
+class Repo:
+    """Shared filesystem-related functionality."""
+
+    def __init__(self, site: str):
+        self.site = site
+
+    def path_site_dir(self):
+        """The root directory for all site-related files."""
+        return os.path.join("data", self.site)
+
+    def path_site_file(self, filename: str):
+        """Path to a top-level site-related file."""
+        return os.path.join(self.path_site_dir(), filename)
+
+    def path_coaches_dir(self):
+        """The root directory for all coach-related downloads."""
+        return os.path.join(self.path_site_dir(), "coaches")
+
+    def path_coach_dir(self, username: str):
+        """The root directory for a specific coach's downloads."""
+        return os.path.join(self.path_coaches_dir(), username)
+
+    def path_coach_file(self, username: str, filename: str):
+        """Path to a coach-specific file download."""
+        return os.path.join(self.path_coach_dir(username), filename)
+
+    def path_pages_dir(self):
+        """The root directory for all username listing files."""
+        return os.path.join(self.path_site_dir(), "pages")
+
+    def path_page_file(self, page_no: int):
+        """The root directory for usernames scraped from a single page."""
+        return os.path.join(self.path_pages_dir(), f"{page_no}.txt")
+
+    def log(self, msgs: List[Tuple[Union[AnsiColor, None], str]]):
+        transformed = []
+        for k, v in msgs:
+            if k is None:
+                transformed.append(v)
+            else:
+                transformed.append(f"{k.value}{v}{AnsiColor.RESET.value}")
+        print("".join(transformed))

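Everything in Repo resolves against a data/<site> root shared by scrapers and exporters. A quick sketch of what the helpers return for a hypothetical lichess run (POSIX paths shown):

    from app.repo import AnsiColor, Repo

    repo = Repo(site="lichess")
    print(repo.path_site_file("export.json"))   # data/lichess/export.json
    print(repo.path_coach_dir("some-coach"))    # data/lichess/coaches/some-coach
    print(repo.path_page_file(3))               # data/lichess/pages/3.txt

    # Colored segments are wrapped in ANSI escapes; None segments pass through.
    repo.log([(AnsiColor.INFO, "[INFO]"), (None, ": hello")])
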
app/scraper.py

@@ -1,38 +1,20 @@
 import aiohttp
-import enum
-import json
 import os
 
+from app.repo import Repo
 from typing import List, Tuple, Union
-from typing_extensions import TypedDict
 
 
-class Site(enum.Enum):
-    CHESSCOM = "chesscom"
-    LICHESS = "lichess"
-
-
-class AnsiColor(enum.Enum):
-    ERROR = "\033[0;31m"
-    INFO = "\033[0;34m"
-    DATA = "\033[0;36m"
-    RESET = "\033[0m"
-
-
-class Export(TypedDict, total=False):
-    fide_rapid: Union[int, None]
-
-
-class BaseScraper:
+class BaseScraper(Repo):
     def __init__(self, site: str, session: aiohttp.ClientSession):
-        """Initialize a new web scraper and exporter.
+        """Initialize a new web scraper.
 
         @param site:
             The site we are making requests out to.
         @param session:
             The `aiohttp.ClientSession` context our requests are made from.
         """
-        self.site = site
+        super().__init__(site)
         self.session = session
 
     async def download_usernames(self) -> List[str]:
@@ -43,10 +25,6 @@ class BaseScraper:
         """For each coach, download coach-specific data."""
         raise NotImplementedError()
 
-    async def export(self, username: str) -> Export:
-        """Transform coach-specific data into uniform format."""
-        raise NotImplementedError()
-
     async def request(self, url: str) -> Tuple[Union[str, None], int]:
         """Make network requests using the internal session.
@@ -61,7 +39,7 @@ class BaseScraper:
             return await response.text(), 200
         return None, response.status
 
-    async def scrape(self):
+    async def scrape(self) -> List[str]:
         """Main entrypoint for scraping and exporting downloaded content.
 
         A `Scraper` is structured to operate in the following stages:
@@ -77,43 +55,4 @@ class BaseScraper:
             os.makedirs(self.path_coach_dir(username), exist_ok=True)
             await self.download_profile(username)
-            export = await self.export(username)
-            with open(self.path_coach_file(username, "export.json"), "w") as f:
-                json.dump(export, f)
-            self.log(
-                [
-                    (AnsiColor.INFO, "[INFO]"),
-                    (None, ": Finished exporting "),
-                    (AnsiColor.DATA, username),
-                ]
-            )
-
-    def path_coaches_dir(self):
-        """The root directory for all coach-related downloads."""
-        return os.path.join("data", self.site, "coaches")
-
-    def path_coach_dir(self, username: str):
-        """The root directory for a specific coach's downloads."""
-        return os.path.join(self.path_coaches_dir(), username)
-
-    def path_coach_file(self, username: str, filename: str):
-        """Path to a coach-specific file download."""
-        return os.path.join(self.path_coach_dir(username), filename)
-
-    def path_pages_dir(self):
-        """The root directory for all username listing files."""
-        return os.path.join("data", self.site, "pages")
-
-    def path_page_file(self, page_no: int):
-        """The root directory for usernames scraped from a single page."""
-        return os.path.join(self.path_pages_dir(), f"{page_no}.txt")
-
-    def log(self, msgs: List[Tuple[Union[AnsiColor, None], str]]):
-        transformed = []
-        for k, v in msgs:
-            if k is None:
-                transformed.append(v)
-            else:
-                transformed.append(f"{k.value}{v}{AnsiColor.RESET.value}")
-        print("".join(transformed))
+
+        return usernames
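
With export() removed from BaseScraper, a concrete scraper only implements the two download hooks, and scrape() hands the collected usernames back to the caller. A skeletal sketch under that assumption (NoopScraper and its canned data are illustrative):

    import asyncio
    from typing import List

    import aiohttp

    from app.scraper import BaseScraper

    class NoopScraper(BaseScraper):
        """Hypothetical scraper returning canned data instead of hitting a site."""

        async def download_usernames(self) -> List[str]:
            return ["some-coach"]

        async def download_profile(self, username: str) -> None:
            pass  # a real scraper writes files under self.path_coach_dir(username)

    async def demo():
        async with aiohttp.ClientSession() as session:
            # scrape() also creates data/noop/coaches/some-coach/ as a side effect.
            print(await NoopScraper("noop", session).scrape())  # ['some-coach']

    asyncio.run(demo())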