Add class for wrapping around exports.
parent: bc2ffeae9d
commit: e050d13aa7
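This commit splits exporting out of the scrapers. Shared filesystem helpers (the data/{site}/ path layout and colored logging) move into a new `Repo` base class in app/repo.py, a new `BaseExporter` in app/exporter.py drives the uniform `Export` output, and chess.com and lichess each gain a site-specific `Exporter` subclass. `BaseScraper.scrape()` now only downloads and returns the usernames it visited; `run()` wires the two halves together. A minimal sketch of the resulting flow, mirroring the new `run()` below (the `ClientSession` construction is an assumption here, since the diff only shows the tail of that `async with` block):

import aiohttp
import json

from app.chesscom import (
    Exporter as ChesscomExporter,
    Scraper as ChesscomScraper,
)


async def demo(user_agent: str):
    # Assumed session setup; only `) as session:` is visible in the diff.
    async with aiohttp.ClientSession(
        headers={"User-Agent": user_agent}
    ) as session:
        scraper = ChesscomScraper(session)

        # `scrape()` now returns the coach usernames it downloaded ...
        usernames = await scraper.scrape()

        # ... and a per-site `Exporter` turns each coach's files on disk
        # into the uniform `Export` dict, aggregated into a single file.
        dump = {u: ChesscomExporter(u).export() for u in usernames}

        with open(scraper.path_site_file("export.json"), "w") as f:
            json.dump(dump, f, indent=2)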
@@ -1,16 +1,23 @@
 import aiohttp
 import argparse
 import asyncio
 import json
 
-from app.chesscom import Scraper as ChesscomScraper
-from app.lichess import Scraper as LichessScraper
-from app.scraper import Site
+from app.chesscom import (
+    Exporter as ChesscomExporter,
+    Scraper as ChesscomScraper,
+)
+from app.lichess import (
+    Exporter as LichessExporter,
+    Scraper as LichessScraper,
+)
+from app.repo import Site
 
 
 async def run():
     parser = argparse.ArgumentParser(
         prog="coach-scraper",
-        description="HTML scraping of chess.com coaches.",
+        description="Scraping/exporting of chess coaches.",
     )
     parser.add_argument("-u", "--user-agent", required=True)
     parser.add_argument(
@@ -29,10 +36,19 @@ async def run():
     ) as session:
         if args.site == Site.CHESSCOM.value:
             scraper = ChesscomScraper(session)
+            exporter_cls = ChesscomExporter
         elif args.site == Site.LICHESS.value:
             scraper = LichessScraper(session)
+            exporter_cls = LichessExporter
 
-        await scraper.scrape()
+        dump = {}
+
+        usernames = await scraper.scrape()
+        for username in usernames:
+            dump[username] = exporter_cls(username).export()
+
+        with open(scraper.path_site_file("export.json"), "w") as f:
+            json.dump(dump, f, indent=2)
 
 
 def main():

app/chesscom.py
@@ -4,9 +4,11 @@ import json
 import os
 import os.path
 
-from app.scraper import AnsiColor, BaseScraper, Export, Site
+from app.repo import AnsiColor, Site
+from app.exporter import BaseExporter
+from app.scraper import BaseScraper
 from bs4 import BeautifulSoup
-from typing import List
+from typing import List, Union
 
 
 # The number of coach listing pages we will at most iterate through. This number
@@ -169,22 +171,19 @@ class Scraper(BaseScraper):
 
         return True
 
-    def _load_stats_json(self, stats: dict) -> Export:
-        """Extract relevant fields from a `stats.json` file."""
-        export: Export = {}
-        for stat in stats.get("stats", []):
-            if stat["key"] == "rapid":
-                export["fide_rapid"] = stat["stats"]["rating"]
-        return export
 
-    async def export(self, username: str) -> Export:
-        """Transform coach-specific data into uniform format."""
-        export: Export = {}
+
+class Exporter(BaseExporter):
+    def __init__(self, username: str):
+        super().__init__(site=Site.CHESSCOM.value, username=username)
 
+        self.stats_json = {}
         try:
             with open(self.path_coach_file(username, "stats.json"), "r") as f:
-                export.update(self._load_stats_json(json.load(f)))
+                for s in json.load(f).get("stats", []):
+                    if "key" in s and "stats" in s:
+                        self.stats_json[s["key"]] = s["stats"]
         except FileNotFoundError:
             pass
 
-        return export
+    def export_fide_rapid(self) -> Union[int, None]:
+        return self.stats_json.get("rapid", {}).get("rating")
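The shape `Exporter.__init__` expects from a downloaded stats.json can be read off the parsing above; a minimal fixture with invented values:

# Invented fixture: the real file comes from chess.com and has more entries.
stats = {
    "stats": [
        {"key": "rapid", "stats": {"rating": 1600}},
        {"key": "bullet", "stats": {"rating": 1800}},
    ]
}

# __init__ reshapes this into {"rapid": {"rating": 1600}, ...}, so
# export_fide_rapid() would return 1600 for this fixture.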

app/exporter.py
@@ -0,0 +1,34 @@
+from app.repo import AnsiColor, Repo
+from typing import Union
+from typing_extensions import TypedDict
+
+
+class Export(TypedDict, total=False):
+    fide_rapid: int
+
+
+class BaseExporter(Repo):
+    def __init__(self, site: str, username: str):
+        super().__init__(site)
+        self.username = username
+
+    def export_fide_rapid(self) -> Union[int, None]:
+        raise NotImplementedError()
+
+    def export(self) -> Export:
+        """Transform coach-specific data into uniform format."""
+        export: Export = {}
+
+        fide_rapid = self.export_fide_rapid()
+        if fide_rapid:
+            export["fide_rapid"] = fide_rapid
+
+        self.log(
+            [
+                (AnsiColor.INFO, "[INFO]"),
+                (None, ": Exported "),
+                (AnsiColor.DATA, self.username),
+            ]
+        )
+
+        return export
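Supporting a new site then means overriding only the export_fide_* hook; path handling, logging, and the export() driver are inherited from `BaseExporter`/`Repo`. A hypothetical subclass (the "example" site key and profile.json layout are made up for illustration, not part of this commit):

import json

from typing import Union

from app.exporter import BaseExporter


class ExampleExporter(BaseExporter):
    def __init__(self, username: str):
        # "example" is an invented site key, not one of the Site enum values.
        super().__init__(site="example", username=username)

    def export_fide_rapid(self) -> Union[int, None]:
        try:
            # path_coach_file() resolves under data/example/coaches/<username>/.
            with open(self.path_coach_file(self.username, "profile.json"), "r") as f:
                return json.load(f).get("fide_rapid")
        except FileNotFoundError:
            return None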

app/lichess.py
@@ -3,7 +3,9 @@ import asyncio
 import os
 import os.path
 
-from app.scraper import AnsiColor, BaseScraper, Export, Site
+from app.repo import AnsiColor, Site
+from app.scraper import BaseScraper
+from app.exporter import BaseExporter
 from bs4 import BeautifulSoup
 from typing import List
 
@@ -111,7 +113,14 @@ class Scraper(BaseScraper):
         """
         filepath = self.path_coach_file(username, f"{username}.html")
         if os.path.isfile(filepath):
-            return False
+            self.log(
+                [
+                    (AnsiColor.INFO, "[INFO]"),
+                    (None, ": Skipping download for coach "),
+                    (AnsiColor.DATA, username),
+                ]
+            )
+            return
 
         response, _unused_status = await self.request(
             url=f"https://lichess.org/coach/{username}"
@@ -120,9 +129,18 @@ class Scraper(BaseScraper):
         with open(filepath, "w") as f:
             f.write(response)
 
-        return True
+        self.log(
+            [
+                (AnsiColor.INFO, "[INFO]"),
+                (None, ": Downloaded data for coach "),
+                (AnsiColor.DATA, username),
+            ]
+        )
 
-    async def export(self, username: str) -> Export:
-        """Transform coach-specific data into uniform format."""
-        export: Export = {}
-        return export
+
+class Exporter(BaseExporter):
+    def __init__(self, username: str):
+        super().__init__(site=Site.LICHESS.value, username=username)
+
+    def export_fide_rapid(self):
+        return None

app/repo.py
@@ -0,0 +1,61 @@
+import enum
+import os
+
+from typing import List, Tuple, Union
+
+
+class AnsiColor(enum.Enum):
+    ERROR = "\033[0;31m"
+    INFO = "\033[0;34m"
+    DATA = "\033[0;36m"
+    RESET = "\033[0m"
+
+
+class Site(enum.Enum):
+    CHESSCOM = "chesscom"
+    LICHESS = "lichess"
+
+
+class Repo:
+    """Shared filesystem-related functionality."""
+
+    def __init__(self, site: str):
+        self.site = site
+
+    def path_site_dir(self):
+        """The root directory for all site-related files."""
+        return os.path.join("data", self.site)
+
+    def path_site_file(self, filename: str):
+        """Path to a top-level site-related file."""
+        return os.path.join(self.path_site_dir(), filename)
+
+    def path_coaches_dir(self):
+        """The root directory for all coach-related downloads."""
+        return os.path.join(self.path_site_dir(), "coaches")
+
+    def path_coach_dir(self, username: str):
+        """The root directory for a specific coach's downloads."""
+        return os.path.join(self.path_coaches_dir(), username)
+
+    def path_coach_file(self, username: str, filename: str):
+        """Path to a coach-specific file download."""
+        return os.path.join(self.path_coach_dir(username), filename)
+
+    def path_pages_dir(self):
+        """The root directory for all username listing files."""
+        return os.path.join(self.path_site_dir(), "pages")
+
+    def path_page_file(self, page_no: int):
+        """The root directory for usernames scraped from a single page."""
+        return os.path.join(self.path_pages_dir(), f"{page_no}.txt")
+
+    def log(self, msgs: List[Tuple[Union[AnsiColor, None], str]]):
+        transformed = []
+        for k, v in msgs:
+            if k is None:
+                transformed.append(v)
+            else:
+                transformed.append(f"{k.value}{v}{AnsiColor.RESET.value}")
+
+        print("".join(transformed))
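Every path in `Repo` derives from the one data/{site} root, so the helpers compose predictably. A quick sanity check that follows from the `os.path.join` calls above (coach name invented; equalities as written assume a POSIX path separator):

from app.repo import Repo

repo = Repo("chesscom")

assert repo.path_site_file("export.json") == "data/chesscom/export.json"
assert repo.path_coach_file("some-coach", "stats.json") == (
    "data/chesscom/coaches/some-coach/stats.json"
)
assert repo.path_page_file(1) == "data/chesscom/pages/1.txt"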

app/scraper.py
@@ -1,38 +1,20 @@
 import aiohttp
-import enum
-import json
 import os
 
+from app.repo import Repo
 from typing import List, Tuple, Union
-from typing_extensions import TypedDict
-
-
-class Site(enum.Enum):
-    CHESSCOM = "chesscom"
-    LICHESS = "lichess"
-
-
-class AnsiColor(enum.Enum):
-    ERROR = "\033[0;31m"
-    INFO = "\033[0;34m"
-    DATA = "\033[0;36m"
-    RESET = "\033[0m"
-
-
-class Export(TypedDict, total=False):
-    fide_rapid: Union[int, None]
 
 
-class BaseScraper:
+class BaseScraper(Repo):
     def __init__(self, site: str, session: aiohttp.ClientSession):
-        """Initialize a new web scraper and exporter.
+        """Initialize a new web scraper.
 
         @param site:
             The site we are making requests out to.
         @param session:
             The `aiohttp.ClientSession` context our requests are made from.
         """
-        self.site = site
+        super().__init__(site)
         self.session = session
 
     async def download_usernames(self) -> List[str]:
@@ -43,10 +25,6 @@ class BaseScraper:
         """For each coach, download coach-specific data."""
         raise NotImplementedError()
 
-    async def export(self, username: str) -> Export:
-        """Transform coach-specific data into uniform format."""
-        raise NotImplementedError()
-
     async def request(self, url: str) -> Tuple[Union[str, None], int]:
         """Make network requests using the internal session.
 
@@ -61,7 +39,7 @@ class BaseScraper:
                 return await response.text(), 200
             return None, response.status
 
-    async def scrape(self):
+    async def scrape(self) -> List[str]:
         """Main entrypoint for scraping and exporting downloaded content.
 
         A `Scraper` is structured to operate in the following stages:
@@ -77,43 +55,4 @@ class BaseScraper:
             os.makedirs(self.path_coach_dir(username), exist_ok=True)
             await self.download_profile(username)
 
-            export = await self.export(username)
-            with open(self.path_coach_file(username, "export.json"), "w") as f:
-                json.dump(export, f)
-            self.log(
-                [
-                    (AnsiColor.INFO, "[INFO]"),
-                    (None, ": Finished exporting "),
-                    (AnsiColor.DATA, username),
-                ]
-            )
-
-    def path_coaches_dir(self):
-        """The root directory for all coach-related downloads."""
-        return os.path.join("data", self.site, "coaches")
-
-    def path_coach_dir(self, username: str):
-        """The root directory for a specific coach's downloads."""
-        return os.path.join(self.path_coaches_dir(), username)
-
-    def path_coach_file(self, username: str, filename: str):
-        """Path to a coach-specific file download."""
-        return os.path.join(self.path_coach_dir(username), filename)
-
-    def path_pages_dir(self):
-        """The root directory for all username listing files."""
-        return os.path.join("data", self.site, "pages")
-
-    def path_page_file(self, page_no: int):
-        """The root directory for usernames scraped from a single page."""
-        return os.path.join(self.path_pages_dir(), f"{page_no}.txt")
-
-    def log(self, msgs: List[Tuple[Union[AnsiColor, None], str]]):
-        transformed = []
-        for k, v in msgs:
-            if k is None:
-                transformed.append(v)
-            else:
-                transformed.append(f"{k.value}{v}{AnsiColor.RESET.value}")
-
-        print("".join(transformed))
+        return usernames