Add class for wrapping around exports.

parent bc2ffeae9d
commit e050d13aa7

--- a/main.py
+++ b/main.py
@@ -1,16 +1,23 @@
 import aiohttp
 import argparse
 import asyncio
+import json
 
-from app.chesscom import Scraper as ChesscomScraper
-from app.lichess import Scraper as LichessScraper
-from app.scraper import Site
+from app.chesscom import (
+    Exporter as ChesscomExporter,
+    Scraper as ChesscomScraper,
+)
+from app.lichess import (
+    Exporter as LichessExporter,
+    Scraper as LichessScraper,
+)
+from app.repo import Site
 
 
 async def run():
     parser = argparse.ArgumentParser(
         prog="coach-scraper",
-        description="HTML scraping of chess.com coaches.",
+        description="Scraping/exporting of chess coaches.",
     )
     parser.add_argument("-u", "--user-agent", required=True)
     parser.add_argument(
@@ -29,10 +36,19 @@ async def run():
     ) as session:
         if args.site == Site.CHESSCOM.value:
             scraper = ChesscomScraper(session)
+            exporter_cls = ChesscomExporter
         elif args.site == Site.LICHESS.value:
             scraper = LichessScraper(session)
+            exporter_cls = LichessExporter
 
-        await scraper.scrape()
+        dump = {}
+
+        usernames = await scraper.scrape()
+        for username in usernames:
+            dump[username] = exporter_cls(username).export()
+
+        with open(scraper.path_site_file("export.json"), "w") as f:
+            json.dump(dump, f, indent=2)
 
 
 def main():
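
The driver now funnels every scraped username through the site's `Exporter` and writes one site-wide `export.json` instead of per-coach files. A minimal sketch of the resulting dump shape (the username and rating are invented placeholders):

    import json

    # Shape of the site-wide dump written by run(); "some_coach" and the
    # rating are placeholders for illustration.
    dump = {"some_coach": {"fide_rapid": 2450}}
    print(json.dumps(dump, indent=2))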

--- a/app/chesscom.py
+++ b/app/chesscom.py
@@ -4,9 +4,11 @@ import json
 import os
 import os.path
 
-from app.scraper import AnsiColor, BaseScraper, Export, Site
+from app.repo import AnsiColor, Site
+from app.exporter import BaseExporter
+from app.scraper import BaseScraper
 from bs4 import BeautifulSoup
-from typing import List
+from typing import List, Union
 
 
 # The number of coach listing pages we will at most iterate through. This number
@@ -169,22 +171,19 @@ class Scraper(BaseScraper):
 
         return True
 
-    def _load_stats_json(self, stats: dict) -> Export:
-        """Extract relevant fields from a `stats.json` file."""
-        export: Export = {}
-        for stat in stats.get("stats", []):
-            if stat["key"] == "rapid":
-                export["fide_rapid"] = stat["stats"]["rating"]
-        return export
 
-    async def export(self, username: str) -> Export:
-        """Transform coach-specific data into uniform format."""
-        export: Export = {}
+class Exporter(BaseExporter):
+    def __init__(self, username: str):
+        super().__init__(site=Site.CHESSCOM.value, username=username)
 
+        self.stats_json = {}
         try:
             with open(self.path_coach_file(username, "stats.json"), "r") as f:
-                export.update(self._load_stats_json(json.load(f)))
+                for s in json.load(f).get("stats", []):
+                    if "key" in s and "stats" in s:
+                        self.stats_json[s["key"]] = s["stats"]
         except FileNotFoundError:
             pass
 
-        return export
+    def export_fide_rapid(self) -> Union[int, None]:
+        return self.stats_json.get("rapid", {}).get("rating")
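
The chess.com exporter now parses `stats.json` once at construction and serves lookups from the cached `stats_json` dict. A minimal usage sketch (the username is a placeholder):

    from app.chesscom import Exporter

    # Reads data/chesscom/coaches/some_coach/stats.json if it exists.
    exporter = Exporter("some_coach")
    # The rating stored under the "rapid" key, or None when absent.
    print(exporter.export_fide_rapid())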

--- /dev/null
+++ b/app/exporter.py
@@ -0,0 +1,34 @@
+from app.repo import AnsiColor, Repo
+from typing import Union
+from typing_extensions import TypedDict
+
+
+class Export(TypedDict, total=False):
+    fide_rapid: int
+
+
+class BaseExporter(Repo):
+    def __init__(self, site: str, username: str):
+        super().__init__(site)
+        self.username = username
+
+    def export_fide_rapid(self) -> Union[int, None]:
+        raise NotImplementedError()
+
+    def export(self) -> Export:
+        """Transform coach-specific data into uniform format."""
+        export: Export = {}
+
+        fide_rapid = self.export_fide_rapid()
+        if fide_rapid:
+            export["fide_rapid"] = fide_rapid
+
+        self.log(
+            [
+                (AnsiColor.INFO, "[INFO]"),
+                (None, ": Exported "),
+                (AnsiColor.DATA, self.username),
+            ]
+        )
+
+        return export
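
`BaseExporter` is a small template-method class: `export()` drives the shared flow (collect fields, log, return) while subclasses override the `export_*` hooks. Note the truthiness check means a `None` (or zero) rating is simply omitted from the result. A hypothetical subclass for a third site (the "example" site value is invented for illustration):

    from typing import Union

    from app.exporter import BaseExporter

    class ExampleExporter(BaseExporter):
        def __init__(self, username: str):
            super().__init__(site="example", username=username)

        def export_fide_rapid(self) -> Union[int, None]:
            # Return the coach's FIDE rapid rating, or None when unknown.
            return None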

--- a/app/lichess.py
+++ b/app/lichess.py
@@ -3,7 +3,9 @@ import asyncio
 import os
 import os.path
 
-from app.scraper import AnsiColor, BaseScraper, Export, Site
+from app.repo import AnsiColor, Site
+from app.scraper import BaseScraper
+from app.exporter import BaseExporter
 from bs4 import BeautifulSoup
 from typing import List
 
@@ -111,7 +113,14 @@ class Scraper(BaseScraper):
         """
         filepath = self.path_coach_file(username, f"{username}.html")
         if os.path.isfile(filepath):
-            return False
+            self.log(
+                [
+                    (AnsiColor.INFO, "[INFO]"),
+                    (None, ": Skipping download for coach "),
+                    (AnsiColor.DATA, username),
+                ]
+            )
+            return
 
         response, _unused_status = await self.request(
             url=f"https://lichess.org/coach/{username}"
@@ -120,9 +129,18 @@ class Scraper(BaseScraper):
         with open(filepath, "w") as f:
             f.write(response)
 
-        return True
+        self.log(
+            [
+                (AnsiColor.INFO, "[INFO]"),
+                (None, ": Downloaded data for coach "),
+                (AnsiColor.DATA, username),
+            ]
+        )
+
 
-    async def export(self, username: str) -> Export:
-        """Transform coach-specific data into uniform format."""
-        export: Export = {}
-        return export
+class Exporter(BaseExporter):
+    def __init__(self, username: str):
+        super().__init__(site=Site.LICHESS.value, username=username)
+
+    def export_fide_rapid(self):
+        return None
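
The lichess exporter does not extract a rating yet: `export_fide_rapid()` returns `None`, so given the truthiness check in `BaseExporter.export()`, a lichess coach currently exports as an empty object (the username is a placeholder):

    from app.lichess import Exporter

    # Logs "[INFO]: Exported some_coach" and returns {}; no fide_rapid
    # field is emitted for lichess coaches yet.
    print(Exporter("some_coach").export())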

--- /dev/null
+++ b/app/repo.py
@@ -0,0 +1,61 @@
+import enum
+import os
+
+from typing import List, Tuple, Union
+
+
+class AnsiColor(enum.Enum):
+    ERROR = "\033[0;31m"
+    INFO = "\033[0;34m"
+    DATA = "\033[0;36m"
+    RESET = "\033[0m"
+
+
+class Site(enum.Enum):
+    CHESSCOM = "chesscom"
+    LICHESS = "lichess"
+
+
+class Repo:
+    """Shared filesystem-related functionality."""
+
+    def __init__(self, site: str):
+        self.site = site
+
+    def path_site_dir(self):
+        """The root directory for all site-related files."""
+        return os.path.join("data", self.site)
+
+    def path_site_file(self, filename: str):
+        """Path to a top-level site-related file."""
+        return os.path.join(self.path_site_dir(), filename)
+
+    def path_coaches_dir(self):
+        """The root directory for all coach-related downloads."""
+        return os.path.join(self.path_site_dir(), "coaches")
+
+    def path_coach_dir(self, username: str):
+        """The root directory for a specific coach's downloads."""
+        return os.path.join(self.path_coaches_dir(), username)
+
+    def path_coach_file(self, username: str, filename: str):
+        """Path to a coach-specific file download."""
+        return os.path.join(self.path_coach_dir(username), filename)
+
+    def path_pages_dir(self):
+        """The root directory for all username listing files."""
+        return os.path.join(self.path_site_dir(), "pages")
+
+    def path_page_file(self, page_no: int):
+        """The root directory for usernames scraped from a single page."""
+        return os.path.join(self.path_pages_dir(), f"{page_no}.txt")
+
+    def log(self, msgs: List[Tuple[Union[AnsiColor, None], str]]):
+        transformed = []
+        for k, v in msgs:
+            if k is None:
+                transformed.append(v)
+            else:
+                transformed.append(f"{k.value}{v}{AnsiColor.RESET.value}")
+
+        print("".join(transformed))
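
`Repo` centralizes the on-disk layout so scrapers and exporters resolve the same paths. Illustratively (usernames and filenames are placeholders):

    from app.repo import Repo, Site

    repo = Repo(Site.CHESSCOM.value)
    print(repo.path_site_file("export.json"))                # data/chesscom/export.json
    print(repo.path_coach_file("some_coach", "stats.json"))  # data/chesscom/coaches/some_coach/stats.json
    print(repo.path_page_file(1))                            # data/chesscom/pages/1.txt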

--- a/app/scraper.py
+++ b/app/scraper.py
@@ -1,38 +1,20 @@
 import aiohttp
-import enum
-import json
 import os
 
+from app.repo import Repo
 from typing import List, Tuple, Union
-from typing_extensions import TypedDict
-
-
-class Site(enum.Enum):
-    CHESSCOM = "chesscom"
-    LICHESS = "lichess"
-
-
-class AnsiColor(enum.Enum):
-    ERROR = "\033[0;31m"
-    INFO = "\033[0;34m"
-    DATA = "\033[0;36m"
-    RESET = "\033[0m"
-
-
-class Export(TypedDict, total=False):
-    fide_rapid: Union[int, None]
 
 
-class BaseScraper:
+class BaseScraper(Repo):
     def __init__(self, site: str, session: aiohttp.ClientSession):
-        """Initialize a new web scraper and exporter.
+        """Initialize a new web scraper.
 
         @param site:
             The site we are making requests out to.
         @param session:
             The `aiohttp.ClientSession` context our requests are made from.
         """
-        self.site = site
+        super().__init__(site)
         self.session = session
 
     async def download_usernames(self) -> List[str]:
@@ -43,10 +25,6 @@ class BaseScraper:
         """For each coach, download coach-specific data."""
         raise NotImplementedError()
 
-    async def export(self, username: str) -> Export:
-        """Transform coach-specific data into uniform format."""
-        raise NotImplementedError()
-
     async def request(self, url: str) -> Tuple[Union[str, None], int]:
         """Make network requests using the internal session.
 
@@ -61,7 +39,7 @@ class BaseScraper:
                 return await response.text(), 200
             return None, response.status
 
-    async def scrape(self):
+    async def scrape(self) -> List[str]:
         """Main entrypoint for scraping and exporting downloaded content.
 
         A `Scraper` is structured to operate in the following stages:
@@ -77,43 +55,4 @@ class BaseScraper:
             os.makedirs(self.path_coach_dir(username), exist_ok=True)
             await self.download_profile(username)
 
-            export = await self.export(username)
-            with open(self.path_coach_file(username, "export.json"), "w") as f:
-                json.dump(export, f)
-            self.log(
-                [
-                    (AnsiColor.INFO, "[INFO]"),
-                    (None, ": Finished exporting "),
-                    (AnsiColor.DATA, username),
-                ]
-            )
-
-    def path_coaches_dir(self):
-        """The root directory for all coach-related downloads."""
-        return os.path.join("data", self.site, "coaches")
-
-    def path_coach_dir(self, username: str):
-        """The root directory for a specific coach's downloads."""
-        return os.path.join(self.path_coaches_dir(), username)
-
-    def path_coach_file(self, username: str, filename: str):
-        """Path to a coach-specific file download."""
-        return os.path.join(self.path_coach_dir(username), filename)
-
-    def path_pages_dir(self):
-        """The root directory for all username listing files."""
-        return os.path.join("data", self.site, "pages")
-
-    def path_page_file(self, page_no: int):
-        """The root directory for usernames scraped from a single page."""
-        return os.path.join(self.path_pages_dir(), f"{page_no}.txt")
-
-    def log(self, msgs: List[Tuple[Union[AnsiColor, None], str]]):
-        transformed = []
-        for k, v in msgs:
-            if k is None:
-                transformed.append(v)
-            else:
-                transformed.append(f"{k.value}{v}{AnsiColor.RESET.value}")
-
-        print("".join(transformed))
+
+        return usernames
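
With export handling removed from `BaseScraper`, `scrape()` is download-only and hands the username list back to the caller, which pairs it with an `Exporter`. Roughly, the driver-side contract becomes (a sketch; `run_pipeline` is a made-up helper, not part of the commit):

    async def run_pipeline(scraper, exporter_cls):
        # Download phase: listing pages plus per-coach profiles.
        usernames = await scraper.scrape()
        # Export phase: pure filesystem reads, no network access.
        return {u: exporter_cls(u).export() for u in usernames}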