coach-scraper/app/chesscom.py

import asyncio
import json
import os
import os.path
from typing import List, Union

import aiohttp
from bs4 import BeautifulSoup

from app.exporter import BaseExporter
from app.repo import AnsiColor, Site
from app.scraper import BaseScraper

# The number of coach listing pages we will at most iterate through. This
# number was determined by going to chess.com/coaches?sortBy=alphabetical&page=1
# and traversing to the last page.
MAX_PAGES = 64

# How long to wait between a batch of network requests.
SLEEP_SECS = 3
class Scraper(BaseScraper):
    def __init__(self, session: aiohttp.ClientSession):
        super().__init__(site=Site.CHESSCOM.value, session=session)

    async def download_usernames(self) -> List[str]:
        """Scan through chess.com/coaches for all coaches' usernames.

        @return
            The complete list of scraped usernames across every coach listing
            page.
        """
        usernames = []
        for page_no in range(1, MAX_PAGES + 1):
            filepath = self.path_page_file(page_no)
            try:
                # A previously downloaded page acts as a cache; reuse it
                # instead of hitting the network again.
                with open(filepath, "r") as f:
                    self.log(
                        [
                            (AnsiColor.INFO, "[INFO]"),
                            (None, ": Reading file "),
                            (AnsiColor.DATA, filepath),
                        ]
                    )
                    usernames.extend([line.strip() for line in f.readlines()])
            except FileNotFoundError:
                page_usernames = await self._scrape_page(page_no)
                if not page_usernames:
                    self.log(
                        [
                            (AnsiColor.ERROR, "[ERROR]"),
                            (None, ": Could not scrape page "),
                            (AnsiColor.DATA, str(page_no)),
                        ]
                    )
                    continue
                with open(filepath, "w") as f:
                    for username in page_usernames:
                        f.write(f"{username}\n")
                usernames.extend(page_usernames)
                self.log(
                    [
                        (AnsiColor.INFO, "[INFO]"),
                        (None, ": Downloaded page "),
                        (AnsiColor.DATA, filepath),
                    ]
                )
                # Only sleep when we actually made a network request.
                await asyncio.sleep(SLEEP_SECS)

        return usernames
    async def _scrape_page(self, page_no: int) -> List[str]:
        """Scan through chess.com/coaches/?page=<n> for all coaches' usernames.

        @param page_no
            The page consisting of at most 25 coaches (at the time of writing)
            whose usernames are to be scraped.
        @return
            The list of scraped usernames on the specified coach listing page.
        """
        url = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
        response, status_code = await self.request(url)
        if response is None:
            self.log(
                [
                    (AnsiColor.ERROR, "[ERROR]"),
                    (None, ": Received status "),
                    (AnsiColor.DATA, f"{status_code} "),
                    (None, "when downloading page "),
                    (AnsiColor.DATA, str(page_no)),
                ]
            )
            return []

        usernames = []
        soup = BeautifulSoup(response, "lxml")
        members = soup.find_all("a", class_="members-categories-username")
        for member in members:
            href = member.get("href")
            # Usernames are embedded in profile links; strip the URL prefix.
            username = href[len("https://www.chess.com/member/") :]
            usernames.append(username)

        return usernames
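
    # For reference, a sketch of the listing markup that _scrape_page targets.
    # The shape is inferred from the selector and prefix-stripping above, not
    # from a captured page, so treat it as an assumption:
    #
    #   <a class="members-categories-username"
    #      href="https://www.chess.com/member/<username>">...</a>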
    async def download_profile(self, username: str):
        """For each coach, download coach-specific data.

        This sends three parallel requests for:
        * the coach's profile,
        * the coach's recent activity,
        * the coach's stats.

        @param username
            The coach username corresponding to the downloaded files.
        """
        used_network = await asyncio.gather(
            self._download_profile_file(
                url=f"https://www.chess.com/member/{username}",
                username=username,
                filename=self.path_coach_file(username, f"{username}.html"),
            ),
            self._download_profile_file(
                url=f"https://www.chess.com/callback/member/activity/{username}?page=1",
                username=username,
                filename=self.path_coach_file(username, "activity.json"),
            ),
            self._download_profile_file(
                url=f"https://www.chess.com/callback/member/stats/{username}",
                username=username,
                filename=self.path_coach_file(username, "stats.json"),
            ),
        )
        # `used_network` holds one boolean per request; throttle only if at
        # least one of them actually hit the network.
        if any(used_network):
            self.log(
                [
                    (AnsiColor.INFO, "[INFO]"),
                    (None, ": Downloaded data for coach "),
                    (AnsiColor.DATA, username),
                ]
            )
            await asyncio.sleep(SLEEP_SECS)
        else:
            self.log(
                [
                    (AnsiColor.INFO, "[INFO]"),
                    (None, ": Skipping download for coach "),
                    (AnsiColor.DATA, username),
                ]
            )
    async def _download_profile_file(self, url: str, username: str, filename: str):
        """Writes the contents of url into the specified file.

        @param url
            The URL of the file to download.
        @param username
            The coach username corresponding to the downloaded file.
        @param filename
            The output file to write the downloaded content to.
        @return
            True if we make a network request. False otherwise.
        """
        if os.path.isfile(filename):
            # Already downloaded on a previous run; skip the request.
            return False

        response, _unused_status = await self.request(url)
        if response is not None:
            with open(filename, "w") as f:
                f.write(response)

        return True

class Exporter(BaseExporter):
    def __init__(self, username: str):
        super().__init__(site=Site.CHESSCOM.value, username=username)

        # Index each stats entry by its key (e.g. "rapid") so the export_*
        # methods below can look ratings up directly.
        self.stats_json = {}
        try:
            with open(self.path_coach_file(username, "stats.json"), "r") as f:
                for s in json.load(f).get("stats", []):
                    if "key" in s and "stats" in s:
                        self.stats_json[s["key"]] = s["stats"]
        except FileNotFoundError:
            pass
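
    # A sketch of the "stats.json" payload that __init__ expects, inferred
    # from the parsing above rather than from any documented schema; the field
    # values are illustrative only:
    #
    #   {
    #       "stats": [
    #           {"key": "rapid", "stats": {"rating": 1500, ...}},
    #           {"key": "lightning", "stats": {"rating": 1450, ...}}
    #       ]
    #   }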
    def export_rapid(self) -> Union[int, None]:
        return self.stats_json.get("rapid", {}).get("rating")

    def export_blitz(self) -> Union[int, None]:
        # Note: the stats payload appears to use "lightning" as the key for
        # blitz ratings.
        return self.stats_json.get("lightning", {}).get("rating")

    def export_bullet(self) -> Union[int, None]:
        return self.stats_json.get("bullet", {}).get("rating")