229 lines
7.9 KiB
Python
229 lines
7.9 KiB
Python
import asyncio
|
|
import json
|
|
import os
|
|
import os.path
|
|
from typing import List, Union
|
|
|
|
import aiohttp
|
|
from bs4 import BeautifulSoup, SoupStrainer
|
|
|
|
from app.exporter import BaseExporter
|
|
from app.repo import AnsiColor, Site
|
|
from app.scraper import BaseScraper
|
|
|
|
# The number of coach listing pages we will at most iterate through. This number
# was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and
# traversing to the last page. Pages are numbered from 1, so the scraper walks
# pages 1 through MAX_PAGES inclusive.
MAX_PAGES = 64

# How long to wait, in seconds, between a batch of network requests.
SLEEP_SECS = 3
|
|
|
|
|
|
class Scraper(BaseScraper):
    """Scraper for the coach listing and per-coach data on chess.com.

    Listing pages and per-coach files are cached on disk: anything already
    saved is reused instead of being requested again.
    """

    def __init__(self, session: aiohttp.ClientSession):
        """Bind the scraper to the chess.com site.

        @param session
            The aiohttp session forwarded to the base scraper.
        """
        super().__init__(site=Site.CHESSCOM.value, session=session)

    async def download_usernames(self) -> List[str]:
        """Scan through chess.com/coaches for all coaches' usernames.

        A page whose file already exists is read back from disk. Otherwise
        the page is scraped, written to disk one username per line, and
        followed by a pause of SLEEP_SECS seconds. A page that yields no
        usernames is logged as an error and skipped without being written.

        @return
            The complete list of scraped usernames across every coach listing
            page.
        """
        usernames = []
        for page_no in range(1, MAX_PAGES + 1):
            filepath = self.path_page_file(page_no)
            try:
                with open(filepath, "r") as f:
                    self.log(
                        [
                            (AnsiColor.INFO, "[INFO]"),
                            (None, ": Reading file "),
                            (AnsiColor.DATA, filepath),
                        ]
                    )
                    # Drop blank lines so a stray empty line in a cached page
                    # never turns into an empty username.
                    stripped = (line.strip() for line in f)
                    usernames.extend(name for name in stripped if name)
            except FileNotFoundError:
                page_usernames = await self._scrape_page(page_no)
                if not page_usernames:
                    self.log(
                        [
                            (AnsiColor.ERROR, "[ERROR]"),
                            (None, ": Could not scrape page "),
                            (AnsiColor.DATA, str(page_no)),
                        ]
                    )
                    continue
                with open(filepath, "w") as f:
                    for username in page_usernames:
                        f.write(f"{username}\n")
                usernames.extend(page_usernames)
                self.log(
                    [
                        (AnsiColor.INFO, "[INFO]"),
                        (None, ": Downloaded page "),
                        (AnsiColor.DATA, filepath),
                    ]
                )
                await asyncio.sleep(SLEEP_SECS)

        return usernames

    async def _scrape_page(self, page_no: int) -> List[str]:
        """Scan through chess.com/coaches/?page=<n> for all coaches' usernames.

        @param page_no
            The page consisting of at most 25 coaches (at the time of writing)
            whose usernames are to be scraped.
        @return
            The list of scraped usernames on the specified coach listing page.
            The list is empty when the request fails or when no member link
            is found on the page.
        """
        url = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
        response, status_code = await self.request(url)
        if response is None:
            self.log(
                [
                    (AnsiColor.ERROR, "[ERROR]"),
                    (None, ": Received status "),
                    (AnsiColor.DATA, f"{status_code} "),
                    (None, "when downloading page "),
                    (AnsiColor.DATA, str(page_no)),
                ]
            )
            # Always hand back a list, as the annotation promises; the caller
            # treats an empty result as a failed page.
            return []

        member_prefix = "https://www.chess.com/member/"
        usernames = []
        soup = BeautifulSoup(response, "lxml")
        members = soup.find_all("a", class_="members-categories-username")
        for member in members:
            href = member.get("href")
            # An anchor without an href, or one that does not point at a
            # member profile, cannot be sliced into a username.
            if not href or not href.startswith(member_prefix):
                continue
            username = href[len(member_prefix) :]
            if username:
                usernames.append(username)

        return usernames

    async def download_profile(self, username: str):
        """For each coach, download coach-specific data.

        This sends three parallel requests for:
          * the coach's profile,
          * the coach's recent activity,
          * the coach's stats.

        If at least one of the three files was not already on disk, the
        download is logged and followed by a pause of SLEEP_SECS seconds;
        otherwise the coach is logged as skipped.

        @param username
            The coach username corresponding to the downloaded files.
        """
        used_network = await asyncio.gather(
            self._download_profile_file(
                url=f"https://www.chess.com/member/{username}",
                username=username,
                filename=self.path_coach_file(username, f"{username}.html"),
            ),
            self._download_profile_file(
                url=f"https://www.chess.com/callback/member/activity/{username}?page=1",
                username=username,
                filename=self.path_coach_file(username, "activity.json"),
            ),
            self._download_profile_file(
                url=f"https://www.chess.com/callback/member/stats/{username}",
                username=username,
                filename=self.path_coach_file(username, "stats.json"),
            ),
        )
        if any(used_network):
            self.log(
                [
                    (AnsiColor.INFO, "[INFO]"),
                    (None, ": Downloaded data for coach "),
                    (AnsiColor.DATA, username),
                ]
            )
            # Only pause when the network was actually touched.
            await asyncio.sleep(SLEEP_SECS)
        else:
            self.log(
                [
                    (AnsiColor.INFO, "[INFO]"),
                    (None, ": Skipping download for coach "),
                    (AnsiColor.DATA, username),
                ]
            )

    async def _download_profile_file(self, url: str, username: str, filename: str):
        """Writes the contents of url into the specified file.

        Nothing is requested when the file already exists. When the request
        yields no response, no file is written.

        @param url
            The URL of the file to download.
        @param username
            The coach username corresponding to the downloaded file.
        @param filename
            The output file to write the downloaded content to.
        @return
            True if we make a network request. False otherwise.
        """
        if os.path.isfile(filename):
            return False

        response, _unused_status = await self.request(url)
        if response is not None:
            with open(filename, "w") as f:
                f.write(response)

        # A request was attempted even if it produced no response.
        return True
|
|
|
|
|
|
def _profile_filter(elem, attrs):
|
|
"""Includes only relevant segments of the `{username}.html` file."""
|
|
if "profile-header-info" in attrs.get("class", ""):
|
|
return True
|
|
if "profile-card-info" in attrs.get("class", ""):
|
|
return True
|
|
|
|
|
|
class Exporter(BaseExporter):
    """Exporter that reads a coach's saved chess.com files.

    Loading is best-effort: a missing profile page leaves `profile_soup` as
    None, and a missing or unparsable stats file leaves `stats_json` empty.
    The export methods then return None.
    """

    def __init__(self, username: str):
        """Load the saved profile page and stats of a coach.

        @param username
            The coach username whose saved files are read.
        """
        super().__init__(site=Site.CHESSCOM.value, username=username)

        # Parsed profile page, restricted to the elements accepted by
        # _profile_filter; stays None when the page was never saved.
        self.profile_soup = None
        try:
            with open(self.path_coach_file(username, f"{username}.html"), "r") as f:
                self.profile_soup = BeautifulSoup(
                    f.read(), "lxml", parse_only=SoupStrainer(_profile_filter)
                )
        except FileNotFoundError:
            pass

        # Maps each stats entry's "key" to its "stats" value.
        self.stats_json = {}
        try:
            with open(self.path_coach_file(username, "stats.json"), "r") as f:
                payload = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            # A missing or corrupt stats file is treated the same way: the
            # coach simply has no ratings to export.
            payload = None

        entries = payload.get("stats") if isinstance(payload, dict) else None
        if isinstance(entries, list):
            for s in entries:
                if isinstance(s, dict) and "key" in s and "stats" in s:
                    self.stats_json[s["key"]] = s["stats"]

    def _export_rating(self, key: str) -> Union[int, None]:
        """Return the "rating" stored under the given stats key, if any."""
        stats = self.stats_json.get(key)
        if not isinstance(stats, dict):
            return None
        return stats.get("rating")

    def export_name(self) -> Union[str, None]:
        """Return the text of the profile card name, or None if unavailable."""
        try:
            name = self.profile_soup.find("div", class_="profile-card-name")
            return name.get_text().strip()
        except AttributeError:
            # Either no profile page was loaded or the element is absent.
            return None

    def export_image_url(self) -> Union[str, None]:
        """Return the avatar URL, or None if unavailable.

        Only a source containing "images.chesscomfiles.com" is returned; any
        other source is reported as None.
        """
        try:
            div = self.profile_soup.find("div", class_="profile-header-avatar")
            src = div.find("img").get("src", "")
        except AttributeError:
            # Either no profile page was loaded or the element is absent.
            return None
        if "images.chesscomfiles.com" in src:
            return src
        return None

    def export_rapid(self) -> Union[int, None]:
        """Return the rating stored under the "rapid" stats key."""
        return self._export_rating("rapid")

    def export_blitz(self) -> Union[int, None]:
        """Return the rating stored under the "lightning" stats key."""
        return self._export_rating("lightning")

    def export_bullet(self) -> Union[int, None]:
        """Return the rating stored under the "bullet" stats key."""
        return self._export_rating("bullet")
|