2023-11-30 22:15:15 +00:00
|
|
|
import asyncio
|
|
|
|
import json
|
|
|
|
import os
|
|
|
|
import os.path
|
2023-12-05 19:54:12 +00:00
|
|
|
from typing import List
|
2023-11-30 22:15:15 +00:00
|
|
|
|
2023-12-01 23:36:53 +00:00
|
|
|
import aiohttp
|
2023-12-05 19:54:12 +00:00
|
|
|
from bs4 import BeautifulSoup, SoupStrainer, Tag
|
2023-12-07 03:53:54 +00:00
|
|
|
from lingua import LanguageDetector
|
2023-11-30 22:15:15 +00:00
|
|
|
|
2023-12-07 03:53:54 +00:00
|
|
|
from app.locale import Locale, lang_to_locale
|
2023-12-05 18:43:13 +00:00
|
|
|
from app.pipeline import Extractor as BaseExtractor
|
|
|
|
from app.pipeline import Fetcher as BaseFetcher
|
|
|
|
from app.pipeline import Pipeline as BasePipeline
|
2023-12-07 02:52:40 +00:00
|
|
|
from app.types import Site, Title
|
2023-11-30 22:15:15 +00:00
|
|
|
|
|
|
|
# The number of coach listing pages we will at most iterate through. This number
# was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and
# traversing to the last page.
MAX_PAGES: int = 64

# How long (in seconds) to wait between a batch of network requests, to avoid
# hammering chess.com.
SLEEP_SECS: int = 3
|
2023-12-05 18:43:13 +00:00
|
|
|
class Fetcher(BaseFetcher):
    """Fetches chess.com coach listing pages and per-coach profile files.

    All network results are cached on disk (via the `path_page_file` /
    `path_coach_file` helpers inherited from `BaseFetcher`) so re-runs skip
    requests that already succeeded.
    """

    def __init__(self, session: aiohttp.ClientSession):
        super().__init__(site=Site.CHESSCOM, session=session)

    async def scrape_usernames(self, page_no: int) -> List[str] | None:
        """Return the coach usernames listed on listing page `page_no`.

        Returns the cached result when one exists, `[]` when `page_no` is
        past the last known listing page, and `None` when the network fetch
        failed (callers should skip this page).
        """
        if page_no > MAX_PAGES:
            return []

        print(f"{self.site.value}: Scraping page {page_no}/{MAX_PAGES}")

        filepath = self.path_page_file(page_no)
        try:
            # Cache hit: one username per line.
            with open(filepath, "r", encoding="utf-8") as f:
                return [line.strip() for line in f.readlines()]
        except FileNotFoundError:
            pass

        # Be polite: rate-limit successive network requests.
        if self.has_made_request:
            await asyncio.sleep(SLEEP_SECS)

        url = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
        response, status_code = await self.fetch(url)
        if response is None:
            return None  # Skips this page.

        usernames = []
        soup = BeautifulSoup(response, "lxml")
        members = soup.find_all("a", class_="members-categories-username")
        for member in members:
            href = member.get("href")
            # Guard: an anchor without an href would otherwise crash the
            # slice below (slicing None raises TypeError).
            if not isinstance(href, str):
                continue
            username = href[len("https://www.chess.com/member/") :]
            usernames.append(username)

        # Cache results.
        with open(filepath, "w", encoding="utf-8") as f:
            for username in usernames:
                f.write(f"{username}\n")

        return usernames

    async def download_user_files(self, username: str) -> None:
        """Download the profile HTML and stats JSON for `username`.

        Files already present on disk are not re-downloaded; when nothing is
        missing, no network request (and no rate-limit sleep) happens.
        """
        maybe_download = [
            (
                f"https://www.chess.com/member/{username}",
                self.path_coach_file(username, f"{username}.html"),
            ),
            (
                f"https://www.chess.com/callback/member/stats/{username}",
                self.path_coach_file(username, "stats.json"),
            ),
        ]

        to_download = []
        for d_url, d_filename in maybe_download:
            if os.path.isfile(d_filename):
                continue
            to_download.append((d_url, d_filename))

        if not to_download:
            return

        # Be polite: rate-limit successive network requests.
        if self.has_made_request:
            await asyncio.sleep(SLEEP_SECS)

        await asyncio.gather(
            *[self._download_file(url=d[0], filename=d[1]) for d in to_download]
        )

    async def _download_file(self, url: str, filename: str) -> None:
        """Fetch `url` and write the body to `filename`; failed fetches are
        silently skipped (a later run will retry the missing file)."""
        response, _unused_status = await self.fetch(url)
        if response is not None:
            with open(filename, "w", encoding="utf-8") as f:
                f.write(response)
|
|
|
|
|
|
|
|
|
2023-12-05 19:54:12 +00:00
|
|
|
def _profile_filter(elem: Tag | str | None, attrs=None) -> bool:
    """SoupStrainer predicate: keep only the profile sections we extract from.

    `attrs` maps attribute names to string values for the element under
    consideration, as supplied by BeautifulSoup's SoupStrainer machinery.
    Returns True when the element carries one of the profile CSS classes.
    """
    # NOTE: default was the mutable `attrs={}` — replaced with a None
    # sentinel so no dict instance is shared across calls.
    if attrs is None:
        attrs = {}
    # SoupStrainer passes `class` as a single space-separated string, so a
    # substring check is the intended membership test here.
    classes = attrs.get("class", "")
    return any(
        className in classes
        for className in (
            "profile-header-info",
            "profile-card-info",
            "profile-about",
        )
    )
|
2023-12-04 22:08:17 +00:00
|
|
|
|
|
|
|
|
2023-12-05 18:43:13 +00:00
|
|
|
class Extractor(BaseExtractor):
    """Extracts coach fields from previously-downloaded chess.com files.

    Reads the profile HTML and stats JSON written by `Fetcher`; a missing
    file simply leaves the corresponding data empty, so every getter
    degrades to `None` rather than raising.
    """

    def __init__(self, fetcher: BaseFetcher, detector: LanguageDetector, username: str):
        super().__init__(fetcher, detector, username)

        # Parsed profile HTML, pruned to just the profile sections via
        # `_profile_filter`; None when the HTML was never downloaded.
        self.profile_soup = None
        try:
            filename = self.fetcher.path_coach_file(username, f"{username}.html")
            # Pin UTF-8: profile pages contain non-ASCII names and a
            # locale-default decode would break on e.g. Windows.
            with open(filename, "r", encoding="utf-8") as f:
                self.profile_soup = BeautifulSoup(
                    f.read(), "lxml", parse_only=SoupStrainer(_profile_filter)
                )
        except FileNotFoundError:
            pass

        # Maps a stat key (e.g. "rapid") to its stats dict from the
        # downloaded stats.json; empty when the file is missing.
        self.stats_json = {}
        try:
            filename = self.fetcher.path_coach_file(username, "stats.json")
            with open(filename, "r", encoding="utf-8") as f:
                for s in json.load(f).get("stats", []):
                    if "key" in s and "stats" in s:
                        self.stats_json[s["key"]] = s["stats"]
        except FileNotFoundError:
            pass

    def get_name(self) -> str | None:
        """Return the coach's display name, or None when unavailable."""
        if self.profile_soup is None:
            return None
        name = self.profile_soup.find("div", class_="profile-card-name")
        if not isinstance(name, Tag):
            return None
        return name.get_text().strip()

    def get_image_url(self) -> str | None:
        """Return the avatar image URL, or None when unavailable.

        Only URLs hosted on images.chesscomfiles.com are accepted; anything
        else (e.g. a placeholder asset) is treated as missing.
        """
        if self.profile_soup is None:
            return None
        div = self.profile_soup.find("div", class_="profile-header-avatar")
        if not isinstance(div, Tag):
            return None
        img = div.find("img")
        if not isinstance(img, Tag):
            return None
        src = img.get("src", "")
        if not isinstance(src, str):
            return None
        if "images.chesscomfiles.com" not in src:
            return None
        return src

    def get_title(self) -> Title | None:
        """Return the coach's chess title as a `Title`, or None when the
        profile shows no title or an unrecognized one."""
        if self.profile_soup is None:
            return None
        a = self.profile_soup.find("a", class_="profile-card-chesstitle")
        if not isinstance(a, Tag):
            return None
        title = a.get_text().strip()
        try:
            return Title(title)
        except ValueError:
            # Text on the page does not map to a known title value.
            return None

    def get_languages(self) -> List[Locale] | None:
        """Detect the language of the "about" section and return it as a
        single-element locale list, or None when detection fails."""
        if self.profile_soup is None:
            return None
        about = self.profile_soup.find("div", class_="profile-about")
        if not isinstance(about, Tag):
            return None
        detected = self.detector.detect_language_of(about.text)
        if detected is None:
            return None
        code = lang_to_locale.get(detected)
        if code is None:
            return None
        return [code]

    def get_rapid(self) -> int | None:
        """Return the coach's rapid rating, or None when absent."""
        return self.stats_json.get("rapid", {}).get("rating")

    def get_blitz(self) -> int | None:
        """Return the coach's blitz rating, or None when absent.

        chess.com's stats payload uses the legacy key "lightning" for blitz.
        """
        return self.stats_json.get("lightning", {}).get("rating")

    def get_bullet(self) -> int | None:
        """Return the coach's bullet rating, or None when absent."""
        return self.stats_json.get("bullet", {}).get("rating")
|
2023-12-05 18:43:13 +00:00
|
|
|
|
|
|
|
|
|
|
|
class Pipeline(BasePipeline):
    """Chess.com-specific pipeline wiring.

    Supplies the site's concrete `Fetcher` and `Extractor` to the generic
    `BasePipeline` machinery.
    """

    def get_fetcher(self, session: aiohttp.ClientSession):
        """Build the chess.com fetcher bound to the given HTTP session."""
        return Fetcher(session)

    def get_extractor(
        self, fetcher: BaseFetcher, detector: LanguageDetector, username: str
    ):
        """Build the chess.com extractor for a single coach's files."""
        return Extractor(fetcher, detector, username)
|