2023-11-30 22:15:15 +00:00
|
|
|
import asyncio
|
|
|
|
import json
|
|
|
|
import os
|
|
|
|
import os.path
|
2023-12-05 19:54:12 +00:00
|
|
|
from typing import List
|
2023-11-30 22:15:15 +00:00
|
|
|
|
2023-12-01 23:36:53 +00:00
|
|
|
import aiohttp
|
2023-12-05 19:54:12 +00:00
|
|
|
from bs4 import BeautifulSoup, SoupStrainer, Tag
|
2023-12-07 03:53:54 +00:00
|
|
|
from lingua import LanguageDetector
|
2023-11-30 22:15:15 +00:00
|
|
|
|
2023-12-07 03:53:54 +00:00
|
|
|
from app.locale import Locale, lang_to_locale
|
2023-12-05 18:43:13 +00:00
|
|
|
from app.pipeline import Extractor as BaseExtractor
|
|
|
|
from app.pipeline import Fetcher as BaseFetcher
|
|
|
|
from app.pipeline import Pipeline as BasePipeline
|
2023-12-07 02:52:40 +00:00
|
|
|
from app.types import Site, Title
|
2023-11-30 22:15:15 +00:00
|
|
|
|
|
|
|
# The number of coach listing pages we will at most iterate through. This number
# was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and
# traversing to the last page.
MAX_PAGES: int = 64

# How long (in seconds) to wait between a batch of network requests, to avoid
# hammering chess.com.
SLEEP_SECS: int = 3
|
2023-12-05 18:43:13 +00:00
|
|
|
class Fetcher(BaseFetcher):
    """Fetches chess.com coach listing pages and per-coach profile files.

    All network results are cached on disk (via the `path_page_file` /
    `path_coach_file` helpers inherited from `BaseFetcher`) so re-runs skip
    requests that already succeeded.
    """

    def __init__(self, session: aiohttp.ClientSession):
        super().__init__(site=Site.CHESSCOM, session=session)

    async def scrape_usernames(self, page_no: int) -> List[str] | None:
        """Return the coach usernames listed on listing page `page_no`.

        Returns the cached result when one exists, `[]` when `page_no` is
        past the last known listing page, and `None` when the network fetch
        failed (callers should skip this page).
        """
        if page_no > MAX_PAGES:
            return []

        print(f"{self.site.value}: Scraping page {page_no}/{MAX_PAGES}")

        filepath = self.path_page_file(page_no)
        try:
            # Cache hit: one username per line.
            with open(filepath, "r", encoding="utf-8") as f:
                return [line.strip() for line in f.readlines()]
        except FileNotFoundError:
            pass

        # Be polite: rate-limit successive network requests.
        if self.has_made_request:
            await asyncio.sleep(SLEEP_SECS)

        url = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
        response, status_code = await self.fetch(url)
        if response is None:
            return None  # Skips this page.

        usernames = []
        soup = BeautifulSoup(response, "lxml")
        members = soup.find_all("a", class_="members-categories-username")
        for member in members:
            href = member.get("href")
            # Guard: an anchor without an href would otherwise crash the
            # slice below (slicing None raises TypeError).
            if not isinstance(href, str):
                continue
            username = href[len("https://www.chess.com/member/") :]
            usernames.append(username)

        # Cache results.
        with open(filepath, "w", encoding="utf-8") as f:
            for username in usernames:
                f.write(f"{username}\n")

        return usernames

    async def download_user_files(self, username: str) -> None:
        """Download the profile HTML and stats JSON for `username`.

        Files already present on disk are not re-downloaded; when nothing is
        missing, no network request (and no rate-limit sleep) happens.
        """
        maybe_download = [
            (
                f"https://www.chess.com/member/{username}",
                self.path_coach_file(username, f"{username}.html"),
            ),
            (
                f"https://www.chess.com/callback/member/stats/{username}",
                self.path_coach_file(username, "stats.json"),
            ),
        ]

        to_download = []
        for d_url, d_filename in maybe_download:
            if os.path.isfile(d_filename):
                continue
            to_download.append((d_url, d_filename))

        if not to_download:
            return

        # Be polite: rate-limit successive network requests.
        if self.has_made_request:
            await asyncio.sleep(SLEEP_SECS)

        await asyncio.gather(
            *[self._download_file(url=d[0], filename=d[1]) for d in to_download]
        )

    async def _download_file(self, url: str, filename: str) -> None:
        """Fetch `url` and write the body to `filename`; failed fetches are
        silently skipped (a later run will retry the missing file)."""
        response, _unused_status = await self.fetch(url)
        if response is not None:
            with open(filename, "w", encoding="utf-8") as f:
                f.write(response)
|
|
|
|
|
|
|
|
|
2023-12-05 19:54:12 +00:00
|
|
|
def _profile_filter(elem: Tag | str | None, attrs=None) -> bool:
    """SoupStrainer predicate: keep only the profile sections we extract from.

    `attrs` maps attribute names to string values for the element under
    consideration, as supplied by BeautifulSoup's SoupStrainer machinery.
    Returns True when the element carries one of the profile CSS classes.
    """
    # NOTE: default was the mutable `attrs={}` — replaced with a None
    # sentinel so no dict instance is shared across calls.
    if attrs is None:
        attrs = {}
    # SoupStrainer passes `class` as a single space-separated string, so a
    # substring check is the intended membership test here.
    classes = attrs.get("class", "")
    return any(
        className in classes
        for className in (
            "profile-header-info",
            "profile-card-info",
            "profile-about",
        )
    )
|
2023-12-04 22:08:17 +00:00
|
|
|
|
|
|
|
|
2023-12-05 18:43:13 +00:00
|
|
|
class Extractor(BaseExtractor):
    """Extracts coach fields from previously-downloaded chess.com files.

    Reads the profile HTML and stats JSON written by `Fetcher`; a missing
    file simply leaves the corresponding data empty, so every getter
    degrades to `None` rather than raising.
    """

    def __init__(self, fetcher: BaseFetcher, detector: LanguageDetector, username: str):
        super().__init__(fetcher, detector, username)

        # Parsed profile HTML, pruned to just the profile sections via
        # `_profile_filter`; None when the HTML was never downloaded.
        self.profile_soup = None
        try:
            filename = self.fetcher.path_coach_file(username, f"{username}.html")
            # Pin UTF-8: profile pages contain non-ASCII names and a
            # locale-default decode would break on e.g. Windows.
            with open(filename, "r", encoding="utf-8") as f:
                self.profile_soup = BeautifulSoup(
                    f.read(), "lxml", parse_only=SoupStrainer(_profile_filter)
                )
        except FileNotFoundError:
            pass

        # Maps a stat key (e.g. "rapid") to its stats dict from the
        # downloaded stats.json; empty when the file is missing.
        self.stats_json = {}
        try:
            filename = self.fetcher.path_coach_file(username, "stats.json")
            with open(filename, "r", encoding="utf-8") as f:
                for s in json.load(f).get("stats", []):
                    if "key" in s and "stats" in s:
                        self.stats_json[s["key"]] = s["stats"]
        except FileNotFoundError:
            pass

    def get_name(self) -> str | None:
        """Return the coach's display name, or None when unavailable."""
        if self.profile_soup is None:
            return None
        name = self.profile_soup.find("div", class_="profile-card-name")
        if not isinstance(name, Tag):
            return None
        return name.get_text().strip()

    def get_image_url(self) -> str | None:
        """Return the avatar image URL, or None when unavailable.

        Only URLs hosted on images.chesscomfiles.com are accepted; anything
        else (e.g. a placeholder asset) is treated as missing.
        """
        if self.profile_soup is None:
            return None
        div = self.profile_soup.find("div", class_="profile-header-avatar")
        if not isinstance(div, Tag):
            return None
        img = div.find("img")
        if not isinstance(img, Tag):
            return None
        src = img.get("src", "")
        if not isinstance(src, str):
            return None
        if "images.chesscomfiles.com" not in src:
            return None
        return src

    def get_title(self) -> Title | None:
        """Return the coach's chess title as a `Title`, or None when the
        profile shows no title or an unrecognized one."""
        if self.profile_soup is None:
            return None
        a = self.profile_soup.find("a", class_="profile-card-chesstitle")
        if not isinstance(a, Tag):
            return None
        title = a.get_text().strip()
        try:
            return Title(title)
        except ValueError:
            # Text on the page does not map to a known title value.
            return None

    def get_languages(self) -> List[Locale] | None:
        """Detect the language of the "about" section and return it as a
        single-element locale list, or None when detection fails."""
        if self.profile_soup is None:
            return None
        about = self.profile_soup.find("div", class_="profile-about")
        if not isinstance(about, Tag):
            return None
        detected = self.detector.detect_language_of(about.text)
        if detected is None:
            return None
        code = lang_to_locale.get(detected)
        if code is None:
            return None
        return [code]

    def get_rapid(self) -> int | None:
        """Return the coach's rapid rating, or None when absent."""
        return self.stats_json.get("rapid", {}).get("rating")

    def get_blitz(self) -> int | None:
        """Return the coach's blitz rating, or None when absent.

        chess.com's stats payload uses the legacy key "lightning" for blitz.
        """
        return self.stats_json.get("lightning", {}).get("rating")

    def get_bullet(self) -> int | None:
        """Return the coach's bullet rating, or None when absent."""
        return self.stats_json.get("bullet", {}).get("rating")
|
2023-12-05 18:43:13 +00:00
|
|
|
|
|
|
|
|
|
|
|
class Pipeline(BasePipeline):
    """Chess.com-specific pipeline wiring.

    Supplies the site's concrete `Fetcher` and `Extractor` to the generic
    `BasePipeline` machinery.
    """

    def get_fetcher(self, session: aiohttp.ClientSession):
        """Build the chess.com fetcher bound to the given HTTP session."""
        return Fetcher(session)

    def get_extractor(
        self, fetcher: BaseFetcher, detector: LanguageDetector, username: str
    ):
        """Build the chess.com extractor for a single coach's files."""
        return Extractor(fetcher, detector, username)
|