coach-scraper/app/chesscom.py

import asyncio
import json
import os
import os.path
from typing import List

import aiohttp
from bs4 import BeautifulSoup, SoupStrainer, Tag

from app.pipeline import Extractor as BaseExtractor
from app.pipeline import Fetcher as BaseFetcher
from app.pipeline import Pipeline as BasePipeline
from app.types import Site

# Upper bound on the coach listing pages we iterate through; a page number
# above this yields no usernames. The value was determined by going to
# chess.com/coaches?sortBy=alphabetical&page=1 and traversing to the last
# page, so it needs updating if the listing grows.
MAX_PAGES = 64

# Seconds to pause before starting another batch of network requests.
SLEEP_SECS = 3


class Fetcher(BaseFetcher):
    """Fetcher for chess.com coach listing pages and per-coach files.

    Results are cached on disk at the paths returned by the inherited
    `path_page_file` / `path_coach_file` helpers, so a rerun only touches the
    network for data that has not been saved yet.
    """

    # Path component that immediately precedes the username in a member link.
    _MEMBER_PATH = "/member/"

    def __init__(self, session: aiohttp.ClientSession):
        super().__init__(site=Site.CHESSCOM, session=session)

    async def scrape_usernames(self, page_no: int) -> List[str] | None:
        """Return the coach usernames listed on listing page `page_no`.

        Returns an empty list when `page_no` exceeds `MAX_PAGES`, the cached
        usernames when the page file already exists, and `None` when the
        fetch produced no response (the page is skipped and nothing is
        cached).
        """
        if page_no > MAX_PAGES:
            return []

        print(f"{self.site.value}: Scraping page {page_no}/{MAX_PAGES}")

        filepath = self.path_page_file(page_no)
        try:
            with open(filepath, "r") as f:
                return [line.strip() for line in f]
        except FileNotFoundError:
            pass  # Not cached yet; fall through to the network.

        if self.has_made_request:
            await asyncio.sleep(SLEEP_SECS)

        url = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
        response, _unused_status = await self.fetch(url)
        if response is None:
            return None  # Skips this page.

        usernames = []
        soup = BeautifulSoup(response, "lxml")
        for member in soup.find_all("a", class_="members-categories-username"):
            href = member.get("href")
            # An anchor without a usable href cannot name a member; skip it
            # instead of failing the whole page.
            if not isinstance(href, str):
                continue
            # Take whatever follows "/member/" so that both absolute and
            # relative profile links resolve to the same username.
            _, marker, username = href.partition(self._MEMBER_PATH)
            if marker and username:
                usernames.append(username)

        # Cache results.
        with open(filepath, "w") as f:
            f.writelines(f"{username}\n" for username in usernames)

        return usernames

    async def download_user_files(self, username: str) -> None:
        """Download the profile, activity and stats files for `username`.

        Files that already exist on disk are not requested again. The
        remaining downloads are issued concurrently as a single batch.
        """
        candidates = [
            (
                f"https://www.chess.com/member/{username}",
                self.path_coach_file(username, f"{username}.html"),
            ),
            (
                f"https://www.chess.com/callback/member/activity/{username}?page=1",
                self.path_coach_file(username, "activity.json"),
            ),
            (
                f"https://www.chess.com/callback/member/stats/{username}",
                self.path_coach_file(username, "stats.json"),
            ),
        ]

        to_download = [
            (d_url, d_filename)
            for d_url, d_filename in candidates
            if not os.path.isfile(d_filename)
        ]
        if not to_download:
            return

        if self.has_made_request:
            await asyncio.sleep(SLEEP_SECS)

        await asyncio.gather(
            *(
                self._download_file(url=d_url, filename=d_filename)
                for d_url, d_filename in to_download
            )
        )

    async def _download_file(self, url: str, filename: str) -> None:
        """Fetch `url` and write the body to `filename`.

        Nothing is written when the fetch yields no response, so a later run
        will try the download again.
        """
        response, _unused_status = await self.fetch(url)
        if response is not None:
            with open(filename, "w") as f:
                f.write(response)


def _profile_filter(elem: Tag | str | None, attrs={}) -> bool:
    if "profile-header-info" in attrs.get("class", ""):
        return True
    if "profile-card-info" in attrs.get("class", ""):
        return True
    return False


class Extractor(BaseExtractor):
    """Extracts coach attributes from the files cached for one chess.com user.

    The constructor loads `<username>.html` (parsed down to the profile
    sections) and `stats.json` from the paths given by the fetcher's
    `path_coach_file`. A missing file, or a stats file that cannot be parsed,
    is treated as "no data": the corresponding getters then return `None`.
    """

    def __init__(self, fetcher: BaseFetcher, username: str):
        super().__init__(fetcher, username)

        # Parsed profile page, or None when the HTML file is absent.
        self.profile_soup: BeautifulSoup | None = self._read_profile_soup(
            self.fetcher.path_coach_file(username, f"{username}.html")
        )
        # Maps each stats entry's "key" to its "stats" value; empty when the
        # stats file is absent or unusable.
        self.stats_json: dict = self._read_stats(
            self.fetcher.path_coach_file(username, "stats.json")
        )

    @staticmethod
    def _read_profile_soup(filename: str) -> BeautifulSoup | None:
        """Parse the profile sections of `filename`; None if it is missing."""
        try:
            with open(filename, "r") as f:
                return BeautifulSoup(
                    f.read(), "lxml", parse_only=SoupStrainer(_profile_filter)
                )
        except FileNotFoundError:
            return None

    @staticmethod
    def _read_stats(filename: str) -> dict:
        """Load `filename` and index its "stats" entries by their "key".

        Returns an empty dict when the file is missing, is not valid JSON, or
        does not have the expected object-with-a-"stats"-list shape.
        """
        try:
            with open(filename, "r") as f:
                payload = json.load(f)
        except FileNotFoundError:
            return {}
        except (json.JSONDecodeError, UnicodeDecodeError):
            # The cached file is whatever the server sent back; if that was
            # not JSON, treat the stats as unavailable rather than crash.
            return {}

        if not isinstance(payload, dict):
            return {}
        entries = payload.get("stats", [])
        if not isinstance(entries, list):
            return {}
        return {
            entry["key"]: entry["stats"]
            for entry in entries
            if isinstance(entry, dict) and "key" in entry and "stats" in entry
        }

    def _rating_for(self, key: str) -> int | None:
        """Return the "rating" stored under stats entry `key`, if any."""
        entry = self.stats_json.get(key)
        if not isinstance(entry, dict):
            return None
        return entry.get("rating")

    def get_name(self) -> str | None:
        """Return the stripped text of the "profile-card-name" div, if any."""
        if self.profile_soup is None:
            return None
        name = self.profile_soup.find("div", class_="profile-card-name")
        if not isinstance(name, Tag):
            return None
        return name.get_text().strip()

    def get_image_url(self) -> str | None:
        """Return the avatar image URL from the profile header, if any.

        Only a `src` containing "images.chesscomfiles.com" is returned; any
        other value yields None.
        """
        if self.profile_soup is None:
            return None
        div = self.profile_soup.find("div", class_="profile-header-avatar")
        if not isinstance(div, Tag):
            return None
        img = div.find("img")
        if not isinstance(img, Tag):
            return None
        src = img.get("src", "")
        if not isinstance(src, str):
            return None
        if "images.chesscomfiles.com" not in src:
            return None
        return src

    def get_languages(self) -> List[str] | None:
        """Return the coach's languages; not implemented yet, always None."""
        # TODO: Extract using huggingface model.
        return None

    def get_rapid(self) -> int | None:
        """Return the rating from the "rapid" stats entry, if present."""
        return self._rating_for("rapid")

    def get_blitz(self) -> int | None:
        """Return the blitz rating, read from the "lightning" stats entry."""
        return self._rating_for("lightning")

    def get_bullet(self) -> int | None:
        """Return the rating from the "bullet" stats entry, if present."""
        return self._rating_for("bullet")


class Pipeline(BasePipeline):
    """Pipeline for chess.com, built from this module's Fetcher and Extractor."""

    def get_fetcher(self, session: aiohttp.ClientSession):
        """Create the chess.com fetcher that issues requests on `session`."""
        return Fetcher(session=session)

    def get_extractor(self, fetcher: BaseFetcher, username: str):
        """Create the extractor for `username`, backed by `fetcher`."""
        return Extractor(fetcher=fetcher, username=username)
Generalize in anticipation of merging the lichess scraper. (#1) * Add a general `Scraper` class. * Setup main as primary entrypoint. * Abstract original scraper into scraper class. * Add better logging and cleaner bash commands. * Ensure exporting works. 2023-11-30 22:15:15 +00:00			`import asyncio`
			`import json`
			`import os`
			`import os.path`
Fix all mypy warnings. (#9) 2023-12-05 19:54:12 +00:00			`from typing import List`
Generalize in anticipation of merging the lichess scraper. (#1) * Add a general `Scraper` class. * Setup main as primary entrypoint. * Abstract original scraper into scraper class. * Add better logging and cleaner bash commands. * Ensure exporting works. 2023-11-30 22:15:15 +00:00
Apply pyls-isort. 2023-12-01 23:36:53 +00:00			`import aiohttp`
Fix all mypy warnings. (#9) 2023-12-05 19:54:12 +00:00			`from bs4 import BeautifulSoup, SoupStrainer, Tag`
Generalize in anticipation of merging the lichess scraper. (#1) * Add a general `Scraper` class. * Setup main as primary entrypoint. * Abstract original scraper into scraper class. * Add better logging and cleaner bash commands. * Ensure exporting works. 2023-11-30 22:15:15 +00:00
Scrape content into an asynchronous pipeline. (#8) 2023-12-05 18:43:13 +00:00			`from app.pipeline import Extractor as BaseExtractor`
			`from app.pipeline import Fetcher as BaseFetcher`
			`from app.pipeline import Pipeline as BasePipeline`
Scrape languages from lichess listing. (#10) 2023-12-05 21:20:46 +00:00			`from app.types import Site`
Generalize in anticipation of merging the lichess scraper. (#1) * Add a general `Scraper` class. * Setup main as primary entrypoint. * Abstract original scraper into scraper class. * Add better logging and cleaner bash commands. * Ensure exporting works. 2023-11-30 22:15:15 +00:00
			`# The number of coach listing pages we will at most iterate through. This number`
			`# was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and`
			`# traversing to the last page.`
			`MAX_PAGES = 64`

			`# How long to wait between a batch of network requests.`
			`SLEEP_SECS = 3`


Scrape content into an asynchronous pipeline. (#8) 2023-12-05 18:43:13 +00:00			`class Fetcher(BaseFetcher):`
Generalize in anticipation of merging the lichess scraper. (#1) * Add a general `Scraper` class. * Setup main as primary entrypoint. * Abstract original scraper into scraper class. * Add better logging and cleaner bash commands. * Ensure exporting works. 2023-11-30 22:15:15 +00:00			`def __init__(self, session: aiohttp.ClientSession):`
Scrape content into an asynchronous pipeline. (#8) 2023-12-05 18:43:13 +00:00			`super().__init__(site=Site.CHESSCOM, session=session)`
Generalize in anticipation of merging the lichess scraper. (#1) * Add a general `Scraper` class. * Setup main as primary entrypoint. * Abstract original scraper into scraper class. * Add better logging and cleaner bash commands. * Ensure exporting works. 2023-11-30 22:15:15 +00:00
Fix all mypy warnings. (#9) 2023-12-05 19:54:12 +00:00			`async def scrape_usernames(self, page_no: int) -> List[str] \| None:`
Scrape content into an asynchronous pipeline. (#8) 2023-12-05 18:43:13 +00:00			`if page_no > MAX_PAGES:`
			`return []`
Generalize in anticipation of merging the lichess scraper. (#1) * Add a general `Scraper` class. * Setup main as primary entrypoint. * Abstract original scraper into scraper class. * Add better logging and cleaner bash commands. * Ensure exporting works. 2023-11-30 22:15:15 +00:00
Scrape content into an asynchronous pipeline. (#8) 2023-12-05 18:43:13 +00:00			`print(f"{self.site.value}: Scraping page {page_no}/{MAX_PAGES}")`
Generalize in anticipation of merging the lichess scraper. (#1) * Add a general `Scraper` class. * Setup main as primary entrypoint. * Abstract original scraper into scraper class. * Add better logging and cleaner bash commands. * Ensure exporting works. 2023-11-30 22:15:15 +00:00
Scrape content into an asynchronous pipeline. (#8) 2023-12-05 18:43:13 +00:00			`filepath = self.path_page_file(page_no)`
			`try:`
			`with open(filepath, "r") as f:`
			`return [line.strip() for line in f.readlines()]`
			`except FileNotFoundError:`
			`pass`
Generalize in anticipation of merging the lichess scraper. (#1) * Add a general `Scraper` class. * Setup main as primary entrypoint. * Abstract original scraper into scraper class. * Add better logging and cleaner bash commands. * Ensure exporting works. 2023-11-30 22:15:15 +00:00
Scrape content into an asynchronous pipeline. (#8) 2023-12-05 18:43:13 +00:00			`if self.has_made_request:`
			`await asyncio.sleep(SLEEP_SECS)`
Generalize in anticipation of merging the lichess scraper. (#1) * Add a general `Scraper` class. * Setup main as primary entrypoint. * Abstract original scraper into scraper class. * Add better logging and cleaner bash commands. * Ensure exporting works. 2023-11-30 22:15:15 +00:00
			`url = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"`
Scrape content into an asynchronous pipeline. (#8) 2023-12-05 18:43:13 +00:00			`response, status_code = await self.fetch(url)`
Generalize in anticipation of merging the lichess scraper. (#1) * Add a general `Scraper` class. * Setup main as primary entrypoint. * Abstract original scraper into scraper class. * Add better logging and cleaner bash commands. * Ensure exporting works. 2023-11-30 22:15:15 +00:00			`if response is None:`
Scrape content into an asynchronous pipeline. (#8) 2023-12-05 18:43:13 +00:00			`return None # Skips this page.`
Generalize in anticipation of merging the lichess scraper. (#1) * Add a general `Scraper` class. * Setup main as primary entrypoint. * Abstract original scraper into scraper class. * Add better logging and cleaner bash commands. * Ensure exporting works. 2023-11-30 22:15:15 +00:00
			`usernames = []`
Use lxml to speed up parsing. 2023-12-01 14:12:40 +00:00			`soup = BeautifulSoup(response, "lxml")`
Generalize in anticipation of merging the lichess scraper. (#1) * Add a general `Scraper` class. * Setup main as primary entrypoint. * Abstract original scraper into scraper class. * Add better logging and cleaner bash commands. * Ensure exporting works. 2023-11-30 22:15:15 +00:00			`members = soup.find_all("a", class_="members-categories-username")`
			`for member in members:`
			`href = member.get("href")`
			`username = href[len("https://www.chess.com/member/") :]`
			`usernames.append(username)`

Scrape content into an asynchronous pipeline. (#8) 2023-12-05 18:43:13 +00:00			`# Cache results.`
			`with open(filepath, "w") as f:`
			`for username in usernames:`
			`f.write(f"{username}\n")`

Generalize in anticipation of merging the lichess scraper. (#1) * Add a general `Scraper` class. * Setup main as primary entrypoint. * Abstract original scraper into scraper class. * Add better logging and cleaner bash commands. * Ensure exporting works. 2023-11-30 22:15:15 +00:00			`return usernames`

Scrape content into an asynchronous pipeline. (#8) 2023-12-05 18:43:13 +00:00			`async def download_user_files(self, username: str) -> None:`
			`maybe_download = [`
			`(`
			`f"https://www.chess.com/member/{username}",`
			`self.path_coach_file(username, f"{username}.html"),`
Generalize in anticipation of merging the lichess scraper. (#1) * Add a general `Scraper` class. * Setup main as primary entrypoint. * Abstract original scraper into scraper class. * Add better logging and cleaner bash commands. * Ensure exporting works. 2023-11-30 22:15:15 +00:00			`),`
Scrape content into an asynchronous pipeline. (#8) 2023-12-05 18:43:13 +00:00			`(`
			`f"https://www.chess.com/callback/member/activity/{username}?page=1",`
			`self.path_coach_file(username, "activity.json"),`
Generalize in anticipation of merging the lichess scraper. (#1) * Add a general `Scraper` class. * Setup main as primary entrypoint. * Abstract original scraper into scraper class. * Add better logging and cleaner bash commands. * Ensure exporting works. 2023-11-30 22:15:15 +00:00			`),`
Scrape content into an asynchronous pipeline. (#8) 2023-12-05 18:43:13 +00:00			`(`
			`f"https://www.chess.com/callback/member/stats/{username}",`
			`self.path_coach_file(username, "stats.json"),`
Generalize in anticipation of merging the lichess scraper. (#1) * Add a general `Scraper` class. * Setup main as primary entrypoint. * Abstract original scraper into scraper class. * Add better logging and cleaner bash commands. * Ensure exporting works. 2023-11-30 22:15:15 +00:00			`),`
Scrape content into an asynchronous pipeline. (#8) 2023-12-05 18:43:13 +00:00			`]`

			`to_download = []`
			`for d_url, d_filename in maybe_download:`
			`if os.path.isfile(d_filename):`
			`continue`
			`to_download.append((d_url, d_filename))`

			`if not to_download:`
			`return`

			`if self.has_made_request:`
Generalize in anticipation of merging the lichess scraper. (#1) * Add a general `Scraper` class. * Setup main as primary entrypoint. * Abstract original scraper into scraper class. * Add better logging and cleaner bash commands. * Ensure exporting works. 2023-11-30 22:15:15 +00:00			`await asyncio.sleep(SLEEP_SECS)`
Scrape content into an asynchronous pipeline. (#8) 2023-12-05 18:43:13 +00:00
			`await asyncio.gather(`
			`*[self._download_file(url=d[0], filename=d[1]) for d in to_download]`
			`)`

			`async def _download_file(self, url: str, filename: str) -> None:`
			`response, _unused_status = await self.fetch(url)`
Generalize in anticipation of merging the lichess scraper. (#1) * Add a general `Scraper` class. * Setup main as primary entrypoint. * Abstract original scraper into scraper class. * Add better logging and cleaner bash commands. * Ensure exporting works. 2023-11-30 22:15:15 +00:00			`if response is not None:`
			`with open(filename, "w") as f:`
			`f.write(response)`


Fix all mypy warnings. (#9) 2023-12-05 19:54:12 +00:00			`def _profile_filter(elem: Tag \| str \| None, attrs={}) -> bool:`
Transition to a CSV; Postgres can handle that better. 2023-12-04 22:08:17 +00:00			`if "profile-header-info" in attrs.get("class", ""):`
			`return True`
			`if "profile-card-info" in attrs.get("class", ""):`
			`return True`
Fix all mypy warnings. (#9) 2023-12-05 19:54:12 +00:00			`return False`
Transition to a CSV; Postgres can handle that better. 2023-12-04 22:08:17 +00:00

Scrape content into an asynchronous pipeline. (#8) 2023-12-05 18:43:13 +00:00			`class Extractor(BaseExtractor):`
Fix all mypy warnings. (#9) 2023-12-05 19:54:12 +00:00			`def __init__(self, fetcher: BaseFetcher, username: str):`
Scrape content into an asynchronous pipeline. (#8) 2023-12-05 18:43:13 +00:00			`super().__init__(fetcher, username)`
Add a scraper for lichess. (#2) 2023-11-30 22:36:44 +00:00
Transition to a CSV; Postgres can handle that better. 2023-12-04 22:08:17 +00:00			`self.profile_soup = None`
			`try:`
Scrape content into an asynchronous pipeline. (#8) 2023-12-05 18:43:13 +00:00			`filename = self.fetcher.path_coach_file(username, f"{username}.html")`
			`with open(filename, "r") as f:`
Transition to a CSV; Postgres can handle that better. 2023-12-04 22:08:17 +00:00			`self.profile_soup = BeautifulSoup(`
			`f.read(), "lxml", parse_only=SoupStrainer(_profile_filter)`
			`)`
			`except FileNotFoundError:`
			`pass`

Add class for wrapping around exports. 2023-12-01 00:12:16 +00:00			`self.stats_json = {}`
Generalize in anticipation of merging the lichess scraper. (#1) * Add a general `Scraper` class. * Setup main as primary entrypoint. * Abstract original scraper into scraper class. * Add better logging and cleaner bash commands. * Ensure exporting works. 2023-11-30 22:15:15 +00:00			`try:`
Scrape content into an asynchronous pipeline. (#8) 2023-12-05 18:43:13 +00:00			`filename = self.fetcher.path_coach_file(username, "stats.json")`
			`with open(filename, "r") as f:`
Add class for wrapping around exports. 2023-12-01 00:12:16 +00:00			`for s in json.load(f).get("stats", []):`
			`if "key" in s and "stats" in s:`
			`self.stats_json[s["key"]] = s["stats"]`
Generalize in anticipation of merging the lichess scraper. (#1) * Add a general `Scraper` class. * Setup main as primary entrypoint. * Abstract original scraper into scraper class. * Add better logging and cleaner bash commands. * Ensure exporting works. 2023-11-30 22:15:15 +00:00			`except FileNotFoundError:`
			`pass`

Fix all mypy warnings. (#9) 2023-12-05 19:54:12 +00:00			`def get_name(self) -> str \| None:`
			`if self.profile_soup is None:`
			`return None`
			`name = self.profile_soup.find("div", class_="profile-card-name")`
			`if not isinstance(name, Tag):`
Transition to a CSV; Postgres can handle that better. 2023-12-04 22:08:17 +00:00			`return None`
Fix all mypy warnings. (#9) 2023-12-05 19:54:12 +00:00			`return name.get_text().strip()`
Transition to a CSV; Postgres can handle that better. 2023-12-04 22:08:17 +00:00
Fix all mypy warnings. (#9) 2023-12-05 19:54:12 +00:00			`def get_image_url(self) -> str \| None:`
			`if self.profile_soup is None:`
			`return None`
			`div = self.profile_soup.find("div", class_="profile-header-avatar")`
			`if not isinstance(div, Tag):`
			`return None`
			`img = div.find("img")`
			`if not isinstance(img, Tag):`
			`return None`
			`src = img.get("src", "")`
			`if not isinstance(src, str):`
			`return None`
			`if "images.chesscomfiles.com" not in src:`
Transition to a CSV; Postgres can handle that better. 2023-12-04 22:08:17 +00:00			`return None`
Fix all mypy warnings. (#9) 2023-12-05 19:54:12 +00:00			`return src`
Transition to a CSV; Postgres can handle that better. 2023-12-04 22:08:17 +00:00
Scrape languages from lichess listing. (#10) 2023-12-05 21:20:46 +00:00			`def get_languages(self) -> List[str] \| None:`
			`# TODO: Extract using huggingface model.`
			`return None`

Fix all mypy warnings. (#9) 2023-12-05 19:54:12 +00:00			`def get_rapid(self) -> int \| None:`
Add class for wrapping around exports. 2023-12-01 00:12:16 +00:00			`return self.stats_json.get("rapid", {}).get("rating")`
Export blitz and bullet ratings. 2023-12-01 14:10:58 +00:00
Fix all mypy warnings. (#9) 2023-12-05 19:54:12 +00:00			`def get_blitz(self) -> int \| None:`
Export blitz and bullet ratings. 2023-12-01 14:10:58 +00:00			`return self.stats_json.get("lightning", {}).get("rating")`

Fix all mypy warnings. (#9) 2023-12-05 19:54:12 +00:00			`def get_bullet(self) -> int \| None:`
Export blitz and bullet ratings. 2023-12-01 14:10:58 +00:00			`return self.stats_json.get("bullet", {}).get("rating")`
Scrape content into an asynchronous pipeline. (#8) 2023-12-05 18:43:13 +00:00

			`class Pipeline(BasePipeline):`
			`def get_fetcher(self, session: aiohttp.ClientSession):`
			`return Fetcher(session)`

Fix all mypy warnings. (#9) 2023-12-05 19:54:12 +00:00			`def get_extractor(self, fetcher: BaseFetcher, username: str):`
Scrape content into an asynchronous pipeline. (#8) 2023-12-05 18:43:13 +00:00			`return Extractor(fetcher, username)`