# coach-scraper/app/lichess.py
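"""Scraping and exporting of lichess.org coach data: a Scraper that walks the
coach listing pages and downloads per-coach HTML, and an Exporter that pulls
fields back out of the saved files."""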

import asyncio
import os
import os.path
from typing import List, Union

import aiohttp
from bs4 import BeautifulSoup, SoupStrainer

from app.exporter import BaseExporter
from app.repo import AnsiColor, Site
from app.scraper import BaseScraper

# The number of pages we will at most iterate through. This number was
# determined by going to https://lichess.org/coach/all/all/alphabetical
# and traversing to the last page.
MAX_PAGES = 162

# How long to wait between each network request.
SLEEP_SECS = 5


class Scraper(BaseScraper):
    def __init__(self, session: aiohttp.ClientSession):
        super().__init__(site=Site.LICHESS.value, session=session)

    async def download_usernames(self) -> List[str]:
        """Scan through lichess.org/coach for all coaches' usernames.

        @return
            The complete list of scraped usernames across every coach listing
            page.
        """
        usernames = []
        for page_no in range(1, MAX_PAGES + 1):
            filepath = self.path_page_file(page_no)
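            # Listing pages are cached on disk; only fall back to the network
            # (and the polite sleep between requests) for pages that have not
            # been scraped yet.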
            try:
                with open(filepath, "r") as f:
                    self.log(
                        [
                            (AnsiColor.INFO, "[INFO]"),
                            (None, ": Reading file "),
                            (AnsiColor.DATA, filepath),
                        ]
                    )
                    usernames.extend([line.strip() for line in f.readlines()])
            except FileNotFoundError:
                page_usernames = await self._scrape_page(page_no)
                if not page_usernames:
                    self.log(
                        [
                            (AnsiColor.ERROR, "[ERROR]"),
                            (None, ": Could not scrape page "),
                            (AnsiColor.DATA, str(page_no)),
                        ]
                    )
                    continue
                with open(filepath, "w") as f:
                    for username in page_usernames:
                        f.write(f"{username}\n")
                usernames.extend(page_usernames)
                self.log(
                    [
                        (AnsiColor.INFO, "[INFO]"),
                        (None, ": Downloaded page "),
                        (AnsiColor.DATA, filepath),
                    ]
                )
                await asyncio.sleep(SLEEP_SECS)

        return usernames

    async def _scrape_page(self, page_no: int) -> Union[List[str], None]:
        """Scan through lichess.org/coach/.../?page=<n> for all coaches'
        usernames.

        @param page_no
            The page consisting of at most 10 coaches (at the time of writing)
            whose usernames are to be scraped.
        @return
            The list of scraped usernames on the specified coach listing page,
            or None if the page could not be downloaded.
        """
        url = f"https://lichess.org/coach/all/all/alphabetical?page={page_no}"
        response, status_code = await self.request(url)
        if response is None:
            self.log(
                [
                    (AnsiColor.ERROR, "[ERROR]"),
                    (None, ": Received status "),
                    (AnsiColor.DATA, f"{status_code} "),
                    (None, "when downloading page "),
                    (AnsiColor.DATA, str(page_no)),
                ]
            )
            return None

        usernames = []
        soup = BeautifulSoup(response, "lxml")
        members = soup.find_all("article", class_="coach-widget")
        for member in members:
            a = member.find("a", class_="overlay")
            if a:
                # Coach links look like `/coach/<username>`; strip the prefix
                # to recover the bare username.
                href = a.get("href")
                username = href[len("/coach/") :]
                usernames.append(username)

        return usernames

    async def download_profile(self, username: str):
        """For each coach, download coach-specific data.

        @param username
            The coach username corresponding to the downloaded files.
        """
        used_network1 = await self._download_profile_file(
            url=f"https://lichess.org/coach/{username}",
            username=username,
            filename=self.path_coach_file(username, f"{username}.html"),
        )
        used_network2 = await self._download_profile_file(
            url=f"https://lichess.org/@/{username}",
            username=username,
            filename=self.path_coach_file(username, "stats.html"),
        )

        if any([used_network1, used_network2]):
            self.log(
                [
                    (AnsiColor.INFO, "[INFO]"),
                    (None, ": Downloaded data for coach "),
                    (AnsiColor.DATA, username),
                ]
            )
            await asyncio.sleep(SLEEP_SECS)
        else:
            self.log(
                [
                    (AnsiColor.INFO, "[INFO]"),
                    (None, ": Skipping download for coach "),
                    (AnsiColor.DATA, username),
                ]
            )

    async def _download_profile_file(self, url: str, username: str, filename: str):
        """Write the contents of url into the specified file.

        @param url
            The URL of the file to download.
        @param username
            The coach username corresponding to the downloaded file.
        @param filename
            The output file to write the downloaded content to.
        @return
            True if we made a network request. False otherwise.
        """
        if os.path.isfile(filename):
            return False

        response, _unused_status = await self.request(url)
        if response is not None:
            with open(filename, "w") as f:
                f.write(response)

        return True
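

# The filters below are handed to SoupStrainer so that BeautifulSoup only
# parses the fragments the Exporter actually reads, keeping parsing cheap.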
def _profile_filter(elem, attrs):
    """Includes only relevant segments of the `{username}.html` file."""
    if "coach-widget" in attrs.get("class", ""):
        return True


def _stats_filter(elem, attrs):
    """Includes only relevant segments of the `stats.html` file."""
    if "profile-side" in attrs.get("class", ""):
        return True
    if "sub-ratings" in attrs.get("class", ""):
        return True


class Exporter(BaseExporter):
    def __init__(self, username: str):
        super().__init__(site=Site.LICHESS.value, username=username)

        self.profile_soup = None
        try:
            with open(self.path_coach_file(username, f"{username}.html"), "r") as f:
                self.profile_soup = BeautifulSoup(
                    f.read(), "lxml", parse_only=SoupStrainer(_profile_filter)
                )
        except FileNotFoundError:
            pass

        self.stats_soup = None
        try:
            with open(self.path_coach_file(username, "stats.html"), "r") as f:
                self.stats_soup = BeautifulSoup(
                    f.read(), "lxml", parse_only=SoupStrainer(_stats_filter)
                )
        except FileNotFoundError:
            pass

    def export_name(self) -> Union[str, None]:
        try:
            profile_side = self.stats_soup.find("div", class_="profile-side")
            user_infos = profile_side.find("div", class_="user-infos")
            name = user_infos.find("strong", class_="name")
            return name.get_text().strip()
        except AttributeError:
            return None

    def export_image_url(self) -> Union[str, None]:
        try:
            picture = self.profile_soup.find("img", class_="picture")
            src = picture.get("src", "")
            if "image.lichess1.org" in src:
                return src
        except AttributeError:
            return None

    def export_rapid(self) -> Union[int, None]:
        return self._find_rating("rapid")

    def export_blitz(self) -> Union[int, None]:
        return self._find_rating("blitz")

    def export_bullet(self) -> Union[int, None]:
        return self._find_rating("bullet")

    def _find_rating(self, name: str) -> Union[int, None]:
        try:
            a = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
            rating = a.find("rating")
            strong = rating.find("strong")
            value = strong.get_text()
            # Lichess suffixes provisional ratings with "?"; drop it before
            # parsing the number.
            if value[-1] == "?":
                value = value[:-1]
            return int(value)
        except (AttributeError, ValueError):
            return None
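

# A minimal usage sketch (hypothetical; the real driver lives elsewhere in
# the repo). It assumes BaseScraper.request/log and the path_* helpers behave
# as they are used above:
#
#   async def main():
#       async with aiohttp.ClientSession() as session:
#           scraper = Scraper(session)
#           for username in await scraper.download_usernames():
#               await scraper.download_profile(username)
#               print(Exporter(username).export_name())
#
#   asyncio.run(main())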