diff --git a/app/chesscom.py b/app/chesscom.py index ea0ef3a..0b0f4b0 100644 --- a/app/chesscom.py +++ b/app/chesscom.py @@ -10,7 +10,7 @@ from bs4 import BeautifulSoup, SoupStrainer, Tag from app.pipeline import Extractor as BaseExtractor from app.pipeline import Fetcher as BaseFetcher from app.pipeline import Pipeline as BasePipeline -from app.types import Site +from app.types import Site, Title # The number of coach listing pages we will at most iterate through. This number # was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and @@ -156,6 +156,18 @@ class Extractor(BaseExtractor): return None return src + def get_title(self) -> Title | None: + if self.profile_soup is None: + return None + a = self.profile_soup.find("a", class_="profile-card-chesstitle") + if not isinstance(a, Tag): + return None + title = a.get_text().strip() + try: + return Title(title) + except ValueError: + return None + def get_languages(self) -> List[str] | None: # TODO: Extract using huggingface model. return None diff --git a/app/database.py b/app/database.py index 38b2e39..b3efcf0 100644 --- a/app/database.py +++ b/app/database.py @@ -4,7 +4,7 @@ from typing import List, Literal from typing_extensions import TypedDict -from app.types import Site, code_to_lang +from app.types import Site, Title, code_to_lang SCHEMA_NAME = "coach_scraper" MAIN_TABLE_NAME = "export" @@ -16,6 +16,7 @@ RowKey = ( | Literal["username"] | Literal["name"] | Literal["image_url"] + | Literal["title"] | Literal["languages"] | Literal["rapid"] | Literal["blitz"] @@ -37,6 +38,8 @@ class Row(TypedDict, total=False): name: str # Profile image used on the source site. image_url: str + # The FIDE title assigned to the coach on the source site. + title: Title # The list of languages the coach is fluent in. languages: List[str] # Rapid rating relative to the site they were sourced from. 
@@ -120,6 +123,7 @@ def upsert_row(conn, row: Row): , username , name , image_url + , title , languages , rapid , blitz @@ -134,12 +138,14 @@ def upsert_row(conn, row: Row): , %s , %s , %s + , %s ) ON CONFLICT (site, username) DO UPDATE SET name = EXCLUDED.name, image_url = EXCLUDED.image_url, + title = EXCLUDED.title, languages = EXCLUDED.languages, rapid = EXCLUDED.rapid, blitz = EXCLUDED.blitz, @@ -150,6 +156,7 @@ def upsert_row(conn, row: Row): row["username"], row.get("name"), row.get("image_url"), + row["title"].value if "title" in row else None, row.get("languages", []), row.get("rapid"), row.get("blitz"), diff --git a/app/lichess.py b/app/lichess.py index e830ee8..d151793 100644 --- a/app/lichess.py +++ b/app/lichess.py @@ -9,7 +9,7 @@ from bs4 import BeautifulSoup, SoupStrainer, Tag from app.pipeline import Extractor as BaseExtractor from app.pipeline import Fetcher as BaseFetcher from app.pipeline import Pipeline as BasePipeline -from app.types import Site, lang_to_code +from app.types import Site, Title, lang_to_code # The number of pages we will at most iterate through. 
This number was # determined by going to https://lichess.org/coach/all/all/alphabetical @@ -103,6 +103,8 @@ def _profile_filter(elem: Tag | str | None, attrs={}) -> bool: def _stats_filter(elem: Tag | str | None, attrs={}) -> bool: + if "user-link" in attrs.get("class", ""): + return True if "profile-side" in attrs.get("class", ""): return True if "sub-ratings" in attrs.get("class", ""): @@ -161,6 +163,18 @@ class Extractor(BaseExtractor): return None return src + def get_title(self) -> Title | None: + if self.stats_soup is None: + return None + utitle = self.stats_soup.find("span", class_="utitle") + if not isinstance(utitle, Tag): + return None + title = utitle.get_text().strip() + try: + return Title(title) + except ValueError: + return None + def get_languages(self) -> List[str] | None: if self.profile_soup is None: return None diff --git a/app/pipeline.py b/app/pipeline.py index 4292ee8..96ba96f 100644 --- a/app/pipeline.py +++ b/app/pipeline.py @@ -5,7 +5,7 @@ from typing import Any, List, Tuple import aiohttp from app.database import Row, RowKey, upsert_row -from app.types import Site +from app.types import Site, Title class Fetcher: @@ -104,6 +104,9 @@ class Extractor: def get_image_url(self) -> str | None: raise NotImplementedError() + def get_title(self) -> Title | None: + raise NotImplementedError() + def get_languages(self) -> List[str] | None: raise NotImplementedError() @@ -125,6 +128,7 @@ class Extractor: _insert(row, "name", self.get_name()) _insert(row, "image_url", self.get_image_url()) + _insert(row, "title", self.get_title()) _insert(row, "languages", self.get_languages()) _insert(row, "rapid", self.get_rapid()) _insert(row, "blitz", self.get_blitz()) diff --git a/app/types.py b/app/types.py index b19b76c..a53797b 100644 --- a/app/types.py +++ b/app/types.py @@ -7,6 +7,19 @@ class Site(enum.Enum): LICHESS = "lichess" +class Title(enum.Enum): + GM = "GM" # Grandmaster + IM = "IM" # International master + FM = "FM" # FIDE master + CM = "CM" # 
Candidate master + NM = "NM" # National master + WGM = "WGM" + WIM = "WIM" + WFM = "WFM" + WCM = "WCM" + WNM = "WNM" + + class Language(enum.Enum): en_GB = "English" af_ZA = "Afrikaans" diff --git a/sql/init.sql b/sql/init.sql index 7c79e23..b3e953e 100644 --- a/sql/init.sql +++ b/sql/init.sql @@ -9,6 +9,7 @@ CREATE TABLE coach_scraper.export , name VARCHAR(255) , image_url TEXT , languages TEXT[] + , title VARCHAR(3) , rapid INT , blitz INT , bullet INT