Scrape titles.

2023-12-06 19:52:40 -07:00 · 2023-12-06 19:52:40 -07:00 · 47e8d245c3
parent 0b9a721368
commit 47e8d245c3
6 changed files with 55 additions and 4 deletions
--- a/app/chesscom.py
+++ b/app/chesscom.py
@ -10,7 +10,7 @@ from bs4 import BeautifulSoup, SoupStrainer, Tag
 from app.pipeline import Extractor as BaseExtractor
 from app.pipeline import Fetcher as BaseFetcher
 from app.pipeline import Pipeline as BasePipeline
-from app.types import Site
+from app.types import Site, Title

 # The number of coach listing pages we will at most iterate through. This number
 # was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and
@ -156,6 +156,18 @@ class Extractor(BaseExtractor):
            return None
        return src

+    def get_title(self) -> Title | None:
+        if self.profile_soup is None:  # nothing to search without a profile soup
+            return None
+        a = self.profile_soup.find("a", class_="profile-card-chesstitle")
+        if not isinstance(a, Tag):  # find() did not yield a Tag
+            return None
+        title = a.get_text().strip()
+        try:
+            return Title(title)
+        except ValueError:  # text is not one of the Title enum values
+            return None
+
    def get_languages(self) -> List[str] | None:
        # TODO: Extract using huggingface model.
        return None
--- a/app/database.py
+++ b/app/database.py
@ -4,7 +4,7 @@ from typing import List, Literal

 from typing_extensions import TypedDict

-from app.types import Site, code_to_lang
+from app.types import Site, Title, code_to_lang

 SCHEMA_NAME = "coach_scraper"
 MAIN_TABLE_NAME = "export"
@ -16,6 +16,7 @@ RowKey = (
    | Literal["username"]
    | Literal["name"]
    | Literal["image_url"]
+    | Literal["title"]
    | Literal["languages"]
    | Literal["rapid"]
    | Literal["blitz"]
@ -37,6 +38,8 @@ class Row(TypedDict, total=False):
    name: str
    # Profile image used on the source site.
    image_url: str
+    # The FIDE title assigned to the coach on the source site.
+    title: Title
    # The list of languages the coach is fluent in.
    languages: List[str]
    # Rapid rating relative to the site they were sourced from.
@ -120,6 +123,7 @@ def upsert_row(conn, row: Row):
              , username
              , name
              , image_url
+              , title
              , languages
              , rapid
              , blitz
@ -134,12 +138,14 @@ def upsert_row(conn, row: Row):
              , %s
              , %s
              , %s
+              , %s
              )
            ON CONFLICT
              (site, username)
            DO UPDATE SET
              name = EXCLUDED.name,
              image_url = EXCLUDED.image_url,
+              title = EXCLUDED.title,
              languages = EXCLUDED.languages,
              rapid = EXCLUDED.rapid,
              blitz = EXCLUDED.blitz,
@ -150,6 +156,7 @@ def upsert_row(conn, row: Row):
                row["username"],
                row.get("name"),
                row.get("image_url"),
+                row["title"].value if "title" in row else None,
                row.get("languages", []),
                row.get("rapid"),
                row.get("blitz"),
--- a/app/lichess.py
+++ b/app/lichess.py
@ -9,7 +9,7 @@ from bs4 import BeautifulSoup, SoupStrainer, Tag
 from app.pipeline import Extractor as BaseExtractor
 from app.pipeline import Fetcher as BaseFetcher
 from app.pipeline import Pipeline as BasePipeline
-from app.types import Site, lang_to_code
+from app.types import Site, Title, lang_to_code

 # The number of pages we will at most iterate through. This number was
 # determined by going to https://lichess.org/coach/all/all/alphabetical
@ -103,6 +103,8 @@ def _profile_filter(elem: Tag | str | None, attrs={}) -> bool:


 def _stats_filter(elem: Tag | str | None, attrs={}) -> bool:
+    if "user-link" in attrs.get("class", ""):
+        return True
    if "profile-side" in attrs.get("class", ""):
        return True
    if "sub-ratings" in attrs.get("class", ""):
@ -161,6 +163,18 @@ class Extractor(BaseExtractor):
            return None
        return src

+    def get_title(self) -> Title | None:
+        if self.stats_soup is None:  # nothing to search without a stats soup
+            return None
+        utitle = self.stats_soup.find("span", class_="utitle")
+        if not isinstance(utitle, Tag):  # find() did not yield a Tag
+            return None
+        title = utitle.get_text().strip()
+        try:
+            return Title(title)
+        except ValueError:  # text is not one of the Title enum values
+            return None
+
    def get_languages(self) -> List[str] | None:
        if self.profile_soup is None:
            return None
--- a/app/pipeline.py
+++ b/app/pipeline.py
@ -5,7 +5,7 @@ from typing import Any, List, Tuple
 import aiohttp

 from app.database import Row, RowKey, upsert_row
-from app.types import Site
+from app.types import Site, Title


 class Fetcher:
@ -104,6 +104,9 @@ class Extractor:
    def get_image_url(self) -> str | None:
        raise NotImplementedError()

+    def get_title(self) -> Title | None:
+        raise NotImplementedError()  # overridden by the site-specific extractors
+
    def get_languages(self) -> List[str] | None:
        raise NotImplementedError()

@ -125,6 +128,7 @@ class Extractor:

        _insert(row, "name", self.get_name())
        _insert(row, "image_url", self.get_image_url())
+        _insert(row, "title", self.get_title())
        _insert(row, "languages", self.get_languages())
        _insert(row, "rapid", self.get_rapid())
        _insert(row, "blitz", self.get_blitz())
--- a/app/types.py
+++ b/app/types.py
@ -7,6 +7,19 @@ class Site(enum.Enum):
    LICHESS = "lichess"


+class Title(enum.Enum):
+    GM = "GM"  # Grandmaster
+    IM = "IM"  # International master
+    FM = "FM"  # FIDE master
+    CM = "CM"  # Candidate master
+    NM = "NM"  # National master
+    WGM = "WGM"  # Woman grandmaster
+    WIM = "WIM"  # Woman international master
+    WFM = "WFM"  # Woman FIDE master
+    WCM = "WCM"  # Woman candidate master
+    WNM = "WNM"  # Woman national master
+
+
 class Language(enum.Enum):
    en_GB = "English"
    af_ZA = "Afrikaans"
--- a/sql/init.sql
+++ b/sql/init.sql
@ -9,6 +9,7 @@ CREATE TABLE coach_scraper.export
  , name VARCHAR(255)
  , image_url TEXT
  , languages TEXT[]
+  , title VARCHAR(3)
  , rapid INT
  , blitz INT
  , bullet INT