Scrape titles.
parent
0b9a721368
commit
47e8d245c3
|
@ -10,7 +10,7 @@ from bs4 import BeautifulSoup, SoupStrainer, Tag
|
|||
from app.pipeline import Extractor as BaseExtractor
|
||||
from app.pipeline import Fetcher as BaseFetcher
|
||||
from app.pipeline import Pipeline as BasePipeline
|
||||
from app.types import Site
|
||||
from app.types import Site, Title
|
||||
|
||||
# The number of coach listing pages we will at most iterate through. This number
|
||||
# was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and
|
||||
|
@ -156,6 +156,18 @@ class Extractor(BaseExtractor):
|
|||
return None
|
||||
return src
|
||||
|
||||
def get_title(self) -> Title | None:
    """Return the coach's title as shown on the scraped profile page.

    Looks up the ``a.profile-card-chesstitle`` element in ``profile_soup``
    and converts its whitespace-stripped text into a ``Title``.

    Returns:
        The matching ``Title`` member, or ``None`` when the profile soup is
        missing, the element is absent, or the text is not a ``Title`` value.
    """
    soup = self.profile_soup
    if soup is None:
        return None

    anchor = soup.find("a", class_="profile-card-chesstitle")
    if isinstance(anchor, Tag):
        label = anchor.get_text().strip()
        try:
            return Title(label)
        except ValueError:
            # The text is not one of the abbreviations Title knows about.
            pass
    return None
|
||||
|
||||
def get_languages(self) -> List[str] | None:
    """Return the coach's languages.

    Extraction is not implemented yet, so this always reports ``None``.
    """
    # TODO: Extract using huggingface model.
    languages = None
    return languages
|
||||
|
|
|
@ -4,7 +4,7 @@ from typing import List, Literal
|
|||
|
||||
from typing_extensions import TypedDict
|
||||
|
||||
from app.types import Site, code_to_lang
|
||||
from app.types import Site, Title, code_to_lang
|
||||
|
||||
SCHEMA_NAME = "coach_scraper"
|
||||
MAIN_TABLE_NAME = "export"
|
||||
|
@ -16,6 +16,7 @@ RowKey = (
|
|||
| Literal["username"]
|
||||
| Literal["name"]
|
||||
| Literal["image_url"]
|
||||
| Literal["title"]
|
||||
| Literal["languages"]
|
||||
| Literal["rapid"]
|
||||
| Literal["blitz"]
|
||||
|
@ -37,6 +38,8 @@ class Row(TypedDict, total=False):
|
|||
name: str
|
||||
# Profile image used on the source site.
|
||||
image_url: str
|
||||
# The FIDE title assigned to the coach on the source site.
|
||||
title: Title
|
||||
# The list of languages the coach is fluent in.
|
||||
languages: List[str]
|
||||
# Rapid rating relative to the site they were sourced from.
|
||||
|
@ -120,6 +123,7 @@ def upsert_row(conn, row: Row):
|
|||
, username
|
||||
, name
|
||||
, image_url
|
||||
, title
|
||||
, languages
|
||||
, rapid
|
||||
, blitz
|
||||
|
@ -134,12 +138,14 @@ def upsert_row(conn, row: Row):
|
|||
, %s
|
||||
, %s
|
||||
, %s
|
||||
, %s
|
||||
)
|
||||
ON CONFLICT
|
||||
(site, username)
|
||||
DO UPDATE SET
|
||||
name = EXCLUDED.name,
|
||||
image_url = EXCLUDED.image_url,
|
||||
title = EXCLUDED.title,
|
||||
languages = EXCLUDED.languages,
|
||||
rapid = EXCLUDED.rapid,
|
||||
blitz = EXCLUDED.blitz,
|
||||
|
@ -150,6 +156,7 @@ def upsert_row(conn, row: Row):
|
|||
row["username"],
|
||||
row.get("name"),
|
||||
row.get("image_url"),
|
||||
row["title"].value if "title" in row else None,
|
||||
row.get("languages", []),
|
||||
row.get("rapid"),
|
||||
row.get("blitz"),
|
||||
|
|
|
@ -9,7 +9,7 @@ from bs4 import BeautifulSoup, SoupStrainer, Tag
|
|||
from app.pipeline import Extractor as BaseExtractor
|
||||
from app.pipeline import Fetcher as BaseFetcher
|
||||
from app.pipeline import Pipeline as BasePipeline
|
||||
from app.types import Site, lang_to_code
|
||||
from app.types import Site, Title, lang_to_code
|
||||
|
||||
# The number of pages we will at most iterate through. This number was
|
||||
# determined by going to https://lichess.org/coach/all/all/alphabetical
|
||||
|
@ -103,6 +103,8 @@ def _profile_filter(elem: Tag | str | None, attrs={}) -> bool:
|
|||
|
||||
|
||||
def _stats_filter(elem: Tag | str | None, attrs={}) -> bool:
|
||||
if "user-link" in attrs.get("class", ""):
|
||||
return True
|
||||
if "profile-side" in attrs.get("class", ""):
|
||||
return True
|
||||
if "sub-ratings" in attrs.get("class", ""):
|
||||
|
@ -161,6 +163,18 @@ class Extractor(BaseExtractor):
|
|||
return None
|
||||
return src
|
||||
|
||||
def get_title(self) -> Title | None:
    """Return the coach's title as shown in the scraped stats markup.

    Reads the ``span.utitle`` element from ``stats_soup`` and converts its
    whitespace-stripped text into a ``Title``.

    Returns:
        The matching ``Title`` member, or ``None`` when the stats soup is
        missing, the element is absent, or the text is not a ``Title`` value.
    """
    if self.stats_soup is None:
        return None

    node = self.stats_soup.find("span", class_="utitle")
    text = node.get_text().strip() if isinstance(node, Tag) else None
    if text is None:
        return None

    try:
        parsed = Title(text)
    except ValueError:
        # The text is not one of the abbreviations Title knows about.
        return None
    return parsed
|
||||
|
||||
def get_languages(self) -> List[str] | None:
|
||||
if self.profile_soup is None:
|
||||
return None
|
||||
|
|
|
@ -5,7 +5,7 @@ from typing import Any, List, Tuple
|
|||
import aiohttp
|
||||
|
||||
from app.database import Row, RowKey, upsert_row
|
||||
from app.types import Site
|
||||
from app.types import Site, Title
|
||||
|
||||
|
||||
class Fetcher:
|
||||
|
@ -104,6 +104,9 @@ class Extractor:
|
|||
def get_image_url(self) -> str | None:
    """Return the coach's profile image URL; subclasses must override.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError
|
||||
|
||||
def get_title(self) -> Title | None:
    """Return the coach's title; subclasses must override.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError
|
||||
|
||||
def get_languages(self) -> List[str] | None:
    """Return the coach's languages; subclasses must override.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError
|
||||
|
||||
|
@ -125,6 +128,7 @@ class Extractor:
|
|||
|
||||
_insert(row, "name", self.get_name())
|
||||
_insert(row, "image_url", self.get_image_url())
|
||||
_insert(row, "title", self.get_title())
|
||||
_insert(row, "languages", self.get_languages())
|
||||
_insert(row, "rapid", self.get_rapid())
|
||||
_insert(row, "blitz", self.get_blitz())
|
||||
|
|
13
app/types.py
13
app/types.py
|
@ -7,6 +7,19 @@ class Site(enum.Enum):
|
|||
LICHESS = "lichess"
|
||||
|
||||
|
||||
@enum.unique
class Title(enum.Enum):
    """Chess titles a coach may hold, keyed by their abbreviation.

    Each member's value is the abbreviation string itself, so scraped text
    can be converted with ``Title("GM")``; an unrecognized string raises
    ``ValueError``. ``enum.unique`` guards against two members accidentally
    sharing a value and silently becoming aliases of one another.
    """

    GM = "GM"  # Grandmaster
    IM = "IM"  # International master
    FM = "FM"  # FIDE master
    CM = "CM"  # Candidate master
    NM = "NM"  # National master
    WGM = "WGM"  # Woman grandmaster
    WIM = "WIM"  # Woman international master
    WFM = "WFM"  # Woman FIDE master
    WCM = "WCM"  # Woman candidate master
    WNM = "WNM"  # Woman national master
|
||||
|
||||
|
||||
class Language(enum.Enum):
|
||||
en_GB = "English"
|
||||
af_ZA = "Afrikaans"
|
||||
|
|
|
@ -9,6 +9,7 @@ CREATE TABLE coach_scraper.export
|
|||
, name VARCHAR(255)
|
||||
, image_url TEXT
|
||||
, languages TEXT[]
|
||||
, title VARCHAR(3)
|
||||
, rapid INT
|
||||
, blitz INT
|
||||
, bullet INT
|
||||
|
|
Loading…
Reference in New Issue