Scrape titles.

main
Joshua Potter 2023-12-06 19:52:40 -07:00
parent 0b9a721368
commit 47e8d245c3
6 changed files with 55 additions and 4 deletions

View File

@ -10,7 +10,7 @@ from bs4 import BeautifulSoup, SoupStrainer, Tag
from app.pipeline import Extractor as BaseExtractor
from app.pipeline import Fetcher as BaseFetcher
from app.pipeline import Pipeline as BasePipeline
from app.types import Site
from app.types import Site, Title
# The number of coach listing pages we will at most iterate through. This number
# was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and
@ -156,6 +156,18 @@ class Extractor(BaseExtractor):
return None
return src
def get_title(self) -> Title | None:
    """Extract the coach's chess title from the profile page.

    Looks up the first ``<a>`` element carrying the
    ``profile-card-chesstitle`` class in ``self.profile_soup`` and converts
    its stripped text into a ``Title``.

    Returns:
        The matching ``Title`` member, or ``None`` when the profile soup is
        unavailable, no such anchor tag exists, or the anchor text is not a
        valid ``Title`` value.
    """
    soup = self.profile_soup
    if soup is None:
        return None

    anchor = soup.find("a", class_="profile-card-chesstitle")
    if isinstance(anchor, Tag):
        label = anchor.get_text().strip()
        try:
            return Title(label)
        except ValueError:
            # The text is not one of the values enumerated by `Title`.
            pass
    return None
def get_languages(self) -> List[str] | None:
# TODO: Extract using huggingface model.
return None

View File

@ -4,7 +4,7 @@ from typing import List, Literal
from typing_extensions import TypedDict
from app.types import Site, code_to_lang
from app.types import Site, Title, code_to_lang
SCHEMA_NAME = "coach_scraper"
MAIN_TABLE_NAME = "export"
@ -16,6 +16,7 @@ RowKey = (
| Literal["username"]
| Literal["name"]
| Literal["image_url"]
| Literal["title"]
| Literal["languages"]
| Literal["rapid"]
| Literal["blitz"]
@ -37,6 +38,8 @@ class Row(TypedDict, total=False):
name: str
# Profile image used on the source site.
image_url: str
# The FIDE title assigned to the coach on the source site.
title: Title
# The list of languages the coach is fluent in.
languages: List[str]
# Rapid rating relative to the site they were sourced from.
@ -120,6 +123,7 @@ def upsert_row(conn, row: Row):
, username
, name
, image_url
, title
, languages
, rapid
, blitz
@ -134,12 +138,14 @@ def upsert_row(conn, row: Row):
, %s
, %s
, %s
, %s
)
ON CONFLICT
(site, username)
DO UPDATE SET
name = EXCLUDED.name,
image_url = EXCLUDED.image_url,
title = EXCLUDED.title,
languages = EXCLUDED.languages,
rapid = EXCLUDED.rapid,
blitz = EXCLUDED.blitz,
@ -150,6 +156,7 @@ def upsert_row(conn, row: Row):
row["username"],
row.get("name"),
row.get("image_url"),
row["title"].value if "title" in row else None,
row.get("languages", []),
row.get("rapid"),
row.get("blitz"),

View File

@ -9,7 +9,7 @@ from bs4 import BeautifulSoup, SoupStrainer, Tag
from app.pipeline import Extractor as BaseExtractor
from app.pipeline import Fetcher as BaseFetcher
from app.pipeline import Pipeline as BasePipeline
from app.types import Site, lang_to_code
from app.types import Site, Title, lang_to_code
# The number of pages we will at most iterate through. This number was
# determined by going to https://lichess.org/coach/all/all/alphabetical
@ -103,6 +103,8 @@ def _profile_filter(elem: Tag | str | None, attrs={}) -> bool:
def _stats_filter(elem: Tag | str | None, attrs={}) -> bool:
if "user-link" in attrs.get("class", ""):
return True
if "profile-side" in attrs.get("class", ""):
return True
if "sub-ratings" in attrs.get("class", ""):
@ -161,6 +163,18 @@ class Extractor(BaseExtractor):
return None
return src
def get_title(self) -> Title | None:
    """Extract the coach's chess title from the stats page.

    Looks up the first ``<span>`` element carrying the ``utitle`` class in
    ``self.stats_soup`` and converts its stripped text into a ``Title``.

    Returns:
        The matching ``Title`` member, or ``None`` when the stats soup is
        unavailable, no such span tag exists, or the span text is not a
        valid ``Title`` value.
    """
    soup = self.stats_soup
    if soup is None:
        return None

    span = soup.find("span", class_="utitle")
    if not isinstance(span, Tag):
        return None

    label = span.get_text().strip()
    try:
        parsed = Title(label)
    except ValueError:
        # The text is not one of the values enumerated by `Title`.
        return None
    else:
        return parsed
def get_languages(self) -> List[str] | None:
if self.profile_soup is None:
return None

View File

@ -5,7 +5,7 @@ from typing import Any, List, Tuple
import aiohttp
from app.database import Row, RowKey, upsert_row
from app.types import Site
from app.types import Site, Title
class Fetcher:
@ -104,6 +104,9 @@ class Extractor:
def get_image_url(self) -> str | None:
raise NotImplementedError()
def get_title(self) -> Title | None:
    """Return the coach's chess title, or ``None`` if it cannot be found.

    This base implementation is a placeholder that always raises;
    site-specific extractors are expected to override it.

    Raises:
        NotImplementedError: Always, unless overridden by a subclass.
    """
    raise NotImplementedError()
def get_languages(self) -> List[str] | None:
raise NotImplementedError()
@ -125,6 +128,7 @@ class Extractor:
_insert(row, "name", self.get_name())
_insert(row, "image_url", self.get_image_url())
_insert(row, "title", self.get_title())
_insert(row, "languages", self.get_languages())
_insert(row, "rapid", self.get_rapid())
_insert(row, "blitz", self.get_blitz())

View File

@ -7,6 +7,19 @@ class Site(enum.Enum):
LICHESS = "lichess"
class Title(enum.Enum):
    """Chess titles recognized by the scraper.

    Each member's value is the abbreviation as it is stored and looked up,
    so ``Title("GM")`` yields ``Title.GM`` and an unknown abbreviation raises
    ``ValueError``.
    """

    # Open titles.
    GM = "GM"  # Grandmaster
    IM = "IM"  # International master
    FM = "FM"  # FIDE master
    CM = "CM"  # Candidate master
    NM = "NM"  # National master

    # Women's titles.
    WGM = "WGM"  # Woman grandmaster
    WIM = "WIM"  # Woman international master
    WFM = "WFM"  # Woman FIDE master
    WCM = "WCM"  # Woman candidate master
    WNM = "WNM"  # Woman national master
class Language(enum.Enum):
en_GB = "English"
af_ZA = "Afrikaans"

View File

@ -9,6 +9,7 @@ CREATE TABLE coach_scraper.export
, name VARCHAR(255)
, image_url TEXT
, languages TEXT[]
, title VARCHAR(3)
, rapid INT
, blitz INT
, bullet INT