Scrape titles.

main
Joshua Potter 2023-12-06 19:52:40 -07:00
parent 0b9a721368
commit 47e8d245c3
6 changed files with 55 additions and 4 deletions

View File

@ -10,7 +10,7 @@ from bs4 import BeautifulSoup, SoupStrainer, Tag
from app.pipeline import Extractor as BaseExtractor from app.pipeline import Extractor as BaseExtractor
from app.pipeline import Fetcher as BaseFetcher from app.pipeline import Fetcher as BaseFetcher
from app.pipeline import Pipeline as BasePipeline from app.pipeline import Pipeline as BasePipeline
from app.types import Site from app.types import Site, Title
# The number of coach listing pages we will at most iterate through. This number # The number of coach listing pages we will at most iterate through. This number
# was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and # was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and
@ -156,6 +156,18 @@ class Extractor(BaseExtractor):
return None return None
return src return src
def get_title(self) -> Title | None:
    """Return the coach's title as shown on the profile page.

    Searches ``self.profile_soup`` for an ``<a>`` element carrying the
    ``profile-card-chesstitle`` class and converts its stripped text into
    a ``Title``.

    Returns ``None`` when the profile soup is missing, when no such tag is
    found, or when the text does not match any ``Title`` value.
    """
    soup = self.profile_soup
    if soup is None:
        return None

    anchor = soup.find("a", class_="profile-card-chesstitle")
    if isinstance(anchor, Tag):
        label = anchor.get_text().strip()
        try:
            parsed = Title(label)
        except ValueError:
            # The displayed text is not one of the known Title values.
            return None
        return parsed

    return None
def get_languages(self) -> List[str] | None: def get_languages(self) -> List[str] | None:
# TODO: Extract using huggingface model. # TODO: Extract using huggingface model.
return None return None

View File

@ -4,7 +4,7 @@ from typing import List, Literal
from typing_extensions import TypedDict from typing_extensions import TypedDict
from app.types import Site, code_to_lang from app.types import Site, Title, code_to_lang
SCHEMA_NAME = "coach_scraper" SCHEMA_NAME = "coach_scraper"
MAIN_TABLE_NAME = "export" MAIN_TABLE_NAME = "export"
@ -16,6 +16,7 @@ RowKey = (
| Literal["username"] | Literal["username"]
| Literal["name"] | Literal["name"]
| Literal["image_url"] | Literal["image_url"]
| Literal["title"]
| Literal["languages"] | Literal["languages"]
| Literal["rapid"] | Literal["rapid"]
| Literal["blitz"] | Literal["blitz"]
@ -37,6 +38,8 @@ class Row(TypedDict, total=False):
name: str name: str
# Profile image used on the source site. # Profile image used on the source site.
image_url: str image_url: str
# The FIDE title assigned to the coach on the source site.
title: Title
# The list of languages the coach is fluent in. # The list of languages the coach is fluent in.
languages: List[str] languages: List[str]
# Rapid rating relative to the site they were sourced from. # Rapid rating relative to the site they were sourced from.
@ -120,6 +123,7 @@ def upsert_row(conn, row: Row):
, username , username
, name , name
, image_url , image_url
, title
, languages , languages
, rapid , rapid
, blitz , blitz
@ -134,12 +138,14 @@ def upsert_row(conn, row: Row):
, %s , %s
, %s , %s
, %s , %s
, %s
) )
ON CONFLICT ON CONFLICT
(site, username) (site, username)
DO UPDATE SET DO UPDATE SET
name = EXCLUDED.name, name = EXCLUDED.name,
image_url = EXCLUDED.image_url, image_url = EXCLUDED.image_url,
title = EXCLUDED.title,
languages = EXCLUDED.languages, languages = EXCLUDED.languages,
rapid = EXCLUDED.rapid, rapid = EXCLUDED.rapid,
blitz = EXCLUDED.blitz, blitz = EXCLUDED.blitz,
@ -150,6 +156,7 @@ def upsert_row(conn, row: Row):
row["username"], row["username"],
row.get("name"), row.get("name"),
row.get("image_url"), row.get("image_url"),
row["title"].value if "title" in row else None,
row.get("languages", []), row.get("languages", []),
row.get("rapid"), row.get("rapid"),
row.get("blitz"), row.get("blitz"),

View File

@ -9,7 +9,7 @@ from bs4 import BeautifulSoup, SoupStrainer, Tag
from app.pipeline import Extractor as BaseExtractor from app.pipeline import Extractor as BaseExtractor
from app.pipeline import Fetcher as BaseFetcher from app.pipeline import Fetcher as BaseFetcher
from app.pipeline import Pipeline as BasePipeline from app.pipeline import Pipeline as BasePipeline
from app.types import Site, lang_to_code from app.types import Site, Title, lang_to_code
# The number of pages we will at most iterate through. This number was # The number of pages we will at most iterate through. This number was
# determined by going to https://lichess.org/coach/all/all/alphabetical # determined by going to https://lichess.org/coach/all/all/alphabetical
@ -103,6 +103,8 @@ def _profile_filter(elem: Tag | str | None, attrs={}) -> bool:
def _stats_filter(elem: Tag | str | None, attrs={}) -> bool: def _stats_filter(elem: Tag | str | None, attrs={}) -> bool:
if "user-link" in attrs.get("class", ""):
return True
if "profile-side" in attrs.get("class", ""): if "profile-side" in attrs.get("class", ""):
return True return True
if "sub-ratings" in attrs.get("class", ""): if "sub-ratings" in attrs.get("class", ""):
@ -161,6 +163,18 @@ class Extractor(BaseExtractor):
return None return None
return src return src
def get_title(self) -> Title | None:
    """Return the coach's title as shown on the stats page.

    Searches ``self.stats_soup`` for a ``<span>`` element carrying the
    ``utitle`` class and converts its stripped text into a ``Title``.

    Returns ``None`` when the stats soup is missing, when no such tag is
    found, or when the text does not match any ``Title`` value.
    """
    soup = self.stats_soup
    if soup is None:
        return None

    span = soup.find("span", class_="utitle")
    if isinstance(span, Tag):
        label = span.get_text().strip()
        try:
            parsed = Title(label)
        except ValueError:
            # The displayed text is not one of the known Title values.
            return None
        return parsed

    return None
def get_languages(self) -> List[str] | None: def get_languages(self) -> List[str] | None:
if self.profile_soup is None: if self.profile_soup is None:
return None return None

View File

@ -5,7 +5,7 @@ from typing import Any, List, Tuple
import aiohttp import aiohttp
from app.database import Row, RowKey, upsert_row from app.database import Row, RowKey, upsert_row
from app.types import Site from app.types import Site, Title
class Fetcher: class Fetcher:
@ -104,6 +104,9 @@ class Extractor:
def get_image_url(self) -> str | None: def get_image_url(self) -> str | None:
raise NotImplementedError() raise NotImplementedError()
def get_title(self) -> Title | None:
    """Return the coach's title, or ``None`` when one cannot be found.

    This base implementation is a stub that always raises
    ``NotImplementedError``; concrete extractors are expected to
    override it.
    """
    raise NotImplementedError
def get_languages(self) -> List[str] | None: def get_languages(self) -> List[str] | None:
raise NotImplementedError() raise NotImplementedError()
@ -125,6 +128,7 @@ class Extractor:
_insert(row, "name", self.get_name()) _insert(row, "name", self.get_name())
_insert(row, "image_url", self.get_image_url()) _insert(row, "image_url", self.get_image_url())
_insert(row, "title", self.get_title())
_insert(row, "languages", self.get_languages()) _insert(row, "languages", self.get_languages())
_insert(row, "rapid", self.get_rapid()) _insert(row, "rapid", self.get_rapid())
_insert(row, "blitz", self.get_blitz()) _insert(row, "blitz", self.get_blitz())

View File

@ -7,6 +7,19 @@ class Site(enum.Enum):
LICHESS = "lichess" LICHESS = "lichess"
@enum.unique
class Title(enum.Enum):
    """Chess titles that can be attached to a coach.

    Each member's value is the title's abbreviation, so a scraped string
    can be resolved directly with ``Title("GM")``; an unrecognised string
    raises ``ValueError``.

    ``enum.unique`` makes a duplicated abbreviation fail at import time
    instead of silently turning the second member into an alias.
    """

    # NOTE: the export table stores the value in a VARCHAR(3) column, so
    # any abbreviation added here must be at most three characters long.
    GM = "GM"  # Grandmaster
    IM = "IM"  # International master
    FM = "FM"  # FIDE master
    CM = "CM"  # Candidate master
    NM = "NM"  # National master
    WGM = "WGM"  # Woman grandmaster
    WIM = "WIM"  # Woman international master
    WFM = "WFM"  # Woman FIDE master
    WCM = "WCM"  # Woman candidate master
    WNM = "WNM"  # Woman national master
class Language(enum.Enum): class Language(enum.Enum):
en_GB = "English" en_GB = "English"
af_ZA = "Afrikaans" af_ZA = "Afrikaans"

View File

@ -9,6 +9,7 @@ CREATE TABLE coach_scraper.export
, name VARCHAR(255) , name VARCHAR(255)
, image_url TEXT , image_url TEXT
, languages TEXT[] , languages TEXT[]
, title VARCHAR(3)
, rapid INT , rapid INT
, blitz INT , blitz INT
, bullet INT , bullet INT