Scrape titles.
parent
0b9a721368
commit
47e8d245c3
|
@ -10,7 +10,7 @@ from bs4 import BeautifulSoup, SoupStrainer, Tag
|
||||||
from app.pipeline import Extractor as BaseExtractor
|
from app.pipeline import Extractor as BaseExtractor
|
||||||
from app.pipeline import Fetcher as BaseFetcher
|
from app.pipeline import Fetcher as BaseFetcher
|
||||||
from app.pipeline import Pipeline as BasePipeline
|
from app.pipeline import Pipeline as BasePipeline
|
||||||
from app.types import Site
|
from app.types import Site, Title
|
||||||
|
|
||||||
# The number of coach listing pages we will at most iterate through. This number
|
# The number of coach listing pages we will at most iterate through. This number
|
||||||
# was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and
|
# was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and
|
||||||
|
@ -156,6 +156,18 @@ class Extractor(BaseExtractor):
|
||||||
return None
|
return None
|
||||||
return src
|
return src
|
||||||
|
|
||||||
|
def get_title(self) -> Title | None:
|
||||||
|
if self.profile_soup is None:
|
||||||
|
return None
|
||||||
|
a = self.profile_soup.find("a", class_="profile-card-chesstitle")
|
||||||
|
if not isinstance(a, Tag):
|
||||||
|
return None
|
||||||
|
title = a.get_text().strip()
|
||||||
|
try:
|
||||||
|
return Title(title)
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
|
||||||
def get_languages(self) -> List[str] | None:
|
def get_languages(self) -> List[str] | None:
|
||||||
# TODO: Extract using huggingface model.
|
# TODO: Extract using huggingface model.
|
||||||
return None
|
return None
|
||||||
|
|
|
@ -4,7 +4,7 @@ from typing import List, Literal
|
||||||
|
|
||||||
from typing_extensions import TypedDict
|
from typing_extensions import TypedDict
|
||||||
|
|
||||||
from app.types import Site, code_to_lang
|
from app.types import Site, Title, code_to_lang
|
||||||
|
|
||||||
SCHEMA_NAME = "coach_scraper"
|
SCHEMA_NAME = "coach_scraper"
|
||||||
MAIN_TABLE_NAME = "export"
|
MAIN_TABLE_NAME = "export"
|
||||||
|
@ -16,6 +16,7 @@ RowKey = (
|
||||||
| Literal["username"]
|
| Literal["username"]
|
||||||
| Literal["name"]
|
| Literal["name"]
|
||||||
| Literal["image_url"]
|
| Literal["image_url"]
|
||||||
|
| Literal["title"]
|
||||||
| Literal["languages"]
|
| Literal["languages"]
|
||||||
| Literal["rapid"]
|
| Literal["rapid"]
|
||||||
| Literal["blitz"]
|
| Literal["blitz"]
|
||||||
|
@ -37,6 +38,8 @@ class Row(TypedDict, total=False):
|
||||||
name: str
|
name: str
|
||||||
# Profile image used on the source site.
|
# Profile image used on the source site.
|
||||||
image_url: str
|
image_url: str
|
||||||
|
# The FIDE title assigned to the coach on the source site.
|
||||||
|
title: Title
|
||||||
# The list of languages the coach is fluent in.
|
# The list of languages the coach is fluent in.
|
||||||
languages: List[str]
|
languages: List[str]
|
||||||
# Rapid rating relative to the site they were sourced from.
|
# Rapid rating relative to the site they were sourced from.
|
||||||
|
@ -120,6 +123,7 @@ def upsert_row(conn, row: Row):
|
||||||
, username
|
, username
|
||||||
, name
|
, name
|
||||||
, image_url
|
, image_url
|
||||||
|
, title
|
||||||
, languages
|
, languages
|
||||||
, rapid
|
, rapid
|
||||||
, blitz
|
, blitz
|
||||||
|
@ -134,12 +138,14 @@ def upsert_row(conn, row: Row):
|
||||||
, %s
|
, %s
|
||||||
, %s
|
, %s
|
||||||
, %s
|
, %s
|
||||||
|
, %s
|
||||||
)
|
)
|
||||||
ON CONFLICT
|
ON CONFLICT
|
||||||
(site, username)
|
(site, username)
|
||||||
DO UPDATE SET
|
DO UPDATE SET
|
||||||
name = EXCLUDED.name,
|
name = EXCLUDED.name,
|
||||||
image_url = EXCLUDED.image_url,
|
image_url = EXCLUDED.image_url,
|
||||||
|
title = EXCLUDED.title,
|
||||||
languages = EXCLUDED.languages,
|
languages = EXCLUDED.languages,
|
||||||
rapid = EXCLUDED.rapid,
|
rapid = EXCLUDED.rapid,
|
||||||
blitz = EXCLUDED.blitz,
|
blitz = EXCLUDED.blitz,
|
||||||
|
@ -150,6 +156,7 @@ def upsert_row(conn, row: Row):
|
||||||
row["username"],
|
row["username"],
|
||||||
row.get("name"),
|
row.get("name"),
|
||||||
row.get("image_url"),
|
row.get("image_url"),
|
||||||
|
row["title"].value if "title" in row else None,
|
||||||
row.get("languages", []),
|
row.get("languages", []),
|
||||||
row.get("rapid"),
|
row.get("rapid"),
|
||||||
row.get("blitz"),
|
row.get("blitz"),
|
||||||
|
|
|
@ -9,7 +9,7 @@ from bs4 import BeautifulSoup, SoupStrainer, Tag
|
||||||
from app.pipeline import Extractor as BaseExtractor
|
from app.pipeline import Extractor as BaseExtractor
|
||||||
from app.pipeline import Fetcher as BaseFetcher
|
from app.pipeline import Fetcher as BaseFetcher
|
||||||
from app.pipeline import Pipeline as BasePipeline
|
from app.pipeline import Pipeline as BasePipeline
|
||||||
from app.types import Site, lang_to_code
|
from app.types import Site, Title, lang_to_code
|
||||||
|
|
||||||
# The number of pages we will at most iterate through. This number was
|
# The number of pages we will at most iterate through. This number was
|
||||||
# determined by going to https://lichess.org/coach/all/all/alphabetical
|
# determined by going to https://lichess.org/coach/all/all/alphabetical
|
||||||
|
@ -103,6 +103,8 @@ def _profile_filter(elem: Tag | str | None, attrs={}) -> bool:
|
||||||
|
|
||||||
|
|
||||||
def _stats_filter(elem: Tag | str | None, attrs={}) -> bool:
|
def _stats_filter(elem: Tag | str | None, attrs={}) -> bool:
|
||||||
|
if "user-link" in attrs.get("class", ""):
|
||||||
|
return True
|
||||||
if "profile-side" in attrs.get("class", ""):
|
if "profile-side" in attrs.get("class", ""):
|
||||||
return True
|
return True
|
||||||
if "sub-ratings" in attrs.get("class", ""):
|
if "sub-ratings" in attrs.get("class", ""):
|
||||||
|
@ -161,6 +163,18 @@ class Extractor(BaseExtractor):
|
||||||
return None
|
return None
|
||||||
return src
|
return src
|
||||||
|
|
||||||
|
def get_title(self) -> Title | None:
|
||||||
|
if self.stats_soup is None:
|
||||||
|
return None
|
||||||
|
utitle = self.stats_soup.find("span", class_="utitle")
|
||||||
|
if not isinstance(utitle, Tag):
|
||||||
|
return None
|
||||||
|
title = utitle.get_text().strip()
|
||||||
|
try:
|
||||||
|
return Title(title)
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
|
||||||
def get_languages(self) -> List[str] | None:
|
def get_languages(self) -> List[str] | None:
|
||||||
if self.profile_soup is None:
|
if self.profile_soup is None:
|
||||||
return None
|
return None
|
||||||
|
|
|
@ -5,7 +5,7 @@ from typing import Any, List, Tuple
|
||||||
import aiohttp
|
import aiohttp
|
||||||
|
|
||||||
from app.database import Row, RowKey, upsert_row
|
from app.database import Row, RowKey, upsert_row
|
||||||
from app.types import Site
|
from app.types import Site, Title
|
||||||
|
|
||||||
|
|
||||||
class Fetcher:
|
class Fetcher:
|
||||||
|
@ -104,6 +104,9 @@ class Extractor:
|
||||||
def get_image_url(self) -> str | None:
|
def get_image_url(self) -> str | None:
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def get_title(self) -> Title | None:
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
def get_languages(self) -> List[str] | None:
|
def get_languages(self) -> List[str] | None:
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
@ -125,6 +128,7 @@ class Extractor:
|
||||||
|
|
||||||
_insert(row, "name", self.get_name())
|
_insert(row, "name", self.get_name())
|
||||||
_insert(row, "image_url", self.get_image_url())
|
_insert(row, "image_url", self.get_image_url())
|
||||||
|
_insert(row, "title", self.get_title())
|
||||||
_insert(row, "languages", self.get_languages())
|
_insert(row, "languages", self.get_languages())
|
||||||
_insert(row, "rapid", self.get_rapid())
|
_insert(row, "rapid", self.get_rapid())
|
||||||
_insert(row, "blitz", self.get_blitz())
|
_insert(row, "blitz", self.get_blitz())
|
||||||
|
|
13
app/types.py
13
app/types.py
|
@ -7,6 +7,19 @@ class Site(enum.Enum):
|
||||||
LICHESS = "lichess"
|
LICHESS = "lichess"
|
||||||
|
|
||||||
|
|
||||||
|
class Title(enum.Enum):
|
||||||
|
GM = "GM" # Grandmaster
|
||||||
|
IM = "IM" # International master
|
||||||
|
FM = "FM" # FIDE master
|
||||||
|
CM = "CM" # Candidate master
|
||||||
|
NM = "NM" # National master
|
||||||
|
WGM = "WGM"
|
||||||
|
WIM = "WIM"
|
||||||
|
WFM = "WFM"
|
||||||
|
WCM = "WCM"
|
||||||
|
WNM = "WNM"
|
||||||
|
|
||||||
|
|
||||||
class Language(enum.Enum):
|
class Language(enum.Enum):
|
||||||
en_GB = "English"
|
en_GB = "English"
|
||||||
af_ZA = "Afrikaans"
|
af_ZA = "Afrikaans"
|
||||||
|
|
|
@ -9,6 +9,7 @@ CREATE TABLE coach_scraper.export
|
||||||
, name VARCHAR(255)
|
, name VARCHAR(255)
|
||||||
, image_url TEXT
|
, image_url TEXT
|
||||||
, languages TEXT[]
|
, languages TEXT[]
|
||||||
|
, title VARCHAR(3)
|
||||||
, rapid INT
|
, rapid INT
|
||||||
, blitz INT
|
, blitz INT
|
||||||
, bullet INT
|
, bullet INT
|
||||||
|
|
Loading…
Reference in New Issue