Scrape languages from lichess listing. (#10)
parent
82dbef21b6
commit
ef5d296097
|
@ -8,7 +8,7 @@ import psycopg2
|
|||
from app.chesscom import Pipeline as ChesscomPipeline
|
||||
from app.database import backup_database
|
||||
from app.lichess import Pipeline as LichessPipeline
|
||||
from app.site import Site
|
||||
from app.types import Site
|
||||
|
||||
# The number of parallel extraction jobs that are run at a time.
|
||||
WORKER_COUNT = 10
|
||||
|
|
|
@ -10,7 +10,7 @@ from bs4 import BeautifulSoup, SoupStrainer, Tag
|
|||
from app.pipeline import Extractor as BaseExtractor
|
||||
from app.pipeline import Fetcher as BaseFetcher
|
||||
from app.pipeline import Pipeline as BasePipeline
|
||||
from app.site import Site
|
||||
from app.types import Site
|
||||
|
||||
# The number of coach listing pages we will at most iterate through. This number
|
||||
# was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and
|
||||
|
@ -156,6 +156,10 @@ class Extractor(BaseExtractor):
|
|||
return None
|
||||
return src
|
||||
|
||||
def get_languages(self) -> List[str] | None:
|
||||
# TODO: Extract using huggingface model.
|
||||
return None
|
||||
|
||||
def get_rapid(self) -> int | None:
|
||||
return self.stats_json.get("rapid", {}).get("rating")
|
||||
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
import sys
|
||||
from datetime import datetime
|
||||
from typing import Literal
|
||||
from typing import List, Literal
|
||||
|
||||
from typing_extensions import TypedDict
|
||||
|
||||
from app.site import Site
|
||||
from app.types import Site
|
||||
|
||||
SCHEMA_NAME = "coach_scraper"
|
||||
TABLE_NAME = "export"
|
||||
|
@ -15,6 +15,7 @@ RowKey = (
|
|||
| Literal["username"]
|
||||
| Literal["name"]
|
||||
| Literal["image_url"]
|
||||
| Literal["languages"]
|
||||
| Literal["rapid"]
|
||||
| Literal["blitz"]
|
||||
| Literal["bullet"]
|
||||
|
@ -35,6 +36,8 @@ class Row(TypedDict, total=False):
|
|||
name: str
|
||||
# Profile image used on the source site.
|
||||
image_url: str
|
||||
# The list of languages the coach is fluent in.
|
||||
languages: List[str]
|
||||
# Rapid rating relative to the site they were sourced from.
|
||||
rapid: int
|
||||
# Blitz rating relative to the site they were sourced from.
|
||||
|
@ -90,6 +93,7 @@ def upsert_row(conn, row: Row):
|
|||
, username
|
||||
, name
|
||||
, image_url
|
||||
, languages
|
||||
, rapid
|
||||
, blitz
|
||||
, bullet
|
||||
|
@ -102,12 +106,14 @@ def upsert_row(conn, row: Row):
|
|||
, %s
|
||||
, %s
|
||||
, %s
|
||||
, %s
|
||||
)
|
||||
ON CONFLICT
|
||||
(site, username)
|
||||
DO UPDATE SET
|
||||
name = EXCLUDED.name,
|
||||
image_url = EXCLUDED.image_url,
|
||||
languages = EXCLUDED.languages,
|
||||
rapid = EXCLUDED.rapid,
|
||||
blitz = EXCLUDED.blitz,
|
||||
bullet = EXCLUDED.bullet;
|
||||
|
@ -117,6 +123,7 @@ def upsert_row(conn, row: Row):
|
|||
row["username"],
|
||||
row.get("name"),
|
||||
row.get("image_url"),
|
||||
row.get("languages", []),
|
||||
row.get("rapid"),
|
||||
row.get("blitz"),
|
||||
row.get("bullet"),
|
||||
|
|
|
@ -9,7 +9,7 @@ from bs4 import BeautifulSoup, SoupStrainer, Tag
|
|||
from app.pipeline import Extractor as BaseExtractor
|
||||
from app.pipeline import Fetcher as BaseFetcher
|
||||
from app.pipeline import Pipeline as BasePipeline
|
||||
from app.site import Site
|
||||
from app.types import Site, lang_to_code
|
||||
|
||||
# The number of pages we will at most iterate through. This number was
|
||||
# determined by going to https://lichess.org/coach/all/all/alphabetical
|
||||
|
@ -161,6 +161,22 @@ class Extractor(BaseExtractor):
|
|||
return None
|
||||
return src
|
||||
|
||||
def get_languages(self) -> List[str] | None:
    """Return the locale codes of the languages listed on the coach profile.

    The display names are read from the ``<td>`` cell inside the
    ``<tr class="languages">`` row of ``self.profile_soup`` and translated
    through ``lang_to_code``. Names with no entry in that mapping are
    skipped.

    Returns:
        ``None`` when the profile page, the languages row, or its cell is
        missing; otherwise the (possibly empty) list of matched codes in
        the order they appear on the page.
    """
    if self.profile_soup is None:
        return None
    tr = self.profile_soup.find("tr", class_="languages")
    if not isinstance(tr, Tag):
        return None
    td = tr.find("td")
    if not isinstance(td, Tag):
        return None

    # Read the cell, not the whole row: the row's text also includes any
    # sibling cells (presumably a header cell), whose text would be glued
    # onto the first language name and stop it from matching.
    names = [s.strip() for s in td.get_text().split(",")]

    codes = []
    i = 0
    while i < len(names):
        # A few display names contain ", " themselves (e.g.
        # "Català, valencià"), so try re-joining two adjacent pieces
        # before falling back to the single piece.
        if i + 1 < len(names):
            joined = ", ".join(names[i : i + 2])
            if joined in lang_to_code:
                codes.append(lang_to_code[joined])
                i += 2
                continue
        if names[i] in lang_to_code:
            codes.append(lang_to_code[names[i]])
        i += 1
    return codes
|
||||
|
||||
def get_rapid(self) -> int | None:
|
||||
return self._find_rating("rapid")
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@ from typing import Any, List, Tuple
|
|||
import aiohttp
|
||||
|
||||
from app.database import Row, RowKey, upsert_row
|
||||
from app.site import Site
|
||||
from app.types import Site
|
||||
|
||||
|
||||
class Fetcher:
|
||||
|
@ -104,6 +104,9 @@ class Extractor:
|
|||
def get_image_url(self) -> str | None:
|
||||
raise NotImplementedError()
|
||||
|
||||
def get_languages(self) -> List[str] | None:
|
||||
raise NotImplementedError()
|
||||
|
||||
def get_rapid(self) -> int | None:
|
||||
raise NotImplementedError()
|
||||
|
||||
|
@ -122,6 +125,7 @@ class Extractor:
|
|||
|
||||
_insert(row, "name", self.get_name())
|
||||
_insert(row, "image_url", self.get_image_url())
|
||||
_insert(row, "languages", self.get_languages())
|
||||
_insert(row, "rapid", self.get_rapid())
|
||||
_insert(row, "blitz", self.get_blitz())
|
||||
_insert(row, "bullet", self.get_bullet())
|
||||
|
|
|
@ -1,6 +0,0 @@
|
|||
import enum
|
||||
|
||||
|
||||
class Site(enum.Enum):
    """The sites that coach data is sourced from, keyed by short identifier."""

    CHESSCOM = "chesscom"
    LICHESS = "lichess"
|
|
@ -0,0 +1,114 @@
|
|||
import enum
|
||||
|
||||
|
||||
class Site(enum.Enum):
    """The sites that coach data is sourced from, keyed by short identifier."""

    CHESSCOM = "chesscom"
    LICHESS = "lichess"
|
||||
|
||||
|
||||
class Language(enum.Enum):
    """Languages, named by locale code and valued by display name.

    Each member name is a locale code written with ``_`` in place of ``-``
    (e.g. ``en_GB``); each value is the display name that is looked up when
    translating scraped language names into codes. The values take part in
    exact string lookups, so they must not be "corrected" or normalized.
    """

    en_GB = "English"
    af_ZA = "Afrikaans"
    an_ES = "Aragonés"
    ar_SA = "العربية"
    as_IN = "অসমীয়া"
    av_DA = "авар мацӀ"
    az_AZ = "Azərbaycanca"
    be_BY = "Беларуская"
    bg_BG = "български език"
    bn_BD = "বাংলা"
    br_FR = "Brezhoneg"
    bs_BA = "Bosanski"
    ca_ES = "Català, valencià"
    ckb_IR = "کوردی سۆرانی"
    co_FR = "Corsu"
    cs_CZ = "Čeština"
    cv_CU = "чӑваш чӗлхи"
    cy_GB = "Cymraeg"
    da_DK = "Dansk"
    de_DE = "Deutsch"
    el_GR = "Ελληνικά"
    en_US = "English (US)"
    eo_UY = "Esperanto"
    es_ES = "Español"
    et_EE = "Eesti keel"
    eu_ES = "Euskara"
    fa_IR = "فارسی"
    fi_FI = "Suomen kieli"
    fo_FO = "Føroyskt"
    fr_FR = "Français"
    frp_IT = "Arpitan"
    fy_NL = "Frysk"
    ga_IE = "Gaeilge"
    gd_GB = "Gàidhlig"
    gl_ES = "Galego"
    gsw_CH = "Schwizerdütsch"
    gu_IN = "ગુજરાતી"
    he_IL = "עִבְרִית"
    hi_IN = "हिन्दी, हिंदी"
    hr_HR = "Hrvatski"
    hu_HU = "Magyar"
    hy_AM = "Հայերեն"
    ia_IA = "Interlingua"
    id_ID = "Bahasa Indonesia"
    io_EN = "Ido"
    is_IS = "Íslenska"
    it_IT = "Italiano"
    ja_JP = "日本語"
    jbo_EN = "Lojban"
    jv_ID = "Basa Jawa"
    ka_GE = "ქართული"
    kab_DZ = "Taqvaylit"
    kk_KZ = "қазақша"
    kmr_TR = "Kurdî (Kurmancî)"
    kn_IN = "ಕನ್ನಡ"
    ko_KR = "한국어"
    ky_KG = "кыргызча"
    la_LA = "Lingua Latina"
    lb_LU = "Lëtzebuergesch"
    lt_LT = "Lietuvių kalba"
    lv_LV = "Latviešu valoda"
    mg_MG = "Fiteny malagasy"
    mk_MK = "македонски јази"
    ml_IN = "മലയാളം"
    mn_MN = "монгол"
    mr_IN = "मराठी"
    nb_NO = "Norsk bokmål"
    ne_NP = "नेपाली"
    nl_NL = "Nederlands"
    nn_NO = "Norsk nynorsk"
    pi_IN = "पालि"
    pl_PL = "Polski"
    ps_AF = "پښتو"
    pt_PT = "Português"
    pt_BR = "Português (BR)"
    ro_RO = "Română"
    ru_RU = "русский язык"
    ry_UA = "Русинська бисїда"
    sa_IN = "संस्कृत"
    sk_SK = "Slovenčina"
    sl_SI = "Slovenščina"
    sq_AL = "Shqip"
    sr_SP = "Српски језик"
    sv_SE = "Svenska"
    sw_KE = "Kiswahili"
    ta_IN = "தமிழ்"
    tg_TJ = "тоҷикӣ"
    th_TH = "ไทย"
    tk_TM = "Türkmençe"
    tl_PH = "Tagalog"
    tp_TP = "Toki pona"
    tr_TR = "Türkçe"
    uk_UA = "українська"
    ur_PK = "اُردُو"
    uz_UZ = "oʻzbekcha"
    vi_VN = "Tiếng Việt"
    yo_NG = "Yorùbá"
    zh_CN = "中文"
    zh_TW = "繁體中文"
    zu_ZA = "isiZulu"
|
||||
|
||||
|
||||
# Locale code (member name with "_" swapped for "-", e.g. "en-GB") mapped to
# its display name, in ``Language`` declaration order.
code_to_lang = {
    member.name.replace("_", "-"): member.value
    for member in Language
}

# Display name mapped back to its locale code. Iterated enum members always
# have distinct values, so inverting ``code_to_lang`` loses nothing.
lang_to_code = {
    display_name: code
    for code, display_name in code_to_lang.items()
}
|
|
@ -8,6 +8,7 @@ CREATE TABLE coach_scraper.export
|
|||
, username VARCHAR(255) NOT NULL
|
||||
, name VARCHAR(255)
|
||||
, image_url TEXT
|
||||
, languages TEXT[]
|
||||
, rapid INT
|
||||
, blitz INT
|
||||
, bullet INT
|
||||
|
|
Loading…
Reference in New Issue