Scrape languages from lichess listing. (#10)
parent
82dbef21b6
commit
ef5d296097
|
@ -8,7 +8,7 @@ import psycopg2
|
||||||
from app.chesscom import Pipeline as ChesscomPipeline
|
from app.chesscom import Pipeline as ChesscomPipeline
|
||||||
from app.database import backup_database
|
from app.database import backup_database
|
||||||
from app.lichess import Pipeline as LichessPipeline
|
from app.lichess import Pipeline as LichessPipeline
|
||||||
from app.site import Site
|
from app.types import Site
|
||||||
|
|
||||||
# The number of parallel extraction jobs that are run at a time.
|
# The number of parallel extraction jobs that are run at a time.
|
||||||
WORKER_COUNT = 10
|
WORKER_COUNT = 10
|
||||||
|
|
|
@ -10,7 +10,7 @@ from bs4 import BeautifulSoup, SoupStrainer, Tag
|
||||||
from app.pipeline import Extractor as BaseExtractor
|
from app.pipeline import Extractor as BaseExtractor
|
||||||
from app.pipeline import Fetcher as BaseFetcher
|
from app.pipeline import Fetcher as BaseFetcher
|
||||||
from app.pipeline import Pipeline as BasePipeline
|
from app.pipeline import Pipeline as BasePipeline
|
||||||
from app.site import Site
|
from app.types import Site
|
||||||
|
|
||||||
# The number of coach listing pages we will at most iterate through. This number
|
# The number of coach listing pages we will at most iterate through. This number
|
||||||
# was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and
|
# was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and
|
||||||
|
@ -156,6 +156,10 @@ class Extractor(BaseExtractor):
|
||||||
return None
|
return None
|
||||||
return src
|
return src
|
||||||
|
|
||||||
|
def get_languages(self) -> List[str] | None:
|
||||||
|
# TODO: Extract using huggingface model.
|
||||||
|
return None
|
||||||
|
|
||||||
def get_rapid(self) -> int | None:
|
def get_rapid(self) -> int | None:
|
||||||
return self.stats_json.get("rapid", {}).get("rating")
|
return self.stats_json.get("rapid", {}).get("rating")
|
||||||
|
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
import sys
|
import sys
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Literal
|
from typing import List, Literal
|
||||||
|
|
||||||
from typing_extensions import TypedDict
|
from typing_extensions import TypedDict
|
||||||
|
|
||||||
from app.site import Site
|
from app.types import Site
|
||||||
|
|
||||||
SCHEMA_NAME = "coach_scraper"
|
SCHEMA_NAME = "coach_scraper"
|
||||||
TABLE_NAME = "export"
|
TABLE_NAME = "export"
|
||||||
|
@ -15,6 +15,7 @@ RowKey = (
|
||||||
| Literal["username"]
|
| Literal["username"]
|
||||||
| Literal["name"]
|
| Literal["name"]
|
||||||
| Literal["image_url"]
|
| Literal["image_url"]
|
||||||
|
| Literal["languages"]
|
||||||
| Literal["rapid"]
|
| Literal["rapid"]
|
||||||
| Literal["blitz"]
|
| Literal["blitz"]
|
||||||
| Literal["bullet"]
|
| Literal["bullet"]
|
||||||
|
@ -35,6 +36,8 @@ class Row(TypedDict, total=False):
|
||||||
name: str
|
name: str
|
||||||
# Profile image used on the source site.
|
# Profile image used on the source site.
|
||||||
image_url: str
|
image_url: str
|
||||||
|
# The list of languages the coach is fluent in.
|
||||||
|
languages: List[str]
|
||||||
# Rapid rating relative to the site they were sourced from.
|
# Rapid rating relative to the site they were sourced from.
|
||||||
rapid: int
|
rapid: int
|
||||||
# Blitz rating relative to the site they were sourced from.
|
# Blitz rating relative to the site they were sourced from.
|
||||||
|
@ -90,6 +93,7 @@ def upsert_row(conn, row: Row):
|
||||||
, username
|
, username
|
||||||
, name
|
, name
|
||||||
, image_url
|
, image_url
|
||||||
|
, languages
|
||||||
, rapid
|
, rapid
|
||||||
, blitz
|
, blitz
|
||||||
, bullet
|
, bullet
|
||||||
|
@ -102,12 +106,14 @@ def upsert_row(conn, row: Row):
|
||||||
, %s
|
, %s
|
||||||
, %s
|
, %s
|
||||||
, %s
|
, %s
|
||||||
|
, %s
|
||||||
)
|
)
|
||||||
ON CONFLICT
|
ON CONFLICT
|
||||||
(site, username)
|
(site, username)
|
||||||
DO UPDATE SET
|
DO UPDATE SET
|
||||||
name = EXCLUDED.name,
|
name = EXCLUDED.name,
|
||||||
image_url = EXCLUDED.image_url,
|
image_url = EXCLUDED.image_url,
|
||||||
|
languages = EXCLUDED.languages,
|
||||||
rapid = EXCLUDED.rapid,
|
rapid = EXCLUDED.rapid,
|
||||||
blitz = EXCLUDED.blitz,
|
blitz = EXCLUDED.blitz,
|
||||||
bullet = EXCLUDED.bullet;
|
bullet = EXCLUDED.bullet;
|
||||||
|
@ -117,6 +123,7 @@ def upsert_row(conn, row: Row):
|
||||||
row["username"],
|
row["username"],
|
||||||
row.get("name"),
|
row.get("name"),
|
||||||
row.get("image_url"),
|
row.get("image_url"),
|
||||||
|
row.get("languages", []),
|
||||||
row.get("rapid"),
|
row.get("rapid"),
|
||||||
row.get("blitz"),
|
row.get("blitz"),
|
||||||
row.get("bullet"),
|
row.get("bullet"),
|
||||||
|
|
|
@ -9,7 +9,7 @@ from bs4 import BeautifulSoup, SoupStrainer, Tag
|
||||||
from app.pipeline import Extractor as BaseExtractor
|
from app.pipeline import Extractor as BaseExtractor
|
||||||
from app.pipeline import Fetcher as BaseFetcher
|
from app.pipeline import Fetcher as BaseFetcher
|
||||||
from app.pipeline import Pipeline as BasePipeline
|
from app.pipeline import Pipeline as BasePipeline
|
||||||
from app.site import Site
|
from app.types import Site, lang_to_code
|
||||||
|
|
||||||
# The number of pages we will at most iterate through. This number was
|
# The number of pages we will at most iterate through. This number was
|
||||||
# determined by going to https://lichess.org/coach/all/all/alphabetical
|
# determined by going to https://lichess.org/coach/all/all/alphabetical
|
||||||
|
@ -161,6 +161,22 @@ class Extractor(BaseExtractor):
|
||||||
return None
|
return None
|
||||||
return src
|
return src
|
||||||
|
|
||||||
|
def get_languages(self) -> List[str] | None:
|
||||||
|
if self.profile_soup is None:
|
||||||
|
return None
|
||||||
|
tr = self.profile_soup.find("tr", class_="languages")
|
||||||
|
if not isinstance(tr, Tag):
|
||||||
|
return None
|
||||||
|
td = tr.find("td")
|
||||||
|
if not isinstance(td, Tag):
|
||||||
|
return None
|
||||||
|
|
||||||
|
codes = []
|
||||||
|
for lang in [s.strip() for s in tr.get_text().split(",")]:
|
||||||
|
if lang in lang_to_code:
|
||||||
|
codes.append(lang_to_code[lang])
|
||||||
|
return codes
|
||||||
|
|
||||||
def get_rapid(self) -> int | None:
|
def get_rapid(self) -> int | None:
|
||||||
return self._find_rating("rapid")
|
return self._find_rating("rapid")
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,7 @@ from typing import Any, List, Tuple
|
||||||
import aiohttp
|
import aiohttp
|
||||||
|
|
||||||
from app.database import Row, RowKey, upsert_row
|
from app.database import Row, RowKey, upsert_row
|
||||||
from app.site import Site
|
from app.types import Site
|
||||||
|
|
||||||
|
|
||||||
class Fetcher:
|
class Fetcher:
|
||||||
|
@ -104,6 +104,9 @@ class Extractor:
|
||||||
def get_image_url(self) -> str | None:
|
def get_image_url(self) -> str | None:
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def get_languages(self) -> List[str] | None:
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
def get_rapid(self) -> int | None:
|
def get_rapid(self) -> int | None:
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
@ -122,6 +125,7 @@ class Extractor:
|
||||||
|
|
||||||
_insert(row, "name", self.get_name())
|
_insert(row, "name", self.get_name())
|
||||||
_insert(row, "image_url", self.get_image_url())
|
_insert(row, "image_url", self.get_image_url())
|
||||||
|
_insert(row, "languages", self.get_languages())
|
||||||
_insert(row, "rapid", self.get_rapid())
|
_insert(row, "rapid", self.get_rapid())
|
||||||
_insert(row, "blitz", self.get_blitz())
|
_insert(row, "blitz", self.get_blitz())
|
||||||
_insert(row, "bullet", self.get_bullet())
|
_insert(row, "bullet", self.get_bullet())
|
||||||
|
|
|
@ -1,6 +0,0 @@
|
||||||
import enum
|
|
||||||
|
|
||||||
|
|
||||||
class Site(enum.Enum):
|
|
||||||
CHESSCOM = "chesscom"
|
|
||||||
LICHESS = "lichess"
|
|
|
@ -0,0 +1,114 @@
|
||||||
|
import enum
|
||||||
|
|
||||||
|
|
||||||
|
class Site(enum.Enum):
|
||||||
|
CHESSCOM = "chesscom"
|
||||||
|
LICHESS = "lichess"
|
||||||
|
|
||||||
|
|
||||||
|
class Language(enum.Enum):
|
||||||
|
en_GB = "English"
|
||||||
|
af_ZA = "Afrikaans"
|
||||||
|
an_ES = "Aragonés"
|
||||||
|
ar_SA = "العربية"
|
||||||
|
as_IN = "অসমীয়া"
|
||||||
|
av_DA = "авар мацӀ"
|
||||||
|
az_AZ = "Azərbaycanca"
|
||||||
|
be_BY = "Беларуская"
|
||||||
|
bg_BG = "български език"
|
||||||
|
bn_BD = "বাংলা"
|
||||||
|
br_FR = "Brezhoneg"
|
||||||
|
bs_BA = "Bosanski"
|
||||||
|
ca_ES = "Català, valencià"
|
||||||
|
ckb_IR = "کوردی سۆرانی"
|
||||||
|
co_FR = "Corsu"
|
||||||
|
cs_CZ = "Čeština"
|
||||||
|
cv_CU = "чӑваш чӗлхи"
|
||||||
|
cy_GB = "Cymraeg"
|
||||||
|
da_DK = "Dansk"
|
||||||
|
de_DE = "Deutsch"
|
||||||
|
el_GR = "Ελληνικά"
|
||||||
|
en_US = "English (US)"
|
||||||
|
eo_UY = "Esperanto"
|
||||||
|
es_ES = "Español"
|
||||||
|
et_EE = "Eesti keel"
|
||||||
|
eu_ES = "Euskara"
|
||||||
|
fa_IR = "فارسی"
|
||||||
|
fi_FI = "Suomen kieli"
|
||||||
|
fo_FO = "Føroyskt"
|
||||||
|
fr_FR = "Français"
|
||||||
|
frp_IT = "Arpitan"
|
||||||
|
fy_NL = "Frysk"
|
||||||
|
ga_IE = "Gaeilge"
|
||||||
|
gd_GB = "Gàidhlig"
|
||||||
|
gl_ES = "Galego"
|
||||||
|
gsw_CH = "Schwizerdütsch"
|
||||||
|
gu_IN = "ગુજરાતી"
|
||||||
|
he_IL = "עִבְרִית"
|
||||||
|
hi_IN = "हिन्दी, हिंदी"
|
||||||
|
hr_HR = "Hrvatski"
|
||||||
|
hu_HU = "Magyar"
|
||||||
|
hy_AM = "Հայերեն"
|
||||||
|
ia_IA = "Interlingua"
|
||||||
|
id_ID = "Bahasa Indonesia"
|
||||||
|
io_EN = "Ido"
|
||||||
|
is_IS = "Íslenska"
|
||||||
|
it_IT = "Italiano"
|
||||||
|
ja_JP = "日本語"
|
||||||
|
jbo_EN = "Lojban"
|
||||||
|
jv_ID = "Basa Jawa"
|
||||||
|
ka_GE = "ქართული"
|
||||||
|
kab_DZ = "Taqvaylit"
|
||||||
|
kk_KZ = "қазақша"
|
||||||
|
kmr_TR = "Kurdî (Kurmancî)"
|
||||||
|
kn_IN = "ಕನ್ನಡ"
|
||||||
|
ko_KR = "한국어"
|
||||||
|
ky_KG = "кыргызча"
|
||||||
|
la_LA = "Lingua Latina"
|
||||||
|
lb_LU = "Lëtzebuergesch"
|
||||||
|
lt_LT = "Lietuvių kalba"
|
||||||
|
lv_LV = "Latviešu valoda"
|
||||||
|
mg_MG = "Fiteny malagasy"
|
||||||
|
mk_MK = "македонски јази"
|
||||||
|
ml_IN = "മലയാളം"
|
||||||
|
mn_MN = "монгол"
|
||||||
|
mr_IN = "मराठी"
|
||||||
|
nb_NO = "Norsk bokmål"
|
||||||
|
ne_NP = "नेपाली"
|
||||||
|
nl_NL = "Nederlands"
|
||||||
|
nn_NO = "Norsk nynorsk"
|
||||||
|
pi_IN = "पालि"
|
||||||
|
pl_PL = "Polski"
|
||||||
|
ps_AF = "پښتو"
|
||||||
|
pt_PT = "Português"
|
||||||
|
pt_BR = "Português (BR)"
|
||||||
|
ro_RO = "Română"
|
||||||
|
ru_RU = "русский язык"
|
||||||
|
ry_UA = "Русинська бисїда"
|
||||||
|
sa_IN = "संस्कृत"
|
||||||
|
sk_SK = "Slovenčina"
|
||||||
|
sl_SI = "Slovenščina"
|
||||||
|
sq_AL = "Shqip"
|
||||||
|
sr_SP = "Српски језик"
|
||||||
|
sv_SE = "Svenska"
|
||||||
|
sw_KE = "Kiswahili"
|
||||||
|
ta_IN = "தமிழ்"
|
||||||
|
tg_TJ = "тоҷикӣ"
|
||||||
|
th_TH = "ไทย"
|
||||||
|
tk_TM = "Türkmençe"
|
||||||
|
tl_PH = "Tagalog"
|
||||||
|
tp_TP = "Toki pona"
|
||||||
|
tr_TR = "Türkçe"
|
||||||
|
uk_UA = "українська"
|
||||||
|
ur_PK = "اُردُو"
|
||||||
|
uz_UZ = "oʻzbekcha"
|
||||||
|
vi_VN = "Tiếng Việt"
|
||||||
|
yo_NG = "Yorùbá"
|
||||||
|
zh_CN = "中文"
|
||||||
|
zh_TW = "繁體中文"
|
||||||
|
zu_ZA = "isiZulu"
|
||||||
|
|
||||||
|
|
||||||
|
code_to_lang = {lang.name.replace("_", "-"): lang.value for lang in Language}
|
||||||
|
|
||||||
|
lang_to_code = {lang.value: lang.name.replace("_", "-") for lang in Language}
|
|
@ -8,6 +8,7 @@ CREATE TABLE coach_scraper.export
|
||||||
, username VARCHAR(255) NOT NULL
|
, username VARCHAR(255) NOT NULL
|
||||||
, name VARCHAR(255)
|
, name VARCHAR(255)
|
||||||
, image_url TEXT
|
, image_url TEXT
|
||||||
|
, languages TEXT[]
|
||||||
, rapid INT
|
, rapid INT
|
||||||
, blitz INT
|
, blitz INT
|
||||||
, bullet INT
|
, bullet INT
|
||||||
|
|
Loading…
Reference in New Issue