Scrape languages from lichess listing. (#10)

main
Joshua Potter 2023-12-05 14:20:46 -07:00 committed by GitHub
parent 82dbef21b6
commit ef5d296097
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 152 additions and 12 deletions

View File

@ -8,7 +8,7 @@ import psycopg2
from app.chesscom import Pipeline as ChesscomPipeline
from app.database import backup_database
from app.lichess import Pipeline as LichessPipeline
from app.site import Site
from app.types import Site
# The number of parallel extraction jobs that are run at a time.
WORKER_COUNT = 10

View File

@ -10,7 +10,7 @@ from bs4 import BeautifulSoup, SoupStrainer, Tag
from app.pipeline import Extractor as BaseExtractor
from app.pipeline import Fetcher as BaseFetcher
from app.pipeline import Pipeline as BasePipeline
from app.site import Site
from app.types import Site
# The number of coach listing pages we will at most iterate through. This number
# was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and
@ -156,6 +156,10 @@ class Extractor(BaseExtractor):
return None
return src
def get_languages(self) -> List[str] | None:
# TODO: Extract using huggingface model.
return None
def get_rapid(self) -> int | None:
return self.stats_json.get("rapid", {}).get("rating")

View File

@ -1,10 +1,10 @@
import sys
from datetime import datetime
from typing import Literal
from typing import List, Literal
from typing_extensions import TypedDict
from app.site import Site
from app.types import Site
SCHEMA_NAME = "coach_scraper"
TABLE_NAME = "export"
@ -15,6 +15,7 @@ RowKey = (
| Literal["username"]
| Literal["name"]
| Literal["image_url"]
| Literal["languages"]
| Literal["rapid"]
| Literal["blitz"]
| Literal["bullet"]
@ -35,6 +36,8 @@ class Row(TypedDict, total=False):
name: str
# Profile image used on the source site.
image_url: str
# The list of languages the coach is fluent in.
languages: List[str]
# Rapid rating relative to the site they were sourced from.
rapid: int
# Blitz rating relative to the site they were sourced from.
@ -90,6 +93,7 @@ def upsert_row(conn, row: Row):
, username
, name
, image_url
, languages
, rapid
, blitz
, bullet
@ -102,12 +106,14 @@ def upsert_row(conn, row: Row):
, %s
, %s
, %s
, %s
)
ON CONFLICT
(site, username)
DO UPDATE SET
name = EXCLUDED.name,
image_url = EXCLUDED.image_url,
languages = EXCLUDED.languages,
rapid = EXCLUDED.rapid,
blitz = EXCLUDED.blitz,
bullet = EXCLUDED.bullet;
@ -117,6 +123,7 @@ def upsert_row(conn, row: Row):
row["username"],
row.get("name"),
row.get("image_url"),
row.get("languages", []),
row.get("rapid"),
row.get("blitz"),
row.get("bullet"),

View File

@ -9,7 +9,7 @@ from bs4 import BeautifulSoup, SoupStrainer, Tag
from app.pipeline import Extractor as BaseExtractor
from app.pipeline import Fetcher as BaseFetcher
from app.pipeline import Pipeline as BasePipeline
from app.site import Site
from app.types import Site, lang_to_code
# The number of pages we will at most iterate through. This number was
# determined by going to https://lichess.org/coach/all/all/alphabetical
@ -161,6 +161,22 @@ class Extractor(BaseExtractor):
return None
return src
def get_languages(self) -> List[str] | None:
if self.profile_soup is None:
return None
tr = self.profile_soup.find("tr", class_="languages")
if not isinstance(tr, Tag):
return None
td = tr.find("td")
if not isinstance(td, Tag):
return None
codes = []
for lang in [s.strip() for s in tr.get_text().split(",")]:
if lang in lang_to_code:
codes.append(lang_to_code[lang])
return codes
def get_rapid(self) -> int | None:
return self._find_rating("rapid")

View File

@ -5,7 +5,7 @@ from typing import Any, List, Tuple
import aiohttp
from app.database import Row, RowKey, upsert_row
from app.site import Site
from app.types import Site
class Fetcher:
@ -104,6 +104,9 @@ class Extractor:
def get_image_url(self) -> str | None:
raise NotImplementedError()
def get_languages(self) -> List[str] | None:
raise NotImplementedError()
def get_rapid(self) -> int | None:
raise NotImplementedError()
@ -122,6 +125,7 @@ class Extractor:
_insert(row, "name", self.get_name())
_insert(row, "image_url", self.get_image_url())
_insert(row, "languages", self.get_languages())
_insert(row, "rapid", self.get_rapid())
_insert(row, "blitz", self.get_blitz())
_insert(row, "bullet", self.get_bullet())

View File

@ -1,6 +0,0 @@
import enum
class Site(enum.Enum):
    """The sites that coaches are scraped from."""

    CHESSCOM = "chesscom"
    LICHESS = "lichess"

114
app/types.py Normal file
View File

@ -0,0 +1,114 @@
import enum
class Site(enum.Enum):
    """The sites that coaches are scraped from."""

    CHESSCOM = "chesscom"
    LICHESS = "lichess"
class Language(enum.Enum):
    """Languages keyed by locale code, valued by their displayed name.

    Member names are locale codes written with an underscore (``en_GB``);
    values are the language names as text. The values are matched verbatim
    against scraped text, so they must not be "corrected" or normalized.
    """

    en_GB = "English"
    af_ZA = "Afrikaans"
    an_ES = "Aragonés"
    ar_SA = "العربية"
    as_IN = "অসমীয়া"
    av_DA = "авар мацӀ"
    az_AZ = "Azərbaycanca"
    be_BY = "Беларуская"
    bg_BG = "български език"
    bn_BD = "বাংলা"
    br_FR = "Brezhoneg"
    bs_BA = "Bosanski"
    ca_ES = "Català, valencià"
    ckb_IR = "کوردی سۆرانی"
    co_FR = "Corsu"
    cs_CZ = "Čeština"
    cv_CU = "чӑваш чӗлхи"
    cy_GB = "Cymraeg"
    da_DK = "Dansk"
    de_DE = "Deutsch"
    el_GR = "Ελληνικά"
    en_US = "English (US)"
    eo_UY = "Esperanto"
    es_ES = "Español"
    et_EE = "Eesti keel"
    eu_ES = "Euskara"
    fa_IR = "فارسی"
    fi_FI = "Suomen kieli"
    fo_FO = "Føroyskt"
    fr_FR = "Français"
    frp_IT = "Arpitan"
    fy_NL = "Frysk"
    ga_IE = "Gaeilge"
    gd_GB = "Gàidhlig"
    gl_ES = "Galego"
    gsw_CH = "Schwizerdütsch"
    gu_IN = "ગુજરાતી"
    he_IL = "עִבְרִית"
    hi_IN = "हिन्दी, हिंदी"
    hr_HR = "Hrvatski"
    hu_HU = "Magyar"
    hy_AM = "Հայերեն"
    ia_IA = "Interlingua"
    id_ID = "Bahasa Indonesia"
    io_EN = "Ido"
    is_IS = "Íslenska"
    it_IT = "Italiano"
    ja_JP = "日本語"
    jbo_EN = "Lojban"
    jv_ID = "Basa Jawa"
    ka_GE = "ქართული"
    kab_DZ = "Taqvaylit"
    kk_KZ = "қазақша"
    kmr_TR = "Kurdî (Kurmancî)"
    kn_IN = "ಕನ್ನಡ"
    ko_KR = "한국어"
    ky_KG = "кыргызча"
    la_LA = "Lingua Latina"
    lb_LU = "Lëtzebuergesch"
    lt_LT = "Lietuvių kalba"
    lv_LV = "Latviešu valoda"
    mg_MG = "Fiteny malagasy"
    mk_MK = "македонски јази"
    ml_IN = "മലയാളം"
    mn_MN = "монгол"
    mr_IN = "मराठी"
    nb_NO = "Norsk bokmål"
    ne_NP = "नेपाली"
    nl_NL = "Nederlands"
    nn_NO = "Norsk nynorsk"
    pi_IN = "पालि"
    pl_PL = "Polski"
    ps_AF = "پښتو"
    pt_PT = "Português"
    pt_BR = "Português (BR)"
    ro_RO = "Română"
    ru_RU = "русский язык"
    ry_UA = "Русинська бисїда"
    sa_IN = "संस्कृत"
    sk_SK = "Slovenčina"
    sl_SI = "Slovenščina"
    sq_AL = "Shqip"
    sr_SP = "Српски језик"
    sv_SE = "Svenska"
    sw_KE = "Kiswahili"
    ta_IN = "தமிழ்"
    tg_TJ = "тоҷикӣ"
    th_TH = "ไทย"
    tk_TM = "Türkmençe"
    tl_PH = "Tagalog"
    tp_TP = "Toki pona"
    tr_TR = "Türkçe"
    uk_UA = "українська"
    ur_PK = "اُردُو"
    uz_UZ = "oʻzbekcha"
    vi_VN = "Tiếng Việt"
    yo_NG = "Yorùbá"
    zh_CN = "中文"
    zh_TW = "繁體中文"
    zu_ZA = "isiZulu"
# Locale code in hyphenated form (e.g. "en-GB") -> displayed language name.
code_to_lang = {
    member.name.replace("_", "-"): member.value for member in Language
}
# Displayed language name -> hyphenated locale code; the inverse of the table
# above (member values are distinct, so no entry is lost by inverting).
lang_to_code = {name: code for code, name in code_to_lang.items()}

View File

@ -8,6 +8,7 @@ CREATE TABLE coach_scraper.export
, username VARCHAR(255) NOT NULL
, name VARCHAR(255)
, image_url TEXT
, languages TEXT[]
, rapid INT
, blitz INT
, bullet INT