Scrape languages from lichess listing. (#10)

main
Joshua Potter 2023-12-05 14:20:46 -07:00 committed by GitHub
parent 82dbef21b6
commit ef5d296097
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 152 additions and 12 deletions

View File

@ -8,7 +8,7 @@ import psycopg2
from app.chesscom import Pipeline as ChesscomPipeline from app.chesscom import Pipeline as ChesscomPipeline
from app.database import backup_database from app.database import backup_database
from app.lichess import Pipeline as LichessPipeline from app.lichess import Pipeline as LichessPipeline
from app.site import Site from app.types import Site
# The number of parallel extraction jobs that are run at a time. # The number of parallel extraction jobs that are run at a time.
WORKER_COUNT = 10 WORKER_COUNT = 10

View File

@ -10,7 +10,7 @@ from bs4 import BeautifulSoup, SoupStrainer, Tag
from app.pipeline import Extractor as BaseExtractor from app.pipeline import Extractor as BaseExtractor
from app.pipeline import Fetcher as BaseFetcher from app.pipeline import Fetcher as BaseFetcher
from app.pipeline import Pipeline as BasePipeline from app.pipeline import Pipeline as BasePipeline
from app.site import Site from app.types import Site
# The number of coach listing pages we will at most iterate through. This number # The number of coach listing pages we will at most iterate through. This number
# was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and # was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and
@ -156,6 +156,10 @@ class Extractor(BaseExtractor):
return None return None
return src return src
def get_languages(self) -> List[str] | None:
# TODO: Extract using huggingface model.
return None
def get_rapid(self) -> int | None: def get_rapid(self) -> int | None:
return self.stats_json.get("rapid", {}).get("rating") return self.stats_json.get("rapid", {}).get("rating")

View File

@ -1,10 +1,10 @@
import sys import sys
from datetime import datetime from datetime import datetime
from typing import Literal from typing import List, Literal
from typing_extensions import TypedDict from typing_extensions import TypedDict
from app.site import Site from app.types import Site
SCHEMA_NAME = "coach_scraper" SCHEMA_NAME = "coach_scraper"
TABLE_NAME = "export" TABLE_NAME = "export"
@ -15,6 +15,7 @@ RowKey = (
| Literal["username"] | Literal["username"]
| Literal["name"] | Literal["name"]
| Literal["image_url"] | Literal["image_url"]
| Literal["languages"]
| Literal["rapid"] | Literal["rapid"]
| Literal["blitz"] | Literal["blitz"]
| Literal["bullet"] | Literal["bullet"]
@ -35,6 +36,8 @@ class Row(TypedDict, total=False):
name: str name: str
# Profile image used on the source site. # Profile image used on the source site.
image_url: str image_url: str
# The list of languages the coach is fluent in.
languages: List[str]
# Rapid rating relative to the site they were sourced from. # Rapid rating relative to the site they were sourced from.
rapid: int rapid: int
# Blitz rating relative to the site they were sourced from. # Blitz rating relative to the site they were sourced from.
@ -90,6 +93,7 @@ def upsert_row(conn, row: Row):
, username , username
, name , name
, image_url , image_url
, languages
, rapid , rapid
, blitz , blitz
, bullet , bullet
@ -102,12 +106,14 @@ def upsert_row(conn, row: Row):
, %s , %s
, %s , %s
, %s , %s
, %s
) )
ON CONFLICT ON CONFLICT
(site, username) (site, username)
DO UPDATE SET DO UPDATE SET
name = EXCLUDED.name, name = EXCLUDED.name,
image_url = EXCLUDED.image_url, image_url = EXCLUDED.image_url,
languages = EXCLUDED.languages,
rapid = EXCLUDED.rapid, rapid = EXCLUDED.rapid,
blitz = EXCLUDED.blitz, blitz = EXCLUDED.blitz,
bullet = EXCLUDED.bullet; bullet = EXCLUDED.bullet;
@ -117,6 +123,7 @@ def upsert_row(conn, row: Row):
row["username"], row["username"],
row.get("name"), row.get("name"),
row.get("image_url"), row.get("image_url"),
row.get("languages", []),
row.get("rapid"), row.get("rapid"),
row.get("blitz"), row.get("blitz"),
row.get("bullet"), row.get("bullet"),

View File

@ -9,7 +9,7 @@ from bs4 import BeautifulSoup, SoupStrainer, Tag
from app.pipeline import Extractor as BaseExtractor from app.pipeline import Extractor as BaseExtractor
from app.pipeline import Fetcher as BaseFetcher from app.pipeline import Fetcher as BaseFetcher
from app.pipeline import Pipeline as BasePipeline from app.pipeline import Pipeline as BasePipeline
from app.site import Site from app.types import Site, lang_to_code
# The number of pages we will at most iterate through. This number was # The number of pages we will at most iterate through. This number was
# determined by going to https://lichess.org/coach/all/all/alphabetical # determined by going to https://lichess.org/coach/all/all/alphabetical
@ -161,6 +161,22 @@ class Extractor(BaseExtractor):
return None return None
return src return src
def get_languages(self) -> List[str] | None:
if self.profile_soup is None:
return None
tr = self.profile_soup.find("tr", class_="languages")
if not isinstance(tr, Tag):
return None
td = tr.find("td")
if not isinstance(td, Tag):
return None
codes = []
for lang in [s.strip() for s in tr.get_text().split(",")]:
if lang in lang_to_code:
codes.append(lang_to_code[lang])
return codes
def get_rapid(self) -> int | None: def get_rapid(self) -> int | None:
return self._find_rating("rapid") return self._find_rating("rapid")

View File

@ -5,7 +5,7 @@ from typing import Any, List, Tuple
import aiohttp import aiohttp
from app.database import Row, RowKey, upsert_row from app.database import Row, RowKey, upsert_row
from app.site import Site from app.types import Site
class Fetcher: class Fetcher:
@ -104,6 +104,9 @@ class Extractor:
def get_image_url(self) -> str | None: def get_image_url(self) -> str | None:
raise NotImplementedError() raise NotImplementedError()
def get_languages(self) -> List[str] | None:
raise NotImplementedError()
def get_rapid(self) -> int | None: def get_rapid(self) -> int | None:
raise NotImplementedError() raise NotImplementedError()
@ -122,6 +125,7 @@ class Extractor:
_insert(row, "name", self.get_name()) _insert(row, "name", self.get_name())
_insert(row, "image_url", self.get_image_url()) _insert(row, "image_url", self.get_image_url())
_insert(row, "languages", self.get_languages())
_insert(row, "rapid", self.get_rapid()) _insert(row, "rapid", self.get_rapid())
_insert(row, "blitz", self.get_blitz()) _insert(row, "blitz", self.get_blitz())
_insert(row, "bullet", self.get_bullet()) _insert(row, "bullet", self.get_bullet())

View File

@ -1,6 +0,0 @@
import enum
class Site(enum.Enum):
    """Sites that coach data is sourced from.

    The member values are the string identifiers used for each site.
    """

    # chess.com
    CHESSCOM = "chesscom"
    # lichess.org
    LICHESS = "lichess"

114
app/types.py Normal file
View File

@ -0,0 +1,114 @@
import enum
class Site(enum.Enum):
    """Sites that coach data is sourced from.

    The member values are the string identifiers used for each site.
    """

    # chess.com
    CHESSCOM = "chesscom"
    # lichess.org
    LICHESS = "lichess"
class Language(enum.Enum):
    """Languages that can be recorded for a coach.

    Each member name is a locale code written with an underscore (for
    example ``en_GB``); each value is the display name of that language.
    """

    # NOTE(review): the values are compared verbatim against text scraped
    # from profile pages (presumably lichess' own language listing), so
    # they must not be "corrected" or normalized, even where a name looks
    # truncated or unusual.
    #
    # Some values contain a comma (e.g. ca_ES, hi_IN); keep that in mind
    # when parsing comma-separated language lists.
    en_GB = "English"
    af_ZA = "Afrikaans"
    an_ES = "Aragonés"
    ar_SA = "العربية"
    as_IN = "অসমীয়া"
    av_DA = "авар мацӀ"
    az_AZ = "Azərbaycanca"
    be_BY = "Беларуская"
    bg_BG = "български език"
    bn_BD = "বাংলা"
    br_FR = "Brezhoneg"
    bs_BA = "Bosanski"
    ca_ES = "Català, valencià"
    ckb_IR = "کوردی سۆرانی"
    co_FR = "Corsu"
    cs_CZ = "Čeština"
    cv_CU = "чӑваш чӗлхи"
    cy_GB = "Cymraeg"
    da_DK = "Dansk"
    de_DE = "Deutsch"
    el_GR = "Ελληνικά"
    en_US = "English (US)"
    eo_UY = "Esperanto"
    es_ES = "Español"
    et_EE = "Eesti keel"
    eu_ES = "Euskara"
    fa_IR = "فارسی"
    fi_FI = "Suomen kieli"
    fo_FO = "Føroyskt"
    fr_FR = "Français"
    frp_IT = "Arpitan"
    fy_NL = "Frysk"
    ga_IE = "Gaeilge"
    gd_GB = "Gàidhlig"
    gl_ES = "Galego"
    gsw_CH = "Schwizerdütsch"
    gu_IN = "ગુજરાતી"
    he_IL = "עִבְרִית"
    hi_IN = "हिन्दी, हिंदी"
    hr_HR = "Hrvatski"
    hu_HU = "Magyar"
    hy_AM = "Հայերեն"
    ia_IA = "Interlingua"
    id_ID = "Bahasa Indonesia"
    io_EN = "Ido"
    is_IS = "Íslenska"
    it_IT = "Italiano"
    ja_JP = "日本語"
    jbo_EN = "Lojban"
    jv_ID = "Basa Jawa"
    ka_GE = "ქართული"
    kab_DZ = "Taqvaylit"
    kk_KZ = "қазақша"
    kmr_TR = "Kurdî (Kurmancî)"
    kn_IN = "ಕನ್ನಡ"
    ko_KR = "한국어"
    ky_KG = "кыргызча"
    la_LA = "Lingua Latina"
    lb_LU = "Lëtzebuergesch"
    lt_LT = "Lietuvių kalba"
    lv_LV = "Latviešu valoda"
    mg_MG = "Fiteny malagasy"
    mk_MK = "македонски јази"
    ml_IN = "മലയാളം"
    mn_MN = "монгол"
    mr_IN = "मराठी"
    nb_NO = "Norsk bokmål"
    ne_NP = "नेपाली"
    nl_NL = "Nederlands"
    nn_NO = "Norsk nynorsk"
    pi_IN = "पालि"
    pl_PL = "Polski"
    ps_AF = "پښتو"
    pt_PT = "Português"
    pt_BR = "Português (BR)"
    ro_RO = "Română"
    ru_RU = "русский язык"
    ry_UA = "Русинська бисїда"
    sa_IN = "संस्कृत"
    sk_SK = "Slovenčina"
    sl_SI = "Slovenščina"
    sq_AL = "Shqip"
    sr_SP = "Српски језик"
    sv_SE = "Svenska"
    sw_KE = "Kiswahili"
    ta_IN = "தமிழ்"
    tg_TJ = "тоҷикӣ"
    th_TH = "ไทย"
    tk_TM = "Türkmençe"
    tl_PH = "Tagalog"
    tp_TP = "Toki pona"
    tr_TR = "Türkçe"
    uk_UA = "українська"
    ur_PK = "اُردُو"
    uz_UZ = "oʻzbekcha"
    vi_VN = "Tiếng Việt"
    yo_NG = "Yorùbá"
    zh_CN = "中文"
    zh_TW = "繁體中文"
    zu_ZA = "isiZulu"
# Lookup tables between hyphenated locale codes (the enum member name with
# "_" replaced by "-", e.g. "en-GB") and language display names.
code_to_lang = {
    member.name.replace("_", "-"): member.value for member in Language
}
# Inverse of the table above: display name -> hyphenated locale code.
lang_to_code = {name: code for code, name in code_to_lang.items()}

View File

@ -8,6 +8,7 @@ CREATE TABLE coach_scraper.export
, username VARCHAR(255) NOT NULL , username VARCHAR(255) NOT NULL
, name VARCHAR(255) , name VARCHAR(255)
, image_url TEXT , image_url TEXT
, languages TEXT[]
, rapid INT , rapid INT
, blitz INT , blitz INT
, bullet INT , bullet INT