From ef5d2960975bf0e1eac704d1417eded2d191910a Mon Sep 17 00:00:00 2001 From: Joshua Potter Date: Tue, 5 Dec 2023 14:20:46 -0700 Subject: [PATCH] Scrape languages from lichess listing. (#10) --- app/__main__.py | 2 +- app/chesscom.py | 6 ++- app/database.py | 11 ++++- app/lichess.py | 18 +++++++- app/pipeline.py | 6 ++- app/site.py | 6 --- app/types.py | 114 ++++++++++++++++++++++++++++++++++++++++++++++++ sql/init.sql | 1 + 8 files changed, 152 insertions(+), 12 deletions(-) delete mode 100644 app/site.py create mode 100644 app/types.py diff --git a/app/__main__.py b/app/__main__.py index 7ee542b..99f6714 100644 --- a/app/__main__.py +++ b/app/__main__.py @@ -8,7 +8,7 @@ import psycopg2 from app.chesscom import Pipeline as ChesscomPipeline from app.database import backup_database from app.lichess import Pipeline as LichessPipeline -from app.site import Site +from app.types import Site # The number of parallel extraction jobs that are run at a time. WORKER_COUNT = 10 diff --git a/app/chesscom.py b/app/chesscom.py index 98129b0..ea0ef3a 100644 --- a/app/chesscom.py +++ b/app/chesscom.py @@ -10,7 +10,7 @@ from bs4 import BeautifulSoup, SoupStrainer, Tag from app.pipeline import Extractor as BaseExtractor from app.pipeline import Fetcher as BaseFetcher from app.pipeline import Pipeline as BasePipeline -from app.site import Site +from app.types import Site # The number of coach listing pages we will at most iterate through. This number # was determined by going to chess.com/coaches?sortBy=alphabetical&page=1 and @@ -156,6 +156,10 @@ class Extractor(BaseExtractor): return None return src + def get_languages(self) -> List[str] | None: + # TODO: Extract using huggingface model. 
+ return None + def get_rapid(self) -> int | None: return self.stats_json.get("rapid", {}).get("rating") diff --git a/app/database.py b/app/database.py index 6b4275d..1e7188a 100644 --- a/app/database.py +++ b/app/database.py @@ -1,10 +1,10 @@ import sys from datetime import datetime -from typing import Literal +from typing import List, Literal from typing_extensions import TypedDict -from app.site import Site +from app.types import Site SCHEMA_NAME = "coach_scraper" TABLE_NAME = "export" @@ -15,6 +15,7 @@ RowKey = ( | Literal["username"] | Literal["name"] | Literal["image_url"] + | Literal["languages"] | Literal["rapid"] | Literal["blitz"] | Literal["bullet"] @@ -35,6 +36,8 @@ class Row(TypedDict, total=False): name: str # Profile image used on the source site. image_url: str + # The list of languages the coach is fluent in. + languages: List[str] # Rapid rating relative to the site they were sourced from. rapid: int # Blitz rating relative to the site they were sourced from. @@ -90,6 +93,7 @@ def upsert_row(conn, row: Row): , username , name , image_url + , languages , rapid , blitz , bullet @@ -102,12 +106,14 @@ def upsert_row(conn, row: Row): , %s , %s , %s + , %s ) ON CONFLICT (site, username) DO UPDATE SET name = EXCLUDED.name, image_url = EXCLUDED.image_url, + languages = EXCLUDED.languages, rapid = EXCLUDED.rapid, blitz = EXCLUDED.blitz, bullet = EXCLUDED.bullet; @@ -117,6 +123,7 @@ def upsert_row(conn, row: Row): row["username"], row.get("name"), row.get("image_url"), + row.get("languages", []), row.get("rapid"), row.get("blitz"), row.get("bullet"), diff --git a/app/lichess.py b/app/lichess.py index e7e14e5..e830ee8 100644 --- a/app/lichess.py +++ b/app/lichess.py @@ -9,7 +9,7 @@ from bs4 import BeautifulSoup, SoupStrainer, Tag from app.pipeline import Extractor as BaseExtractor from app.pipeline import Fetcher as BaseFetcher from app.pipeline import Pipeline as BasePipeline -from app.site import Site +from app.types import Site, lang_to_code # The number 
of pages we will at most iterate through. This number was # determined by going to https://lichess.org/coach/all/all/alphabetical @@ -161,6 +161,22 @@ class Extractor(BaseExtractor): return None return src + def get_languages(self) -> List[str] | None: + if self.profile_soup is None: + return None + tr = self.profile_soup.find("tr", class_="languages") + if not isinstance(tr, Tag): + return None + td = tr.find("td") + if not isinstance(td, Tag): + return None + + codes = [] + for lang in [s.strip() for s in tr.get_text().split(",")]: + if lang in lang_to_code: + codes.append(lang_to_code[lang]) + return codes + def get_rapid(self) -> int | None: return self._find_rating("rapid") diff --git a/app/pipeline.py b/app/pipeline.py index a9280a6..4292ee8 100644 --- a/app/pipeline.py +++ b/app/pipeline.py @@ -5,7 +5,7 @@ from typing import Any, List, Tuple import aiohttp from app.database import Row, RowKey, upsert_row -from app.site import Site +from app.types import Site class Fetcher: @@ -104,6 +104,9 @@ class Extractor: def get_image_url(self) -> str | None: raise NotImplementedError() + def get_languages(self) -> List[str] | None: + raise NotImplementedError() + def get_rapid(self) -> int | None: raise NotImplementedError() @@ -122,6 +125,7 @@ class Extractor: _insert(row, "name", self.get_name()) _insert(row, "image_url", self.get_image_url()) + _insert(row, "languages", self.get_languages()) _insert(row, "rapid", self.get_rapid()) _insert(row, "blitz", self.get_blitz()) _insert(row, "bullet", self.get_bullet()) diff --git a/app/site.py b/app/site.py deleted file mode 100644 index ce5eb48..0000000 --- a/app/site.py +++ /dev/null @@ -1,6 +0,0 @@ -import enum - - -class Site(enum.Enum): - CHESSCOM = "chesscom" - LICHESS = "lichess" diff --git a/app/types.py b/app/types.py new file mode 100644 index 0000000..7014eca --- /dev/null +++ b/app/types.py @@ -0,0 +1,114 @@ +import enum + + +class Site(enum.Enum): + CHESSCOM = "chesscom" + LICHESS = "lichess" + + +class 
@enum.unique
class Language(enum.Enum):
    """Languages a coach can list, keyed by locale code.

    Each member name is a locale code written with an underscore (for
    example ``pt_BR``); each value is the language's display name as it is
    matched against scraped text.

    ``@enum.unique`` is applied on purpose: two members sharing a display
    name would silently turn the second into an alias, and aliases are
    skipped when iterating the enum, so the language would disappear from
    both lookup tables below without any error.
    """

    en_GB = "English"
    af_ZA = "Afrikaans"
    an_ES = "Aragonés"
    ar_SA = "العربية"
    as_IN = "অসমীয়া"
    av_DA = "авар мацӀ"
    az_AZ = "Azərbaycanca"
    be_BY = "Беларуская"
    bg_BG = "български език"
    bn_BD = "বাংলা"
    br_FR = "Brezhoneg"
    bs_BA = "Bosanski"
    ca_ES = "Català, valencià"
    ckb_IR = "کوردی سۆرانی"
    co_FR = "Corsu"
    cs_CZ = "Čeština"
    cv_CU = "чӑваш чӗлхи"
    cy_GB = "Cymraeg"
    da_DK = "Dansk"
    de_DE = "Deutsch"
    el_GR = "Ελληνικά"
    en_US = "English (US)"
    eo_UY = "Esperanto"
    es_ES = "Español"
    et_EE = "Eesti keel"
    eu_ES = "Euskara"
    fa_IR = "فارسی"
    fi_FI = "Suomen kieli"
    fo_FO = "Føroyskt"
    fr_FR = "Français"
    frp_IT = "Arpitan"
    fy_NL = "Frysk"
    ga_IE = "Gaeilge"
    gd_GB = "Gàidhlig"
    gl_ES = "Galego"
    gsw_CH = "Schwizerdütsch"
    gu_IN = "ગુજરાતી"
    he_IL = "עִבְרִית"
    hi_IN = "हिन्दी, हिंदी"
    hr_HR = "Hrvatski"
    hu_HU = "Magyar"
    hy_AM = "Հայերեն"
    ia_IA = "Interlingua"
    id_ID = "Bahasa Indonesia"
    io_EN = "Ido"
    is_IS = "Íslenska"
    it_IT = "Italiano"
    ja_JP = "日本語"
    jbo_EN = "Lojban"
    jv_ID = "Basa Jawa"
    ka_GE = "ქართული"
    kab_DZ = "Taqvaylit"
    kk_KZ = "қазақша"
    kmr_TR = "Kurdî (Kurmancî)"
    kn_IN = "ಕನ್ನಡ"
    ko_KR = "한국어"
    ky_KG = "кыргызча"
    la_LA = "Lingua Latina"
    lb_LU = "Lëtzebuergesch"
    lt_LT = "Lietuvių kalba"
    lv_LV = "Latviešu valoda"
    mg_MG = "Fiteny malagasy"
    mk_MK = "македонски јази"
    ml_IN = "മലയാളം"
    mn_MN = "монгол"
    mr_IN = "मराठी"
    nb_NO = "Norsk bokmål"
    ne_NP = "नेपाली"
    nl_NL = "Nederlands"
    nn_NO = "Norsk nynorsk"
    pi_IN = "पालि"
    pl_PL = "Polski"
    ps_AF = "پښتو"
    pt_PT = "Português"
    pt_BR = "Português (BR)"
    ro_RO = "Română"
    ru_RU = "русский язык"
    ry_UA = "Русинська бисїда"
    sa_IN = "संस्कृत"
    sk_SK = "Slovenčina"
    sl_SI = "Slovenščina"
    sq_AL = "Shqip"
    sr_SP = "Српски језик"
    sv_SE = "Svenska"
    sw_KE = "Kiswahili"
    ta_IN = "தமிழ்"
    tg_TJ = "тоҷикӣ"
    th_TH = "ไทย"
    tk_TM = "Türkmençe"
    tl_PH = "Tagalog"
    tp_TP = "Toki pona"
    tr_TR = "Türkçe"
    uk_UA = "українська"
    ur_PK = "اُردُو"
    uz_UZ = "oʻzbekcha"
    vi_VN = "Tiếng Việt"
    yo_NG = "Yorùbá"
    zh_CN = "中文"
    zh_TW = "繁體中文"
    zu_ZA = "isiZulu"


# Hyphenated locale code (e.g. "pt-BR") -> display name. Member names use an
# underscore only because a hyphen is not valid in a Python identifier.
code_to_lang = {lang.name.replace("_", "-"): lang.value for lang in Language}

# Display name -> hyphenated locale code. Inverting the table above is
# lossless because @enum.unique guarantees every display name is distinct.
lang_to_code = {name: code for code, name in code_to_lang.items()}