Add language detection for chesscom profiles.
parent
f2fd289225
commit
44a18fc59c
|
@ -4,6 +4,7 @@ from typing import List
|
||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
import psycopg2
|
import psycopg2
|
||||||
|
from lingua import LanguageDetector, LanguageDetectorBuilder
|
||||||
|
|
||||||
from app.chesscom import Pipeline as ChesscomPipeline
|
from app.chesscom import Pipeline as ChesscomPipeline
|
||||||
from app.database import backup_database, load_languages
|
from app.database import backup_database, load_languages
|
||||||
|
@ -14,21 +15,31 @@ from app.types import Site
|
||||||
WORKER_COUNT = 10
|
WORKER_COUNT = 10
|
||||||
|
|
||||||
|
|
||||||
async def _process(site: Site, conn, session: aiohttp.ClientSession):
|
async def _process(
|
||||||
|
site: Site, conn, detector: LanguageDetector, session: aiohttp.ClientSession
|
||||||
|
):
|
||||||
if site == Site.CHESSCOM:
|
if site == Site.CHESSCOM:
|
||||||
await ChesscomPipeline(worker_count=WORKER_COUNT).process(conn, session)
|
await ChesscomPipeline(worker_count=WORKER_COUNT).process(
|
||||||
|
conn, detector, session
|
||||||
|
)
|
||||||
elif site == Site.LICHESS:
|
elif site == Site.LICHESS:
|
||||||
await LichessPipeline(worker_count=WORKER_COUNT).process(conn, session)
|
await LichessPipeline(worker_count=WORKER_COUNT).process(
|
||||||
|
conn, detector, session
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
assert False, f"Encountered unknown site: {site}."
|
assert False, f"Encountered unknown site: {site}."
|
||||||
|
|
||||||
|
|
||||||
async def _entrypoint(conn, user_agent: str, sites: List[Site]):
|
async def _entrypoint(
|
||||||
|
conn, detector: LanguageDetector, user_agent: str, sites: List[Site]
|
||||||
|
):
|
||||||
"""Top-level entrypoint that dispatches a pipeline per requested site."""
|
"""Top-level entrypoint that dispatches a pipeline per requested site."""
|
||||||
async with aiohttp.ClientSession(
|
async with aiohttp.ClientSession(
|
||||||
headers={"User-Agent": f"BoardWise coach-scraper ({user_agent})"}
|
headers={"User-Agent": f"BoardWise coach-scraper ({user_agent})"}
|
||||||
) as session:
|
) as session:
|
||||||
await asyncio.gather(*[_process(site, conn, session) for site in sites])
|
await asyncio.gather(
|
||||||
|
*[_process(site, conn, detector, session) for site in sites]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
@ -58,6 +69,8 @@ def main():
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
detector = LanguageDetectorBuilder.from_all_languages().build()
|
||||||
|
|
||||||
conn = None
|
conn = None
|
||||||
try:
|
try:
|
||||||
conn = psycopg2.connect(
|
conn = psycopg2.connect(
|
||||||
|
@ -72,6 +85,7 @@ def main():
|
||||||
asyncio.run(
|
asyncio.run(
|
||||||
_entrypoint(
|
_entrypoint(
|
||||||
conn=conn,
|
conn=conn,
|
||||||
|
detector=detector,
|
||||||
user_agent=args.user_agent,
|
user_agent=args.user_agent,
|
||||||
sites=list(map(Site, set(args.site))),
|
sites=list(map(Site, set(args.site))),
|
||||||
)
|
)
|
||||||
|
|
|
@ -6,7 +6,9 @@ from typing import List
|
||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
from bs4 import BeautifulSoup, SoupStrainer, Tag
|
from bs4 import BeautifulSoup, SoupStrainer, Tag
|
||||||
|
from lingua import LanguageDetector
|
||||||
|
|
||||||
|
from app.locale import Locale, lang_to_locale
|
||||||
from app.pipeline import Extractor as BaseExtractor
|
from app.pipeline import Extractor as BaseExtractor
|
||||||
from app.pipeline import Fetcher as BaseFetcher
|
from app.pipeline import Fetcher as BaseFetcher
|
||||||
from app.pipeline import Pipeline as BasePipeline
|
from app.pipeline import Pipeline as BasePipeline
|
||||||
|
@ -97,16 +99,19 @@ class Fetcher(BaseFetcher):
|
||||||
|
|
||||||
|
|
||||||
def _profile_filter(elem: Tag | str | None, attrs={}) -> bool:
|
def _profile_filter(elem: Tag | str | None, attrs={}) -> bool:
|
||||||
if "profile-header-info" in attrs.get("class", ""):
|
for className in [
|
||||||
return True
|
"profile-header-info",
|
||||||
if "profile-card-info" in attrs.get("class", ""):
|
"profile-card-info",
|
||||||
|
"profile-about",
|
||||||
|
]:
|
||||||
|
if className in attrs.get("class", ""):
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
class Extractor(BaseExtractor):
|
class Extractor(BaseExtractor):
|
||||||
def __init__(self, fetcher: BaseFetcher, username: str):
|
def __init__(self, fetcher: BaseFetcher, detector: LanguageDetector, username: str):
|
||||||
super().__init__(fetcher, username)
|
super().__init__(fetcher, detector, username)
|
||||||
|
|
||||||
self.profile_soup = None
|
self.profile_soup = None
|
||||||
try:
|
try:
|
||||||
|
@ -164,9 +169,19 @@ class Extractor(BaseExtractor):
|
||||||
except ValueError:
|
except ValueError:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def get_languages(self) -> List[str] | None:
|
def get_languages(self) -> List[Locale] | None:
|
||||||
# TODO: Extract using huggingface model.
|
if self.profile_soup is None:
|
||||||
return None
|
return None
|
||||||
|
about = self.profile_soup.find("div", class_="profile-about")
|
||||||
|
if not isinstance(about, Tag):
|
||||||
|
return None
|
||||||
|
detected = self.detector.detect_language_of(about.text)
|
||||||
|
if detected is None:
|
||||||
|
return None
|
||||||
|
code = lang_to_locale.get(detected)
|
||||||
|
if code is None:
|
||||||
|
return None
|
||||||
|
return [code]
|
||||||
|
|
||||||
def get_rapid(self) -> int | None:
|
def get_rapid(self) -> int | None:
|
||||||
return self.stats_json.get("rapid", {}).get("rating")
|
return self.stats_json.get("rapid", {}).get("rating")
|
||||||
|
@ -182,5 +197,7 @@ class Pipeline(BasePipeline):
|
||||||
def get_fetcher(self, session: aiohttp.ClientSession):
|
def get_fetcher(self, session: aiohttp.ClientSession):
|
||||||
return Fetcher(session)
|
return Fetcher(session)
|
||||||
|
|
||||||
def get_extractor(self, fetcher: BaseFetcher, username: str):
|
def get_extractor(
|
||||||
return Extractor(fetcher, username)
|
self, fetcher: BaseFetcher, detector: LanguageDetector, username: str
|
||||||
|
):
|
||||||
|
return Extractor(fetcher, detector, username)
|
||||||
|
|
|
@ -4,7 +4,8 @@ from typing import List, Literal
|
||||||
|
|
||||||
from typing_extensions import TypedDict
|
from typing_extensions import TypedDict
|
||||||
|
|
||||||
from app.types import Site, Title, code_to_lang
|
from app.locale import Locale, locale_to_str, native_to_locale
|
||||||
|
from app.types import Site, Title
|
||||||
|
|
||||||
SCHEMA_NAME = "coach_scraper"
|
SCHEMA_NAME = "coach_scraper"
|
||||||
MAIN_TABLE_NAME = "export"
|
MAIN_TABLE_NAME = "export"
|
||||||
|
@ -41,7 +42,7 @@ class Row(TypedDict, total=False):
|
||||||
# The FIDE title assigned to the coach on the source site.
|
# The FIDE title assigned to the coach on the source site.
|
||||||
title: Title
|
title: Title
|
||||||
# The list of languages the coach is fluent in.
|
# The list of languages the coach is fluent in.
|
||||||
languages: List[str]
|
languages: List[Locale]
|
||||||
# Rapid rating relative to the site they were sourced from.
|
# Rapid rating relative to the site they were sourced from.
|
||||||
rapid: int
|
rapid: int
|
||||||
# Blitz rating relative to the site they were sourced from.
|
# Blitz rating relative to the site they were sourced from.
|
||||||
|
@ -55,7 +56,7 @@ def load_languages(conn):
|
||||||
cursor = None
|
cursor = None
|
||||||
try:
|
try:
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
for pos, (code, name) in enumerate(list(code_to_lang.items())):
|
for pos, (name, loc) in enumerate(list(native_to_locale.items())):
|
||||||
cursor.execute(
|
cursor.execute(
|
||||||
f"""
|
f"""
|
||||||
INSERT INTO {SCHEMA_NAME}.{LANG_TABLE_NAME}
|
INSERT INTO {SCHEMA_NAME}.{LANG_TABLE_NAME}
|
||||||
|
@ -67,7 +68,7 @@ def load_languages(conn):
|
||||||
DO UPDATE SET
|
DO UPDATE SET
|
||||||
name = EXCLUDED.name;
|
name = EXCLUDED.name;
|
||||||
""",
|
""",
|
||||||
[code, name, pos],
|
[locale_to_str(loc), name, pos],
|
||||||
)
|
)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
finally:
|
finally:
|
||||||
|
@ -157,7 +158,7 @@ def upsert_row(conn, row: Row):
|
||||||
row.get("name"),
|
row.get("name"),
|
||||||
row.get("image_url"),
|
row.get("image_url"),
|
||||||
row["title"].value if "title" in row else None,
|
row["title"].value if "title" in row else None,
|
||||||
row.get("languages", []),
|
list(map(locale_to_str, row.get("languages", []))),
|
||||||
row.get("rapid"),
|
row.get("rapid"),
|
||||||
row.get("blitz"),
|
row.get("blitz"),
|
||||||
row.get("bullet"),
|
row.get("bullet"),
|
||||||
|
|
|
@ -5,11 +5,13 @@ from typing import List
|
||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
from bs4 import BeautifulSoup, SoupStrainer, Tag
|
from bs4 import BeautifulSoup, SoupStrainer, Tag
|
||||||
|
from lingua import LanguageDetector
|
||||||
|
|
||||||
|
from app.locale import Locale, native_to_locale
|
||||||
from app.pipeline import Extractor as BaseExtractor
|
from app.pipeline import Extractor as BaseExtractor
|
||||||
from app.pipeline import Fetcher as BaseFetcher
|
from app.pipeline import Fetcher as BaseFetcher
|
||||||
from app.pipeline import Pipeline as BasePipeline
|
from app.pipeline import Pipeline as BasePipeline
|
||||||
from app.types import Site, Title, lang_to_code
|
from app.types import Site, Title
|
||||||
|
|
||||||
# The number of pages we will at most iterate through. This number was
|
# The number of pages we will at most iterate through. This number was
|
||||||
# determined by going to https://lichess.org/coach/all/all/alphabetical
|
# determined by going to https://lichess.org/coach/all/all/alphabetical
|
||||||
|
@ -113,8 +115,8 @@ def _stats_filter(elem: Tag | str | None, attrs={}) -> bool:
|
||||||
|
|
||||||
|
|
||||||
class Extractor(BaseExtractor):
|
class Extractor(BaseExtractor):
|
||||||
def __init__(self, fetcher: BaseFetcher, username: str):
|
def __init__(self, fetcher: BaseFetcher, detector: LanguageDetector, username: str):
|
||||||
super().__init__(fetcher, username)
|
super().__init__(fetcher, detector, username)
|
||||||
|
|
||||||
self.profile_soup = None
|
self.profile_soup = None
|
||||||
try:
|
try:
|
||||||
|
@ -175,7 +177,7 @@ class Extractor(BaseExtractor):
|
||||||
except ValueError:
|
except ValueError:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def get_languages(self) -> List[str] | None:
|
def get_languages(self) -> List[Locale] | None:
|
||||||
if self.profile_soup is None:
|
if self.profile_soup is None:
|
||||||
return None
|
return None
|
||||||
tr = self.profile_soup.find("tr", class_="languages")
|
tr = self.profile_soup.find("tr", class_="languages")
|
||||||
|
@ -187,8 +189,8 @@ class Extractor(BaseExtractor):
|
||||||
|
|
||||||
codes = []
|
codes = []
|
||||||
for lang in [s.strip() for s in tr.get_text().split(",")]:
|
for lang in [s.strip() for s in tr.get_text().split(",")]:
|
||||||
if lang in lang_to_code:
|
if lang in native_to_locale:
|
||||||
codes.append(lang_to_code[lang])
|
codes.append(native_to_locale[lang])
|
||||||
return codes
|
return codes
|
||||||
|
|
||||||
def get_rapid(self) -> int | None:
|
def get_rapid(self) -> int | None:
|
||||||
|
@ -225,5 +227,7 @@ class Pipeline(BasePipeline):
|
||||||
def get_fetcher(self, session: aiohttp.ClientSession):
|
def get_fetcher(self, session: aiohttp.ClientSession):
|
||||||
return Fetcher(session)
|
return Fetcher(session)
|
||||||
|
|
||||||
def get_extractor(self, fetcher: BaseFetcher, username: str):
|
def get_extractor(
|
||||||
return Extractor(fetcher, username)
|
self, fetcher: BaseFetcher, detector: LanguageDetector, username: str
|
||||||
|
):
|
||||||
|
return Extractor(fetcher, detector, username)
|
||||||
|
|
|
@ -0,0 +1,154 @@
|
||||||
|
import enum
|
||||||
|
from collections import OrderedDict
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
from lingua import Language
|
||||||
|
|
||||||
|
|
||||||
|
class Locale(enum.Enum):
|
||||||
|
"""Maps {language}_{country} to the name of the language in said language."""
|
||||||
|
|
||||||
|
en_GB = "English"
|
||||||
|
af_ZA = "Afrikaans"
|
||||||
|
an_ES = "Aragonés"
|
||||||
|
ar_SA = "العربية"
|
||||||
|
as_IN = "অসমীয়া"
|
||||||
|
av_DA = "авар мацӀ"
|
||||||
|
az_AZ = "Azərbaycanca"
|
||||||
|
be_BY = "Беларуская"
|
||||||
|
bg_BG = "български език"
|
||||||
|
bn_BD = "বাংলা"
|
||||||
|
br_FR = "Brezhoneg"
|
||||||
|
bs_BA = "Bosanski"
|
||||||
|
ca_ES = "Català, valencià"
|
||||||
|
ckb_IR = "کوردی سۆرانی"
|
||||||
|
co_FR = "Corsu"
|
||||||
|
cs_CZ = "Čeština"
|
||||||
|
cv_CU = "чӑваш чӗлхи"
|
||||||
|
cy_GB = "Cymraeg"
|
||||||
|
da_DK = "Dansk"
|
||||||
|
de_DE = "Deutsch"
|
||||||
|
el_GR = "Ελληνικά"
|
||||||
|
en_US = "English (US)"
|
||||||
|
eo_UY = "Esperanto"
|
||||||
|
es_ES = "Español"
|
||||||
|
et_EE = "Eesti keel"
|
||||||
|
eu_ES = "Euskara"
|
||||||
|
fa_IR = "فارسی"
|
||||||
|
fi_FI = "Suomen kieli"
|
||||||
|
fo_FO = "Føroyskt"
|
||||||
|
fr_FR = "Français"
|
||||||
|
frp_IT = "Arpitan"
|
||||||
|
fy_NL = "Frysk"
|
||||||
|
ga_IE = "Gaeilge"
|
||||||
|
gd_GB = "Gàidhlig"
|
||||||
|
gl_ES = "Galego"
|
||||||
|
gsw_CH = "Schwizerdütsch"
|
||||||
|
gu_IN = "ગુજરાતી"
|
||||||
|
he_IL = "עִבְרִית"
|
||||||
|
hi_IN = "हिन्दी, हिंदी"
|
||||||
|
hr_HR = "Hrvatski"
|
||||||
|
hu_HU = "Magyar"
|
||||||
|
hy_AM = "Հայերեն"
|
||||||
|
ia_IA = "Interlingua"
|
||||||
|
id_ID = "Bahasa Indonesia"
|
||||||
|
io_EN = "Ido"
|
||||||
|
is_IS = "Íslenska"
|
||||||
|
it_IT = "Italiano"
|
||||||
|
ja_JP = "日本語"
|
||||||
|
jbo_EN = "Lojban"
|
||||||
|
jv_ID = "Basa Jawa"
|
||||||
|
ka_GE = "ქართული"
|
||||||
|
kab_DZ = "Taqvaylit"
|
||||||
|
kk_KZ = "қазақша"
|
||||||
|
kmr_TR = "Kurdî (Kurmancî)"
|
||||||
|
kn_IN = "ಕನ್ನಡ"
|
||||||
|
ko_KR = "한국어"
|
||||||
|
ky_KG = "кыргызча"
|
||||||
|
la_LA = "Lingua Latina"
|
||||||
|
lb_LU = "Lëtzebuergesch"
|
||||||
|
lt_LT = "Lietuvių kalba"
|
||||||
|
lv_LV = "Latviešu valoda"
|
||||||
|
mg_MG = "Fiteny malagasy"
|
||||||
|
mk_MK = "македонски јази"
|
||||||
|
ml_IN = "മലയാളം"
|
||||||
|
mn_MN = "монгол"
|
||||||
|
mr_IN = "मराठी"
|
||||||
|
ms_MY = "Melayu"
|
||||||
|
nb_NO = "Norsk bokmål"
|
||||||
|
ne_NP = "नेपाली"
|
||||||
|
nl_NL = "Nederlands"
|
||||||
|
nn_NO = "Norsk nynorsk"
|
||||||
|
pi_IN = "पालि"
|
||||||
|
pl_PL = "Polski"
|
||||||
|
ps_AF = "پښتو"
|
||||||
|
pt_PT = "Português"
|
||||||
|
pt_BR = "Português (BR)"
|
||||||
|
ro_RO = "Română"
|
||||||
|
ru_RU = "русский язык"
|
||||||
|
ry_UA = "Русинська бисїда"
|
||||||
|
sa_IN = "संस्कृत"
|
||||||
|
sk_SK = "Slovenčina"
|
||||||
|
sl_SI = "Slovenščina"
|
||||||
|
sq_AL = "Shqip"
|
||||||
|
sr_SP = "Српски језик"
|
||||||
|
sv_SE = "Svenska"
|
||||||
|
sw_KE = "Kiswahili"
|
||||||
|
ta_IN = "தமிழ்"
|
||||||
|
tg_TJ = "тоҷикӣ"
|
||||||
|
th_TH = "ไทย"
|
||||||
|
tk_TM = "Türkmençe"
|
||||||
|
tl_PH = "Tagalog"
|
||||||
|
tp_TP = "Toki pona"
|
||||||
|
tr_TR = "Türkçe"
|
||||||
|
uk_UA = "українська"
|
||||||
|
ur_PK = "اُردُو"
|
||||||
|
uz_UZ = "oʻzbekcha"
|
||||||
|
vi_VN = "Tiếng Việt"
|
||||||
|
yo_NG = "Yorùbá"
|
||||||
|
zh_CN = "中文"
|
||||||
|
zh_TW = "繁體中文"
|
||||||
|
zu_ZA = "isiZulu"
|
||||||
|
|
||||||
|
|
||||||
|
def locale_to_str(loc: Locale) -> str:
|
||||||
|
return loc.name.replace("_", "-")
|
||||||
|
|
||||||
|
|
||||||
|
# Uses the name of the language (in said language) as the key.
|
||||||
|
native_to_locale: OrderedDict[str, Locale] = OrderedDict(
|
||||||
|
[(loc.value, loc) for loc in Locale]
|
||||||
|
)
|
||||||
|
|
||||||
|
# Uses an inferred/detected language as the key. Mapping was manually created
|
||||||
|
# using https://github.com/pemistahl/lingua-rs/blob/main/src/isocode.rs#L40 as
|
||||||
|
# a reference.
|
||||||
|
lang_to_locale: Dict[Language, Locale] = {
|
||||||
|
Language.CHINESE: Locale.zh_CN,
|
||||||
|
Language.CROATIAN: Locale.hr_HR,
|
||||||
|
Language.DANISH: Locale.da_DK,
|
||||||
|
Language.DUTCH: Locale.nl_NL,
|
||||||
|
Language.ENGLISH: Locale.en_GB,
|
||||||
|
Language.FINNISH: Locale.fi_FI,
|
||||||
|
Language.FRENCH: Locale.fr_FR,
|
||||||
|
Language.GERMAN: Locale.de_DE,
|
||||||
|
Language.HUNGARIAN: Locale.hu_HU,
|
||||||
|
Language.ITALIAN: Locale.it_IT,
|
||||||
|
Language.KOREAN: Locale.ko_KR,
|
||||||
|
Language.LATIN: Locale.la_LA,
|
||||||
|
Language.MALAY: Locale.ms_MY,
|
||||||
|
Language.PERSIAN: Locale.fa_IR,
|
||||||
|
Language.POLISH: Locale.pl_PL,
|
||||||
|
Language.PORTUGUESE: Locale.pt_PT,
|
||||||
|
Language.ROMANIAN: Locale.ro_RO,
|
||||||
|
Language.RUSSIAN: Locale.ru_RU,
|
||||||
|
Language.SLOVENE: Locale.sl_SI,
|
||||||
|
Language.SPANISH: Locale.es_ES,
|
||||||
|
Language.SWAHILI: Locale.sw_KE,
|
||||||
|
Language.SWEDISH: Locale.sv_SE,
|
||||||
|
Language.TAGALOG: Locale.tl_PH,
|
||||||
|
Language.TURKISH: Locale.tr_TR,
|
||||||
|
Language.UKRAINIAN: Locale.uk_UA,
|
||||||
|
Language.VIETNAMESE: Locale.vi_VN,
|
||||||
|
Language.YORUBA: Locale.yo_NG,
|
||||||
|
}
|
|
@ -3,8 +3,10 @@ import os.path
|
||||||
from typing import Any, List, Tuple
|
from typing import Any, List, Tuple
|
||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
|
from lingua import LanguageDetector
|
||||||
|
|
||||||
from app.database import Row, RowKey, upsert_row
|
from app.database import Row, RowKey, upsert_row
|
||||||
|
from app.locale import Locale
|
||||||
from app.types import Site, Title
|
from app.types import Site, Title
|
||||||
|
|
||||||
|
|
||||||
|
@ -94,8 +96,9 @@ def _insert(row: Row, key: RowKey, value: Any):
|
||||||
|
|
||||||
|
|
||||||
class Extractor:
|
class Extractor:
|
||||||
def __init__(self, fetcher: Fetcher, username: str):
|
def __init__(self, fetcher: Fetcher, detector: LanguageDetector, username: str):
|
||||||
self.fetcher = fetcher
|
self.fetcher = fetcher
|
||||||
|
self.detector = detector
|
||||||
self.username = username
|
self.username = username
|
||||||
|
|
||||||
def get_name(self) -> str | None:
|
def get_name(self) -> str | None:
|
||||||
|
@ -107,7 +110,7 @@ class Extractor:
|
||||||
def get_title(self) -> Title | None:
|
def get_title(self) -> Title | None:
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
def get_languages(self) -> List[str] | None:
|
def get_languages(self) -> List[Locale] | None:
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
def get_rapid(self) -> int | None:
|
def get_rapid(self) -> int | None:
|
||||||
|
@ -157,10 +160,14 @@ class Pipeline:
|
||||||
def get_fetcher(self, session: aiohttp.ClientSession) -> Fetcher:
|
def get_fetcher(self, session: aiohttp.ClientSession) -> Fetcher:
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
def get_extractor(self, fetcher: Fetcher, username: str) -> Extractor:
|
def get_extractor(
|
||||||
|
self, fetcher: Fetcher, detector: LanguageDetector, username: str
|
||||||
|
) -> Extractor:
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
async def process(self, conn, session: aiohttp.ClientSession):
|
async def process(
|
||||||
|
self, conn, detector: LanguageDetector, session: aiohttp.ClientSession
|
||||||
|
):
|
||||||
fetcher = self.get_fetcher(session)
|
fetcher = self.get_fetcher(session)
|
||||||
|
|
||||||
queue: asyncio.Queue = asyncio.Queue()
|
queue: asyncio.Queue = asyncio.Queue()
|
||||||
|
@ -180,7 +187,7 @@ class Pipeline:
|
||||||
page_no += 1
|
page_no += 1
|
||||||
for username in usernames or []:
|
for username in usernames or []:
|
||||||
await fetcher._download_user_files(username)
|
await fetcher._download_user_files(username)
|
||||||
extractor = self.get_extractor(fetcher, username)
|
extractor = self.get_extractor(fetcher, detector, username)
|
||||||
queue.put_nowait((conn, extractor))
|
queue.put_nowait((conn, extractor))
|
||||||
|
|
||||||
# Wait until the queue is fully processed.
|
# Wait until the queue is fully processed.
|
||||||
|
|
113
app/types.py
113
app/types.py
|
@ -1,5 +1,4 @@
|
||||||
import enum
|
import enum
|
||||||
from collections import OrderedDict
|
|
||||||
|
|
||||||
|
|
||||||
class Site(enum.Enum):
|
class Site(enum.Enum):
|
||||||
|
@ -18,115 +17,3 @@ class Title(enum.Enum):
|
||||||
WFM = "WFM"
|
WFM = "WFM"
|
||||||
WCM = "WCM"
|
WCM = "WCM"
|
||||||
WNM = "WNM"
|
WNM = "WNM"
|
||||||
|
|
||||||
|
|
||||||
class Language(enum.Enum):
|
|
||||||
en_GB = "English"
|
|
||||||
af_ZA = "Afrikaans"
|
|
||||||
an_ES = "Aragonés"
|
|
||||||
ar_SA = "العربية"
|
|
||||||
as_IN = "অসমীয়া"
|
|
||||||
av_DA = "авар мацӀ"
|
|
||||||
az_AZ = "Azərbaycanca"
|
|
||||||
be_BY = "Беларуская"
|
|
||||||
bg_BG = "български език"
|
|
||||||
bn_BD = "বাংলা"
|
|
||||||
br_FR = "Brezhoneg"
|
|
||||||
bs_BA = "Bosanski"
|
|
||||||
ca_ES = "Català, valencià"
|
|
||||||
ckb_IR = "کوردی سۆرانی"
|
|
||||||
co_FR = "Corsu"
|
|
||||||
cs_CZ = "Čeština"
|
|
||||||
cv_CU = "чӑваш чӗлхи"
|
|
||||||
cy_GB = "Cymraeg"
|
|
||||||
da_DK = "Dansk"
|
|
||||||
de_DE = "Deutsch"
|
|
||||||
el_GR = "Ελληνικά"
|
|
||||||
en_US = "English (US)"
|
|
||||||
eo_UY = "Esperanto"
|
|
||||||
es_ES = "Español"
|
|
||||||
et_EE = "Eesti keel"
|
|
||||||
eu_ES = "Euskara"
|
|
||||||
fa_IR = "فارسی"
|
|
||||||
fi_FI = "Suomen kieli"
|
|
||||||
fo_FO = "Føroyskt"
|
|
||||||
fr_FR = "Français"
|
|
||||||
frp_IT = "Arpitan"
|
|
||||||
fy_NL = "Frysk"
|
|
||||||
ga_IE = "Gaeilge"
|
|
||||||
gd_GB = "Gàidhlig"
|
|
||||||
gl_ES = "Galego"
|
|
||||||
gsw_CH = "Schwizerdütsch"
|
|
||||||
gu_IN = "ગુજરાતી"
|
|
||||||
he_IL = "עִבְרִית"
|
|
||||||
hi_IN = "हिन्दी, हिंदी"
|
|
||||||
hr_HR = "Hrvatski"
|
|
||||||
hu_HU = "Magyar"
|
|
||||||
hy_AM = "Հայերեն"
|
|
||||||
ia_IA = "Interlingua"
|
|
||||||
id_ID = "Bahasa Indonesia"
|
|
||||||
io_EN = "Ido"
|
|
||||||
is_IS = "Íslenska"
|
|
||||||
it_IT = "Italiano"
|
|
||||||
ja_JP = "日本語"
|
|
||||||
jbo_EN = "Lojban"
|
|
||||||
jv_ID = "Basa Jawa"
|
|
||||||
ka_GE = "ქართული"
|
|
||||||
kab_DZ = "Taqvaylit"
|
|
||||||
kk_KZ = "қазақша"
|
|
||||||
kmr_TR = "Kurdî (Kurmancî)"
|
|
||||||
kn_IN = "ಕನ್ನಡ"
|
|
||||||
ko_KR = "한국어"
|
|
||||||
ky_KG = "кыргызча"
|
|
||||||
la_LA = "Lingua Latina"
|
|
||||||
lb_LU = "Lëtzebuergesch"
|
|
||||||
lt_LT = "Lietuvių kalba"
|
|
||||||
lv_LV = "Latviešu valoda"
|
|
||||||
mg_MG = "Fiteny malagasy"
|
|
||||||
mk_MK = "македонски јази"
|
|
||||||
ml_IN = "മലയാളം"
|
|
||||||
mn_MN = "монгол"
|
|
||||||
mr_IN = "मराठी"
|
|
||||||
nb_NO = "Norsk bokmål"
|
|
||||||
ne_NP = "नेपाली"
|
|
||||||
nl_NL = "Nederlands"
|
|
||||||
nn_NO = "Norsk nynorsk"
|
|
||||||
pi_IN = "पालि"
|
|
||||||
pl_PL = "Polski"
|
|
||||||
ps_AF = "پښتو"
|
|
||||||
pt_PT = "Português"
|
|
||||||
pt_BR = "Português (BR)"
|
|
||||||
ro_RO = "Română"
|
|
||||||
ru_RU = "русский язык"
|
|
||||||
ry_UA = "Русинська бисїда"
|
|
||||||
sa_IN = "संस्कृत"
|
|
||||||
sk_SK = "Slovenčina"
|
|
||||||
sl_SI = "Slovenščina"
|
|
||||||
sq_AL = "Shqip"
|
|
||||||
sr_SP = "Српски језик"
|
|
||||||
sv_SE = "Svenska"
|
|
||||||
sw_KE = "Kiswahili"
|
|
||||||
ta_IN = "தமிழ்"
|
|
||||||
tg_TJ = "тоҷикӣ"
|
|
||||||
th_TH = "ไทย"
|
|
||||||
tk_TM = "Türkmençe"
|
|
||||||
tl_PH = "Tagalog"
|
|
||||||
tp_TP = "Toki pona"
|
|
||||||
tr_TR = "Türkçe"
|
|
||||||
uk_UA = "українська"
|
|
||||||
ur_PK = "اُردُو"
|
|
||||||
uz_UZ = "oʻzbekcha"
|
|
||||||
vi_VN = "Tiếng Việt"
|
|
||||||
yo_NG = "Yorùbá"
|
|
||||||
zh_CN = "中文"
|
|
||||||
zh_TW = "繁體中文"
|
|
||||||
zu_ZA = "isiZulu"
|
|
||||||
|
|
||||||
|
|
||||||
code_to_lang = OrderedDict(
|
|
||||||
[(lang.name.replace("_", "-"), lang.value) for lang in Language]
|
|
||||||
)
|
|
||||||
|
|
||||||
lang_to_code = OrderedDict(
|
|
||||||
[(lang.value, lang.name.replace("_", "-")) for lang in Language]
|
|
||||||
)
|
|
||||||
|
|
|
@ -226,6 +226,68 @@ files = [
|
||||||
{file = "idna-3.6.tar.gz", hash = "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca"},
|
{file = "idna-3.6.tar.gz", hash = "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "lingua-language-detector"
|
||||||
|
version = "2.0.1"
|
||||||
|
description = "An accurate natural language detection library, suitable for short text and mixed-language text"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
files = [
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:6a8d473de427e7eff54a5eb68fd38d75053dfc7b59e256a7233cc7409435d8ce"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1b4d67769fe25d903dbc41a63a5d1707913ddd3e574b072c84001cef8472ead0"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:bcb9c8d273637ca69fa242184c8525c4bc56075d4b174d4adc68d81b11b814be"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0c42e66f3b20e97f4e4b442b3f2977b5aefb04968535fc0a78ccd4137db5ef34"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9563cc878cf9b45d78d9effe9b3b101cb3098b932f2921023e92f984bd5f5120"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp310-none-win32.whl", hash = "sha256:2cb95b8e8abb40703b0705321cd3394033812812bc18d559a9a26604b241a663"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp310-none-win_amd64.whl", hash = "sha256:b89d995ac9974b9190036585cbd0b70e6117a2d09f2150a898b332abd1a57636"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:06db94dbb5b492924a536dbf97c247b228a3fcb00fe5bef9ca83b6b1aa959ca8"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4d9dcf69a304819d5d2716943de980cccf140168e7d0243925bb98dd0c661600"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:7f66ed5cd572d07a1a853017f41bfd94e84e3081cc39690188adfa97337b199f"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e89c39a2a324d265c44f8ee7c3ffc499506d6307bb484ab1d9565f2d5857697b"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ae66cae403e36134558e929f8ba0d076be45e968f9fc7bab63869b19b34ddac1"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp311-none-win32.whl", hash = "sha256:d92f1e0b30233dc1091cb28fe00e0dba8255be609b2337c0dab7f71a2f9b5086"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp311-none-win_amd64.whl", hash = "sha256:60c1d0136c242097c58874a74d55b26e0c98ed81e44724d6426411b4bf585566"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:5531fbc6881149ce905e511e278ac97ed0e92d64b91b99910204058abe057769"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6193a0eac1534593427548d03cbcd82bcac040b3344a2abe67654c15a023c196"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:32644b03f107b9cee4a0884096acc21f1900dbec7951ede7c154d19d2a1a6f04"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c9890f5227a7a767253248dd64fb651e4405256d8376f39216a6ff6e2702a0ee"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d286e14105794300217513326e2d8e82911c5efe773c6a37991336fbd63f9e2"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp312-none-win32.whl", hash = "sha256:ec88318a16467fea3457208ff1052dbd72cc486f9f07caeb4325fa76cab9044c"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp312-none-win_amd64.whl", hash = "sha256:5fcd53f1a2dc84a00a79c56ca59e4580cfbbb829e5e56249835d31444cf1f9ea"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:8efa10f188c40e10016e4f0d2a34d743e1555ddf4bd47553f6dd420f673c0e78"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ca3f11f372658cda18b309998596de6ffceaf6461e376da9c2861ac9d8b7efa3"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:5599752ded097132d2a59b2764287240e72e2e9859bb69f43b2957f12d69ac6f"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27c1e422e5355d6931f82a1da52f822048be68c5f74d8b997c7d9f9617002e6a"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e489cb648684c5bb5a0d6cab2f602fe0bda7e72921f327ba7350e30c60edaa43"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp38-none-win32.whl", hash = "sha256:5d9f7f4b47c5bde5ff85089bcc4625f2f1a17e7677ec15dadb272b6e4b42c274"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp38-none-win_amd64.whl", hash = "sha256:493908b45ff237c8c776d1d2b688b113999667b841f284bae862af5f7f526b4f"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:e0c269a6099e2a637e2b8a312870792be2bb047abc6e7646122e498a159cc0b4"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7fcd61ec1c771f452346bc57d756fd079a89e9134b93ef94f5f166e27cda6385"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:3728012ace5b8e799df27d1e9272396bc5829a609f1aa799109a22e07e1034c2"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0619216bed8746501f0c3c4294ebe1bd55a2d9d72083e68dc0d954f9af5ab12e"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cde1651655cac14a3b9ea9c3319a700de66a1f73cd07d63e9e653fc84d02f11e"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp39-none-win32.whl", hash = "sha256:dc1cfcaea24d71d657cf75fb54b593d3db14cf3a19aef6cd1017d0982a407b4e"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-cp39-none-win_amd64.whl", hash = "sha256:71d0662458e025aae7a6cbb0cc4e8169ea75e5b615a85e3310964748449896dd"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-pp310-pypy310_pp73-macosx_10_7_x86_64.whl", hash = "sha256:42907729b8a343e91fd3d6e61b6add116c513354b6a88072245a4a1d21a18fb9"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:ed60cb9b9fead792ec97544a3888e7aeda2ae34503c6252a92df599e006d7253"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-pp310-pypy310_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:fa1a6ab7bec65695650e801e23648742e92651315cf1443e9002a1b88ea2ac41"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be1bf4faa1e645876c8dfc29b37f8f567451b48d43f45c916aba13b946d5069c"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31c7e52f4e0ca47b787fb5e05560a9e3d7a6bc10488a35a31248f82647314957"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-pp38-pypy38_pp73-macosx_10_7_x86_64.whl", hash = "sha256:c91fedd566b07ac1dc6c091bde6d69dae5c12c90e3e3c1d74fe29f76852e775a"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:16f5fecb6eba86bc094a734ba4bd0603b5bcc580a70c07d659ac2aec14f018ac"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:f21abb598efa1de408275e640a22e8b967a43a9fbb0f32302a206efabf3ca0bc"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8906262b3866ae4473b5d5f08703addf0b6b12bc9c9aefcf2e2d855c1496d47a"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc9a6d87e766bf6d2d014986fda13fb8aa6c5602811e856f5555dd8128bd4f2e"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:650a6df0c1c7c7d650c9872be1f8e4b6ba32ff363d8184f60ee80441cffad779"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:62d7e29b9e85289bdf80a02a6ef9fcc7b943a5d123eaafa313aad4cfe7b48767"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:ee213e9c0c4a256c2bbe2c1f074c9ac122073045049b6a7f999b507da185dea3"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:06787b555ad9ddae613c0850b2aff991ea3e87c1ef714166d9691f90d1ad366c"},
|
||||||
|
{file = "lingua_language_detector-2.0.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d983db8f8f0afbf2b4a796e6c8790490868c024492e59ec8b2fca30599c84760"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
test = ["pytest (==7.4.3)"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lxml"
|
name = "lxml"
|
||||||
version = "4.9.3"
|
version = "4.9.3"
|
||||||
|
@ -555,4 +617,4 @@ multidict = ">=4.0"
|
||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.11"
|
python-versions = "^3.11"
|
||||||
content-hash = "4a667c2246018c259f23d7fe07102b3b680693beccbc685467b1e29c2c3e2db6"
|
content-hash = "d6e80d9fee5ef164d85b7b32db698755af31847470468e3c35d655d5f44f95b4"
|
||||||
|
|
|
@ -11,6 +11,7 @@ beautifulsoup4 = "^4.12.2"
|
||||||
aiohttp = "^3.8.6"
|
aiohttp = "^3.8.6"
|
||||||
lxml = "^4.9.3"
|
lxml = "^4.9.3"
|
||||||
psycopg2 = "^2.9.9"
|
psycopg2 = "^2.9.9"
|
||||||
|
lingua-language-detector = "^2.0.1"
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["poetry-core"]
|
requires = ["poetry-core"]
|
||||||
|
@ -20,5 +21,5 @@ build-backend = "poetry.core.masonry.api"
|
||||||
app = "app.__main__:main"
|
app = "app.__main__:main"
|
||||||
|
|
||||||
[[tool.mypy.overrides]]
|
[[tool.mypy.overrides]]
|
||||||
module = "aiohttp"
|
module = ["aiohttp", "lingua"]
|
||||||
ignore_missing_imports = true
|
ignore_missing_imports = true
|
||||||
|
|
Loading…
Reference in New Issue