From 44a18fc59c32096ae2df00bd99c2ee71c84d0a2f Mon Sep 17 00:00:00 2001 From: Joshua Potter Date: Wed, 6 Dec 2023 20:53:54 -0700 Subject: [PATCH] Add language detection for chesscom profiles. --- app/__main__.py | 24 ++++++-- app/chesscom.py | 39 ++++++++---- app/database.py | 11 ++-- app/lichess.py | 20 ++++--- app/locale.py | 154 ++++++++++++++++++++++++++++++++++++++++++++++++ app/pipeline.py | 17 ++++-- app/types.py | 113 ----------------------------------- poetry.lock | 64 +++++++++++++++++++- pyproject.toml | 3 +- 9 files changed, 296 insertions(+), 149 deletions(-) create mode 100644 app/locale.py diff --git a/app/__main__.py b/app/__main__.py index f3a37c9..3956f7e 100644 --- a/app/__main__.py +++ b/app/__main__.py @@ -4,6 +4,7 @@ from typing import List import aiohttp import psycopg2 +from lingua import LanguageDetector, LanguageDetectorBuilder from app.chesscom import Pipeline as ChesscomPipeline from app.database import backup_database, load_languages @@ -14,21 +15,31 @@ from app.types import Site WORKER_COUNT = 10 -async def _process(site: Site, conn, session: aiohttp.ClientSession): +async def _process( + site: Site, conn, detector: LanguageDetector, session: aiohttp.ClientSession +): if site == Site.CHESSCOM: - await ChesscomPipeline(worker_count=WORKER_COUNT).process(conn, session) + await ChesscomPipeline(worker_count=WORKER_COUNT).process( + conn, detector, session + ) elif site == Site.LICHESS: - await LichessPipeline(worker_count=WORKER_COUNT).process(conn, session) + await LichessPipeline(worker_count=WORKER_COUNT).process( + conn, detector, session + ) else: assert False, f"Encountered unknown site: {site}." 
-async def _entrypoint(conn, user_agent: str, sites: List[Site]): +async def _entrypoint( + conn, detector: LanguageDetector, user_agent: str, sites: List[Site] +): """Top-level entrypoint that dispatches a pipeline per requested site.""" async with aiohttp.ClientSession( headers={"User-Agent": f"BoardWise coach-scraper ({user_agent})"} ) as session: - await asyncio.gather(*[_process(site, conn, session) for site in sites]) + await asyncio.gather( + *[_process(site, conn, detector, session) for site in sites] + ) def main(): @@ -58,6 +69,8 @@ def main(): args = parser.parse_args() + detector = LanguageDetectorBuilder.from_all_languages().build() + conn = None try: conn = psycopg2.connect( @@ -72,6 +85,7 @@ def main(): asyncio.run( _entrypoint( conn=conn, + detector=detector, user_agent=args.user_agent, sites=list(map(Site, set(args.site))), ) diff --git a/app/chesscom.py b/app/chesscom.py index 2b71439..4fd6aed 100644 --- a/app/chesscom.py +++ b/app/chesscom.py @@ -6,7 +6,9 @@ from typing import List import aiohttp from bs4 import BeautifulSoup, SoupStrainer, Tag +from lingua import LanguageDetector +from app.locale import Locale, lang_to_locale from app.pipeline import Extractor as BaseExtractor from app.pipeline import Fetcher as BaseFetcher from app.pipeline import Pipeline as BasePipeline @@ -97,16 +99,19 @@ class Fetcher(BaseFetcher): def _profile_filter(elem: Tag | str | None, attrs={}) -> bool: - if "profile-header-info" in attrs.get("class", ""): - return True - if "profile-card-info" in attrs.get("class", ""): - return True + for className in [ + "profile-header-info", + "profile-card-info", + "profile-about", + ]: + if className in attrs.get("class", ""): + return True return False class Extractor(BaseExtractor): - def __init__(self, fetcher: BaseFetcher, username: str): - super().__init__(fetcher, username) + def __init__(self, fetcher: BaseFetcher, detector: LanguageDetector, username: str): + super().__init__(fetcher, detector, username) 
self.profile_soup = None try: @@ -164,9 +169,19 @@ class Extractor(BaseExtractor): except ValueError: return None - def get_languages(self) -> List[str] | None: - # TODO: Extract using huggingface model. - return None + def get_languages(self) -> List[Locale] | None: + if self.profile_soup is None: + return None + about = self.profile_soup.find("div", class_="profile-about") + if not isinstance(about, Tag): + return None + detected = self.detector.detect_language_of(about.text) + if detected is None: + return None + code = lang_to_locale.get(detected) + if code is None: + return None + return [code] def get_rapid(self) -> int | None: return self.stats_json.get("rapid", {}).get("rating") @@ -182,5 +197,7 @@ class Pipeline(BasePipeline): def get_fetcher(self, session: aiohttp.ClientSession): return Fetcher(session) - def get_extractor(self, fetcher: BaseFetcher, username: str): - return Extractor(fetcher, username) + def get_extractor( + self, fetcher: BaseFetcher, detector: LanguageDetector, username: str + ): + return Extractor(fetcher, detector, username) diff --git a/app/database.py b/app/database.py index b3efcf0..20c6bc7 100644 --- a/app/database.py +++ b/app/database.py @@ -4,7 +4,8 @@ from typing import List, Literal from typing_extensions import TypedDict -from app.types import Site, Title, code_to_lang +from app.locale import Locale, locale_to_str, native_to_locale +from app.types import Site, Title SCHEMA_NAME = "coach_scraper" MAIN_TABLE_NAME = "export" @@ -41,7 +42,7 @@ class Row(TypedDict, total=False): # The FIDE title assigned to the coach on the source siste. title: Title # The list of languages the coach is fluent in. - languages: List[str] + languages: List[Locale] # Rapid rating relative to the site they were sourced from. rapid: int # Blitz rating relative to the site they were sourced from. 
@@ -55,7 +56,7 @@ def load_languages(conn): cursor = None try: cursor = conn.cursor() - for pos, (code, name) in enumerate(list(code_to_lang.items())): + for pos, (name, loc) in enumerate(list(native_to_locale.items())): cursor.execute( f""" INSERT INTO {SCHEMA_NAME}.{LANG_TABLE_NAME} @@ -67,7 +68,7 @@ def load_languages(conn): DO UPDATE SET name = EXCLUDED.name; """, - [code, name, pos], + [locale_to_str(loc), name, pos], ) conn.commit() finally: @@ -157,7 +158,7 @@ def upsert_row(conn, row: Row): row.get("name"), row.get("image_url"), row["title"].value if "title" in row else None, - row.get("languages", []), + list(map(locale_to_str, row.get("languages", []))), row.get("rapid"), row.get("blitz"), row.get("bullet"), diff --git a/app/lichess.py b/app/lichess.py index d151793..b4f7480 100644 --- a/app/lichess.py +++ b/app/lichess.py @@ -5,11 +5,13 @@ from typing import List import aiohttp from bs4 import BeautifulSoup, SoupStrainer, Tag +from lingua import LanguageDetector +from app.locale import Locale, native_to_locale from app.pipeline import Extractor as BaseExtractor from app.pipeline import Fetcher as BaseFetcher from app.pipeline import Pipeline as BasePipeline -from app.types import Site, Title, lang_to_code +from app.types import Site, Title # The number of pages we will at most iterate through. 
This number was # determined by going to https://lichess.org/coach/all/all/alphabetical @@ -113,8 +115,8 @@ def _stats_filter(elem: Tag | str | None, attrs={}) -> bool: class Extractor(BaseExtractor): - def __init__(self, fetcher: BaseFetcher, username: str): - super().__init__(fetcher, username) + def __init__(self, fetcher: BaseFetcher, detector: LanguageDetector, username: str): + super().__init__(fetcher, detector, username) self.profile_soup = None try: @@ -175,7 +177,7 @@ class Extractor(BaseExtractor): except ValueError: return None - def get_languages(self) -> List[str] | None: + def get_languages(self) -> List[Locale] | None: if self.profile_soup is None: return None tr = self.profile_soup.find("tr", class_="languages") @@ -187,8 +189,8 @@ class Extractor(BaseExtractor): codes = [] for lang in [s.strip() for s in tr.get_text().split(",")]: - if lang in lang_to_code: - codes.append(lang_to_code[lang]) + if lang in native_to_locale: + codes.append(native_to_locale[lang]) return codes def get_rapid(self) -> int | None: @@ -225,5 +227,7 @@ class Pipeline(BasePipeline): def get_fetcher(self, session: aiohttp.ClientSession): return Fetcher(session) - def get_extractor(self, fetcher: BaseFetcher, username: str): - return Extractor(fetcher, username) + def get_extractor( + self, fetcher: BaseFetcher, detector: LanguageDetector, username: str + ): + return Extractor(fetcher, detector, username) diff --git a/app/locale.py b/app/locale.py new file mode 100644 index 0000000..7cf1a2a --- /dev/null +++ b/app/locale.py @@ -0,0 +1,154 @@ +import enum +from collections import OrderedDict +from typing import Dict + +from lingua import Language + + +class Locale(enum.Enum): + """Maps {language}_{country} to the name of the language in said language.""" + + en_GB = "English" + af_ZA = "Afrikaans" + an_ES = "Aragonés" + ar_SA = "العربية" + as_IN = "অসমীয়া" + av_DA = "авар мацӀ" + az_AZ = "Azərbaycanca" + be_BY = "Беларуская" + bg_BG = "български език" + bn_BD = "বাংলা" + br_FR 
= "Brezhoneg" + bs_BA = "Bosanski" + ca_ES = "Català, valencià" + ckb_IR = "کوردی سۆرانی" + co_FR = "Corsu" + cs_CZ = "Čeština" + cv_CU = "чӑваш чӗлхи" + cy_GB = "Cymraeg" + da_DK = "Dansk" + de_DE = "Deutsch" + el_GR = "Ελληνικά" + en_US = "English (US)" + eo_UY = "Esperanto" + es_ES = "Español" + et_EE = "Eesti keel" + eu_ES = "Euskara" + fa_IR = "فارسی" + fi_FI = "Suomen kieli" + fo_FO = "Føroyskt" + fr_FR = "Français" + frp_IT = "Arpitan" + fy_NL = "Frysk" + ga_IE = "Gaeilge" + gd_GB = "Gàidhlig" + gl_ES = "Galego" + gsw_CH = "Schwizerdütsch" + gu_IN = "ગુજરાતી" + he_IL = "עִבְרִית" + hi_IN = "हिन्दी, हिंदी" + hr_HR = "Hrvatski" + hu_HU = "Magyar" + hy_AM = "Հայերեն" + ia_IA = "Interlingua" + id_ID = "Bahasa Indonesia" + io_EN = "Ido" + is_IS = "Íslenska" + it_IT = "Italiano" + ja_JP = "日本語" + jbo_EN = "Lojban" + jv_ID = "Basa Jawa" + ka_GE = "ქართული" + kab_DZ = "Taqvaylit" + kk_KZ = "қазақша" + kmr_TR = "Kurdî (Kurmancî)" + kn_IN = "ಕನ್ನಡ" + ko_KR = "한국어" + ky_KG = "кыргызча" + la_LA = "Lingua Latina" + lb_LU = "Lëtzebuergesch" + lt_LT = "Lietuvių kalba" + lv_LV = "Latviešu valoda" + mg_MG = "Fiteny malagasy" + mk_MK = "македонски јази" + ml_IN = "മലയാളം" + mn_MN = "монгол" + mr_IN = "मराठी" + ms_MY = "Melayu" + nb_NO = "Norsk bokmål" + ne_NP = "नेपाली" + nl_NL = "Nederlands" + nn_NO = "Norsk nynorsk" + pi_IN = "पालि" + pl_PL = "Polski" + ps_AF = "پښتو" + pt_PT = "Português" + pt_BR = "Português (BR)" + ro_RO = "Română" + ru_RU = "русский язык" + ry_UA = "Русинська бисїда" + sa_IN = "संस्कृत" + sk_SK = "Slovenčina" + sl_SI = "Slovenščina" + sq_AL = "Shqip" + sr_SP = "Српски језик" + sv_SE = "Svenska" + sw_KE = "Kiswahili" + ta_IN = "தமிழ்" + tg_TJ = "тоҷикӣ" + th_TH = "ไทย" + tk_TM = "Türkmençe" + tl_PH = "Tagalog" + tp_TP = "Toki pona" + tr_TR = "Türkçe" + uk_UA = "українська" + ur_PK = "اُردُو" + uz_UZ = "oʻzbekcha" + vi_VN = "Tiếng Việt" + yo_NG = "Yorùbá" + zh_CN = "中文" + zh_TW = "繁體中文" + zu_ZA = "isiZulu" + + +def locale_to_str(loc: Locale) -> str: + 
return loc.name.replace("_", "-") + + +# Uses the name of the language (in said language) as the key. +native_to_locale: OrderedDict[str, Locale] = OrderedDict( + [(loc.value, loc) for loc in Locale] +) + +# Uses an inferred/detected language as the key. Mapping was manually created +# using https://github.com/pemistahl/lingua-rs/blob/main/src/isocode.rs#L40 as +# a reference. +lang_to_locale: Dict[Language, Locale] = { + Language.CHINESE: Locale.zh_CN, + Language.CROATIAN: Locale.hr_HR, + Language.DANISH: Locale.da_DK, + Language.DUTCH: Locale.nl_NL, + Language.ENGLISH: Locale.en_GB, + Language.FINNISH: Locale.fi_FI, + Language.FRENCH: Locale.fr_FR, + Language.GERMAN: Locale.de_DE, + Language.HUNGARIAN: Locale.hu_HU, + Language.ITALIAN: Locale.it_IT, + Language.KOREAN: Locale.ko_KR, + Language.LATIN: Locale.la_LA, + Language.MALAY: Locale.ms_MY, + Language.PERSIAN: Locale.fa_IR, + Language.POLISH: Locale.pl_PL, + Language.PORTUGUESE: Locale.pt_PT, + Language.ROMANIAN: Locale.ro_RO, + Language.RUSSIAN: Locale.ru_RU, + Language.SLOVENE: Locale.sl_SI, + Language.SPANISH: Locale.es_ES, + Language.SWAHILI: Locale.sw_KE, + Language.SWEDISH: Locale.sv_SE, + Language.TAGALOG: Locale.tl_PH, + Language.TURKISH: Locale.tr_TR, + Language.UKRAINIAN: Locale.uk_UA, + Language.VIETNAMESE: Locale.vi_VN, + Language.YORUBA: Locale.yo_NG, +} diff --git a/app/pipeline.py b/app/pipeline.py index 96ba96f..7ea9cce 100644 --- a/app/pipeline.py +++ b/app/pipeline.py @@ -3,8 +3,10 @@ import os.path from typing import Any, List, Tuple import aiohttp +from lingua import LanguageDetector from app.database import Row, RowKey, upsert_row +from app.locale import Locale from app.types import Site, Title @@ -94,8 +96,9 @@ def _insert(row: Row, key: RowKey, value: Any): class Extractor: - def __init__(self, fetcher: Fetcher, username: str): + def __init__(self, fetcher: Fetcher, detector: LanguageDetector, username: str): self.fetcher = fetcher + self.detector = detector self.username = username def 
get_name(self) -> str | None: @@ -107,7 +110,7 @@ class Extractor: def get_title(self) -> Title | None: raise NotImplementedError() - def get_languages(self) -> List[str] | None: + def get_languages(self) -> List[Locale] | None: raise NotImplementedError() def get_rapid(self) -> int | None: @@ -157,10 +160,14 @@ class Pipeline: def get_fetcher(self, session: aiohttp.ClientSession) -> Fetcher: raise NotImplementedError() - def get_extractor(self, fetcher: Fetcher, username: str) -> Extractor: + def get_extractor( + self, fetcher: Fetcher, detector: LanguageDetector, username: str + ) -> Extractor: raise NotImplementedError() - async def process(self, conn, session: aiohttp.ClientSession): + async def process( + self, conn, detector: LanguageDetector, session: aiohttp.ClientSession + ): fetcher = self.get_fetcher(session) queue: asyncio.Queue = asyncio.Queue() @@ -180,7 +187,7 @@ class Pipeline: page_no += 1 for username in usernames or []: await fetcher._download_user_files(username) - extractor = self.get_extractor(fetcher, username) + extractor = self.get_extractor(fetcher, detector, username) queue.put_nowait((conn, extractor)) # Wait until the queue is fully processed. 
diff --git a/app/types.py b/app/types.py index a53797b..e9410bd 100644 --- a/app/types.py +++ b/app/types.py @@ -1,5 +1,4 @@ import enum -from collections import OrderedDict class Site(enum.Enum): @@ -18,115 +17,3 @@ class Title(enum.Enum): WFM = "WFM" WCM = "WCM" WNM = "WNM" - - -class Language(enum.Enum): - en_GB = "English" - af_ZA = "Afrikaans" - an_ES = "Aragonés" - ar_SA = "العربية" - as_IN = "অসমীয়া" - av_DA = "авар мацӀ" - az_AZ = "Azərbaycanca" - be_BY = "Беларуская" - bg_BG = "български език" - bn_BD = "বাংলা" - br_FR = "Brezhoneg" - bs_BA = "Bosanski" - ca_ES = "Català, valencià" - ckb_IR = "کوردی سۆرانی" - co_FR = "Corsu" - cs_CZ = "Čeština" - cv_CU = "чӑваш чӗлхи" - cy_GB = "Cymraeg" - da_DK = "Dansk" - de_DE = "Deutsch" - el_GR = "Ελληνικά" - en_US = "English (US)" - eo_UY = "Esperanto" - es_ES = "Español" - et_EE = "Eesti keel" - eu_ES = "Euskara" - fa_IR = "فارسی" - fi_FI = "Suomen kieli" - fo_FO = "Føroyskt" - fr_FR = "Français" - frp_IT = "Arpitan" - fy_NL = "Frysk" - ga_IE = "Gaeilge" - gd_GB = "Gàidhlig" - gl_ES = "Galego" - gsw_CH = "Schwizerdütsch" - gu_IN = "ગુજરાતી" - he_IL = "עִבְרִית" - hi_IN = "हिन्दी, हिंदी" - hr_HR = "Hrvatski" - hu_HU = "Magyar" - hy_AM = "Հայերեն" - ia_IA = "Interlingua" - id_ID = "Bahasa Indonesia" - io_EN = "Ido" - is_IS = "Íslenska" - it_IT = "Italiano" - ja_JP = "日本語" - jbo_EN = "Lojban" - jv_ID = "Basa Jawa" - ka_GE = "ქართული" - kab_DZ = "Taqvaylit" - kk_KZ = "қазақша" - kmr_TR = "Kurdî (Kurmancî)" - kn_IN = "ಕನ್ನಡ" - ko_KR = "한국어" - ky_KG = "кыргызча" - la_LA = "Lingua Latina" - lb_LU = "Lëtzebuergesch" - lt_LT = "Lietuvių kalba" - lv_LV = "Latviešu valoda" - mg_MG = "Fiteny malagasy" - mk_MK = "македонски јази" - ml_IN = "മലയാളം" - mn_MN = "монгол" - mr_IN = "मराठी" - nb_NO = "Norsk bokmål" - ne_NP = "नेपाली" - nl_NL = "Nederlands" - nn_NO = "Norsk nynorsk" - pi_IN = "पालि" - pl_PL = "Polski" - ps_AF = "پښتو" - pt_PT = "Português" - pt_BR = "Português (BR)" - ro_RO = "Română" - ru_RU = "русский язык" - ry_UA 
= "Русинська бисїда" - sa_IN = "संस्कृत" - sk_SK = "Slovenčina" - sl_SI = "Slovenščina" - sq_AL = "Shqip" - sr_SP = "Српски језик" - sv_SE = "Svenska" - sw_KE = "Kiswahili" - ta_IN = "தமிழ்" - tg_TJ = "тоҷикӣ" - th_TH = "ไทย" - tk_TM = "Türkmençe" - tl_PH = "Tagalog" - tp_TP = "Toki pona" - tr_TR = "Türkçe" - uk_UA = "українська" - ur_PK = "اُردُو" - uz_UZ = "oʻzbekcha" - vi_VN = "Tiếng Việt" - yo_NG = "Yorùbá" - zh_CN = "中文" - zh_TW = "繁體中文" - zu_ZA = "isiZulu" - - -code_to_lang = OrderedDict( - [(lang.name.replace("_", "-"), lang.value) for lang in Language] -) - -lang_to_code = OrderedDict( - [(lang.value, lang.name.replace("_", "-")) for lang in Language] -) diff --git a/poetry.lock b/poetry.lock index a8abcc9..8e3c65a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -226,6 +226,68 @@ files = [ {file = "idna-3.6.tar.gz", hash = "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca"}, ] +[[package]] +name = "lingua-language-detector" +version = "2.0.1" +description = "An accurate natural language detection library, suitable for short text and mixed-language text" +optional = false +python-versions = ">=3.8" +files = [ + {file = "lingua_language_detector-2.0.1-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:6a8d473de427e7eff54a5eb68fd38d75053dfc7b59e256a7233cc7409435d8ce"}, + {file = "lingua_language_detector-2.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1b4d67769fe25d903dbc41a63a5d1707913ddd3e574b072c84001cef8472ead0"}, + {file = "lingua_language_detector-2.0.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:bcb9c8d273637ca69fa242184c8525c4bc56075d4b174d4adc68d81b11b814be"}, + {file = "lingua_language_detector-2.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0c42e66f3b20e97f4e4b442b3f2977b5aefb04968535fc0a78ccd4137db5ef34"}, + {file = "lingua_language_detector-2.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:9563cc878cf9b45d78d9effe9b3b101cb3098b932f2921023e92f984bd5f5120"}, + {file = "lingua_language_detector-2.0.1-cp310-none-win32.whl", hash = "sha256:2cb95b8e8abb40703b0705321cd3394033812812bc18d559a9a26604b241a663"}, + {file = "lingua_language_detector-2.0.1-cp310-none-win_amd64.whl", hash = "sha256:b89d995ac9974b9190036585cbd0b70e6117a2d09f2150a898b332abd1a57636"}, + {file = "lingua_language_detector-2.0.1-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:06db94dbb5b492924a536dbf97c247b228a3fcb00fe5bef9ca83b6b1aa959ca8"}, + {file = "lingua_language_detector-2.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4d9dcf69a304819d5d2716943de980cccf140168e7d0243925bb98dd0c661600"}, + {file = "lingua_language_detector-2.0.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:7f66ed5cd572d07a1a853017f41bfd94e84e3081cc39690188adfa97337b199f"}, + {file = "lingua_language_detector-2.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e89c39a2a324d265c44f8ee7c3ffc499506d6307bb484ab1d9565f2d5857697b"}, + {file = "lingua_language_detector-2.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ae66cae403e36134558e929f8ba0d076be45e968f9fc7bab63869b19b34ddac1"}, + {file = "lingua_language_detector-2.0.1-cp311-none-win32.whl", hash = "sha256:d92f1e0b30233dc1091cb28fe00e0dba8255be609b2337c0dab7f71a2f9b5086"}, + {file = "lingua_language_detector-2.0.1-cp311-none-win_amd64.whl", hash = "sha256:60c1d0136c242097c58874a74d55b26e0c98ed81e44724d6426411b4bf585566"}, + {file = "lingua_language_detector-2.0.1-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:5531fbc6881149ce905e511e278ac97ed0e92d64b91b99910204058abe057769"}, + {file = "lingua_language_detector-2.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6193a0eac1534593427548d03cbcd82bcac040b3344a2abe67654c15a023c196"}, + {file = "lingua_language_detector-2.0.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.whl", hash = 
"sha256:32644b03f107b9cee4a0884096acc21f1900dbec7951ede7c154d19d2a1a6f04"}, + {file = "lingua_language_detector-2.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c9890f5227a7a767253248dd64fb651e4405256d8376f39216a6ff6e2702a0ee"}, + {file = "lingua_language_detector-2.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d286e14105794300217513326e2d8e82911c5efe773c6a37991336fbd63f9e2"}, + {file = "lingua_language_detector-2.0.1-cp312-none-win32.whl", hash = "sha256:ec88318a16467fea3457208ff1052dbd72cc486f9f07caeb4325fa76cab9044c"}, + {file = "lingua_language_detector-2.0.1-cp312-none-win_amd64.whl", hash = "sha256:5fcd53f1a2dc84a00a79c56ca59e4580cfbbb829e5e56249835d31444cf1f9ea"}, + {file = "lingua_language_detector-2.0.1-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:8efa10f188c40e10016e4f0d2a34d743e1555ddf4bd47553f6dd420f673c0e78"}, + {file = "lingua_language_detector-2.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ca3f11f372658cda18b309998596de6ffceaf6461e376da9c2861ac9d8b7efa3"}, + {file = "lingua_language_detector-2.0.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:5599752ded097132d2a59b2764287240e72e2e9859bb69f43b2957f12d69ac6f"}, + {file = "lingua_language_detector-2.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27c1e422e5355d6931f82a1da52f822048be68c5f74d8b997c7d9f9617002e6a"}, + {file = "lingua_language_detector-2.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e489cb648684c5bb5a0d6cab2f602fe0bda7e72921f327ba7350e30c60edaa43"}, + {file = "lingua_language_detector-2.0.1-cp38-none-win32.whl", hash = "sha256:5d9f7f4b47c5bde5ff85089bcc4625f2f1a17e7677ec15dadb272b6e4b42c274"}, + {file = "lingua_language_detector-2.0.1-cp38-none-win_amd64.whl", hash = "sha256:493908b45ff237c8c776d1d2b688b113999667b841f284bae862af5f7f526b4f"}, + {file = "lingua_language_detector-2.0.1-cp39-cp39-macosx_10_7_x86_64.whl", hash 
= "sha256:e0c269a6099e2a637e2b8a312870792be2bb047abc6e7646122e498a159cc0b4"}, + {file = "lingua_language_detector-2.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7fcd61ec1c771f452346bc57d756fd079a89e9134b93ef94f5f166e27cda6385"}, + {file = "lingua_language_detector-2.0.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:3728012ace5b8e799df27d1e9272396bc5829a609f1aa799109a22e07e1034c2"}, + {file = "lingua_language_detector-2.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0619216bed8746501f0c3c4294ebe1bd55a2d9d72083e68dc0d954f9af5ab12e"}, + {file = "lingua_language_detector-2.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cde1651655cac14a3b9ea9c3319a700de66a1f73cd07d63e9e653fc84d02f11e"}, + {file = "lingua_language_detector-2.0.1-cp39-none-win32.whl", hash = "sha256:dc1cfcaea24d71d657cf75fb54b593d3db14cf3a19aef6cd1017d0982a407b4e"}, + {file = "lingua_language_detector-2.0.1-cp39-none-win_amd64.whl", hash = "sha256:71d0662458e025aae7a6cbb0cc4e8169ea75e5b615a85e3310964748449896dd"}, + {file = "lingua_language_detector-2.0.1-pp310-pypy310_pp73-macosx_10_7_x86_64.whl", hash = "sha256:42907729b8a343e91fd3d6e61b6add116c513354b6a88072245a4a1d21a18fb9"}, + {file = "lingua_language_detector-2.0.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:ed60cb9b9fead792ec97544a3888e7aeda2ae34503c6252a92df599e006d7253"}, + {file = "lingua_language_detector-2.0.1-pp310-pypy310_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:fa1a6ab7bec65695650e801e23648742e92651315cf1443e9002a1b88ea2ac41"}, + {file = "lingua_language_detector-2.0.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be1bf4faa1e645876c8dfc29b37f8f567451b48d43f45c916aba13b946d5069c"}, + {file = "lingua_language_detector-2.0.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31c7e52f4e0ca47b787fb5e05560a9e3d7a6bc10488a35a31248f82647314957"}, 
+ {file = "lingua_language_detector-2.0.1-pp38-pypy38_pp73-macosx_10_7_x86_64.whl", hash = "sha256:c91fedd566b07ac1dc6c091bde6d69dae5c12c90e3e3c1d74fe29f76852e775a"}, + {file = "lingua_language_detector-2.0.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:16f5fecb6eba86bc094a734ba4bd0603b5bcc580a70c07d659ac2aec14f018ac"}, + {file = "lingua_language_detector-2.0.1-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:f21abb598efa1de408275e640a22e8b967a43a9fbb0f32302a206efabf3ca0bc"}, + {file = "lingua_language_detector-2.0.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8906262b3866ae4473b5d5f08703addf0b6b12bc9c9aefcf2e2d855c1496d47a"}, + {file = "lingua_language_detector-2.0.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc9a6d87e766bf6d2d014986fda13fb8aa6c5602811e856f5555dd8128bd4f2e"}, + {file = "lingua_language_detector-2.0.1-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:650a6df0c1c7c7d650c9872be1f8e4b6ba32ff363d8184f60ee80441cffad779"}, + {file = "lingua_language_detector-2.0.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:62d7e29b9e85289bdf80a02a6ef9fcc7b943a5d123eaafa313aad4cfe7b48767"}, + {file = "lingua_language_detector-2.0.1-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:ee213e9c0c4a256c2bbe2c1f074c9ac122073045049b6a7f999b507da185dea3"}, + {file = "lingua_language_detector-2.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:06787b555ad9ddae613c0850b2aff991ea3e87c1ef714166d9691f90d1ad366c"}, + {file = "lingua_language_detector-2.0.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d983db8f8f0afbf2b4a796e6c8790490868c024492e59ec8b2fca30599c84760"}, +] + +[package.extras] +test = ["pytest (==7.4.3)"] + [[package]] name = "lxml" version = "4.9.3" @@ -555,4 +617,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = 
"^3.11" -content-hash = "4a667c2246018c259f23d7fe07102b3b680693beccbc685467b1e29c2c3e2db6" +content-hash = "d6e80d9fee5ef164d85b7b32db698755af31847470468e3c35d655d5f44f95b4" diff --git a/pyproject.toml b/pyproject.toml index 6ea0d68..a162dbd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,7 @@ beautifulsoup4 = "^4.12.2" aiohttp = "^3.8.6" lxml = "^4.9.3" psycopg2 = "^2.9.9" +lingua-language-detector = "^2.0.1" [build-system] requires = ["poetry-core"] @@ -20,5 +21,5 @@ build-backend = "poetry.core.masonry.api" app = "app.__main__:main" [[tool.mypy.overrides]] -module = "aiohttp" +module = ["aiohttp", "lingua"] ignore_missing_imports = true