From 44a18fc59c32096ae2df00bd99c2ee71c84d0a2f Mon Sep 17 00:00:00 2001
From: Joshua Potter <jrpotter2112@gmail.com>
Date: Wed, 6 Dec 2023 20:53:54 -0700
Subject: [PATCH] Add language detection for chesscom profiles.

---
 app/__main__.py |  24 ++++++--
 app/chesscom.py |  39 ++++++++----
 app/database.py |  11 ++--
 app/lichess.py  |  20 ++++---
 app/locale.py   | 154 ++++++++++++++++++++++++++++++++++++++++++++++++
 app/pipeline.py |  17 ++++--
 app/types.py    | 113 -----------------------------------
 poetry.lock     |  64 +++++++++++++++++++-
 pyproject.toml  |   3 +-
 9 files changed, 296 insertions(+), 149 deletions(-)
 create mode 100644 app/locale.py

diff --git a/app/__main__.py b/app/__main__.py
index f3a37c9..3956f7e 100644
--- a/app/__main__.py
+++ b/app/__main__.py
@@ -4,6 +4,7 @@ from typing import List
 
 import aiohttp
 import psycopg2
+from lingua import LanguageDetector, LanguageDetectorBuilder
 
 from app.chesscom import Pipeline as ChesscomPipeline
 from app.database import backup_database, load_languages
@@ -14,21 +15,31 @@ from app.types import Site
 WORKER_COUNT = 10
 
 
-async def _process(site: Site, conn, session: aiohttp.ClientSession):
+async def _process(
+    site: Site, conn, detector: LanguageDetector, session: aiohttp.ClientSession
+):
     if site == Site.CHESSCOM:
-        await ChesscomPipeline(worker_count=WORKER_COUNT).process(conn, session)
+        await ChesscomPipeline(worker_count=WORKER_COUNT).process(
+            conn, detector, session
+        )
     elif site == Site.LICHESS:
-        await LichessPipeline(worker_count=WORKER_COUNT).process(conn, session)
+        await LichessPipeline(worker_count=WORKER_COUNT).process(
+            conn, detector, session
+        )
     else:
         assert False, f"Encountered unknown site: {site}."
 
 
-async def _entrypoint(conn, user_agent: str, sites: List[Site]):
+async def _entrypoint(
+    conn, detector: LanguageDetector, user_agent: str, sites: List[Site]
+):
     """Top-level entrypoint that dispatches a pipeline per requested site."""
     async with aiohttp.ClientSession(
         headers={"User-Agent": f"BoardWise coach-scraper ({user_agent})"}
     ) as session:
-        await asyncio.gather(*[_process(site, conn, session) for site in sites])
+        await asyncio.gather(
+            *[_process(site, conn, detector, session) for site in sites]
+        )
 
 
 def main():
@@ -58,6 +69,8 @@ def main():
 
     args = parser.parse_args()
 
+    detector = LanguageDetectorBuilder.from_all_languages().build()
+
     conn = None
     try:
         conn = psycopg2.connect(
@@ -72,6 +85,7 @@ def main():
         asyncio.run(
             _entrypoint(
                 conn=conn,
+                detector=detector,
                 user_agent=args.user_agent,
                 sites=list(map(Site, set(args.site))),
             )
diff --git a/app/chesscom.py b/app/chesscom.py
index 2b71439..4fd6aed 100644
--- a/app/chesscom.py
+++ b/app/chesscom.py
@@ -6,7 +6,9 @@ from typing import List
 
 import aiohttp
 from bs4 import BeautifulSoup, SoupStrainer, Tag
+from lingua import LanguageDetector
 
+from app.locale import Locale, lang_to_locale
 from app.pipeline import Extractor as BaseExtractor
 from app.pipeline import Fetcher as BaseFetcher
 from app.pipeline import Pipeline as BasePipeline
@@ -97,16 +99,19 @@ class Fetcher(BaseFetcher):
 
 
 def _profile_filter(elem: Tag | str | None, attrs={}) -> bool:
-    if "profile-header-info" in attrs.get("class", ""):
-        return True
-    if "profile-card-info" in attrs.get("class", ""):
-        return True
+    for className in [
+        "profile-header-info",
+        "profile-card-info",
+        "profile-about",
+    ]:
+        if className in attrs.get("class", ""):
+            return True
     return False
 
 
 class Extractor(BaseExtractor):
-    def __init__(self, fetcher: BaseFetcher, username: str):
-        super().__init__(fetcher, username)
+    def __init__(self, fetcher: BaseFetcher, detector: LanguageDetector, username: str):
+        super().__init__(fetcher, detector, username)
 
         self.profile_soup = None
         try:
@@ -164,9 +169,19 @@ class Extractor(BaseExtractor):
         except ValueError:
             return None
 
-    def get_languages(self) -> List[str] | None:
-        # TODO: Extract using huggingface model.
-        return None
+    def get_languages(self) -> List[Locale] | None:
+        if self.profile_soup is None:
+            return None
+        about = self.profile_soup.find("div", class_="profile-about")
+        if not isinstance(about, Tag):
+            return None
+        detected = self.detector.detect_language_of(about.text)
+        if detected is None:
+            return None
+        code = lang_to_locale.get(detected)
+        if code is None:
+            return None
+        return [code]
 
     def get_rapid(self) -> int | None:
         return self.stats_json.get("rapid", {}).get("rating")
@@ -182,5 +197,7 @@ class Pipeline(BasePipeline):
     def get_fetcher(self, session: aiohttp.ClientSession):
         return Fetcher(session)
 
-    def get_extractor(self, fetcher: BaseFetcher, username: str):
-        return Extractor(fetcher, username)
+    def get_extractor(
+        self, fetcher: BaseFetcher, detector: LanguageDetector, username: str
+    ):
+        return Extractor(fetcher, detector, username)
diff --git a/app/database.py b/app/database.py
index b3efcf0..20c6bc7 100644
--- a/app/database.py
+++ b/app/database.py
@@ -4,7 +4,8 @@ from typing import List, Literal
 
 from typing_extensions import TypedDict
 
-from app.types import Site, Title, code_to_lang
+from app.locale import Locale, locale_to_str, native_to_locale
+from app.types import Site, Title
 
 SCHEMA_NAME = "coach_scraper"
 MAIN_TABLE_NAME = "export"
@@ -41,7 +42,7 @@ class Row(TypedDict, total=False):
     # The FIDE title assigned to the coach on the source siste.
     title: Title
     # The list of languages the coach is fluent in.
-    languages: List[str]
+    languages: List[Locale]
     # Rapid rating relative to the site they were sourced from.
     rapid: int
     # Blitz rating relative to the site they were sourced from.
@@ -55,7 +56,7 @@ def load_languages(conn):
     cursor = None
     try:
         cursor = conn.cursor()
-        for pos, (code, name) in enumerate(list(code_to_lang.items())):
+        for pos, (name, loc) in enumerate(list(native_to_locale.items())):
             cursor.execute(
                 f"""
                 INSERT INTO {SCHEMA_NAME}.{LANG_TABLE_NAME}
@@ -67,7 +68,7 @@ def load_languages(conn):
                 DO UPDATE SET
                   name = EXCLUDED.name;
                 """,
-                [code, name, pos],
+                [locale_to_str(loc), name, pos],
             )
         conn.commit()
     finally:
@@ -157,7 +158,7 @@ def upsert_row(conn, row: Row):
                 row.get("name"),
                 row.get("image_url"),
                 row["title"].value if "title" in row else None,
-                row.get("languages", []),
+                list(map(locale_to_str, row.get("languages", []))),
                 row.get("rapid"),
                 row.get("blitz"),
                 row.get("bullet"),
diff --git a/app/lichess.py b/app/lichess.py
index d151793..b4f7480 100644
--- a/app/lichess.py
+++ b/app/lichess.py
@@ -5,11 +5,13 @@ from typing import List
 
 import aiohttp
 from bs4 import BeautifulSoup, SoupStrainer, Tag
+from lingua import LanguageDetector
 
+from app.locale import Locale, native_to_locale
 from app.pipeline import Extractor as BaseExtractor
 from app.pipeline import Fetcher as BaseFetcher
 from app.pipeline import Pipeline as BasePipeline
-from app.types import Site, Title, lang_to_code
+from app.types import Site, Title
 
 # The number of pages we will at most iterate through. This number was
 # determined by going to https://lichess.org/coach/all/all/alphabetical
@@ -113,8 +115,8 @@ def _stats_filter(elem: Tag | str | None, attrs={}) -> bool:
 
 
 class Extractor(BaseExtractor):
-    def __init__(self, fetcher: BaseFetcher, username: str):
-        super().__init__(fetcher, username)
+    def __init__(self, fetcher: BaseFetcher, detector: LanguageDetector, username: str):
+        super().__init__(fetcher, detector, username)
 
         self.profile_soup = None
         try:
@@ -175,7 +177,7 @@ class Extractor(BaseExtractor):
         except ValueError:
             return None
 
-    def get_languages(self) -> List[str] | None:
+    def get_languages(self) -> List[Locale] | None:
         if self.profile_soup is None:
             return None
         tr = self.profile_soup.find("tr", class_="languages")
@@ -187,8 +189,8 @@ class Extractor(BaseExtractor):
 
         codes = []
         for lang in [s.strip() for s in tr.get_text().split(",")]:
-            if lang in lang_to_code:
-                codes.append(lang_to_code[lang])
+            if lang in native_to_locale:
+                codes.append(native_to_locale[lang])
         return codes
 
     def get_rapid(self) -> int | None:
@@ -225,5 +227,7 @@ class Pipeline(BasePipeline):
     def get_fetcher(self, session: aiohttp.ClientSession):
         return Fetcher(session)
 
-    def get_extractor(self, fetcher: BaseFetcher, username: str):
-        return Extractor(fetcher, username)
+    def get_extractor(
+        self, fetcher: BaseFetcher, detector: LanguageDetector, username: str
+    ):
+        return Extractor(fetcher, detector, username)
diff --git a/app/locale.py b/app/locale.py
new file mode 100644
index 0000000..7cf1a2a
--- /dev/null
+++ b/app/locale.py
@@ -0,0 +1,154 @@
+import enum
+from collections import OrderedDict
+from typing import Dict
+
+from lingua import Language
+
+
+class Locale(enum.Enum):
+    """Maps {language}_{country} to the name of the language in said language."""
+
+    en_GB = "English"
+    af_ZA = "Afrikaans"
+    an_ES = "Aragonés"
+    ar_SA = "العربية"
+    as_IN = "অসমীয়া"
+    av_DA = "авар мацӀ"
+    az_AZ = "Azərbaycanca"
+    be_BY = "Беларуская"
+    bg_BG = "български език"
+    bn_BD = "বাংলা"
+    br_FR = "Brezhoneg"
+    bs_BA = "Bosanski"
+    ca_ES = "Català, valencià"
+    ckb_IR = "کوردی سۆرانی"
+    co_FR = "Corsu"
+    cs_CZ = "Čeština"
+    cv_CU = "чӑваш чӗлхи"
+    cy_GB = "Cymraeg"
+    da_DK = "Dansk"
+    de_DE = "Deutsch"
+    el_GR = "Ελληνικά"
+    en_US = "English (US)"
+    eo_UY = "Esperanto"
+    es_ES = "Español"
+    et_EE = "Eesti keel"
+    eu_ES = "Euskara"
+    fa_IR = "فارسی"
+    fi_FI = "Suomen kieli"
+    fo_FO = "Føroyskt"
+    fr_FR = "Français"
+    frp_IT = "Arpitan"
+    fy_NL = "Frysk"
+    ga_IE = "Gaeilge"
+    gd_GB = "Gàidhlig"
+    gl_ES = "Galego"
+    gsw_CH = "Schwizerdütsch"
+    gu_IN = "ગુજરાતી"
+    he_IL = "עִבְרִית"
+    hi_IN = "हिन्दी, हिंदी"
+    hr_HR = "Hrvatski"
+    hu_HU = "Magyar"
+    hy_AM = "Հայերեն"
+    ia_IA = "Interlingua"
+    id_ID = "Bahasa Indonesia"
+    io_EN = "Ido"
+    is_IS = "Íslenska"
+    it_IT = "Italiano"
+    ja_JP = "日本語"
+    jbo_EN = "Lojban"
+    jv_ID = "Basa Jawa"
+    ka_GE = "ქართული"
+    kab_DZ = "Taqvaylit"
+    kk_KZ = "қазақша"
+    kmr_TR = "Kurdî (Kurmancî)"
+    kn_IN = "ಕನ್ನಡ"
+    ko_KR = "한국어"
+    ky_KG = "кыргызча"
+    la_LA = "Lingua Latina"
+    lb_LU = "Lëtzebuergesch"
+    lt_LT = "Lietuvių kalba"
+    lv_LV = "Latviešu valoda"
+    mg_MG = "Fiteny malagasy"
+    mk_MK = "македонски јази"
+    ml_IN = "മലയാളം"
+    mn_MN = "монгол"
+    mr_IN = "मराठी"
+    ms_MY = "Melayu"
+    nb_NO = "Norsk bokmål"
+    ne_NP = "नेपाली"
+    nl_NL = "Nederlands"
+    nn_NO = "Norsk nynorsk"
+    pi_IN = "पालि"
+    pl_PL = "Polski"
+    ps_AF = "پښتو"
+    pt_PT = "Português"
+    pt_BR = "Português (BR)"
+    ro_RO = "Română"
+    ru_RU = "русский язык"
+    ry_UA = "Русинська бисїда"
+    sa_IN = "संस्कृत"
+    sk_SK = "Slovenčina"
+    sl_SI = "Slovenščina"
+    sq_AL = "Shqip"
+    sr_SP = "Српски језик"
+    sv_SE = "Svenska"
+    sw_KE = "Kiswahili"
+    ta_IN = "தமிழ்"
+    tg_TJ = "тоҷикӣ"
+    th_TH = "ไทย"
+    tk_TM = "Türkmençe"
+    tl_PH = "Tagalog"
+    tp_TP = "Toki pona"
+    tr_TR = "Türkçe"
+    uk_UA = "українська"
+    ur_PK = "اُردُو"
+    uz_UZ = "oʻzbekcha"
+    vi_VN = "Tiếng Việt"
+    yo_NG = "Yorùbá"
+    zh_CN = "中文"
+    zh_TW = "繁體中文"
+    zu_ZA = "isiZulu"
+
+
+def locale_to_str(loc: Locale) -> str:
+    return loc.name.replace("_", "-")
+
+
+# Uses the name of the language (in said language) as the key.
+native_to_locale: OrderedDict[str, Locale] = OrderedDict(
+    [(loc.value, loc) for loc in Locale]
+)
+
+# Uses an inferred/detected language as the key. Mapping was manually created
+# using https://github.com/pemistahl/lingua-rs/blob/main/src/isocode.rs#L40 as
+# a reference. Only a subset is mapped; e.g. Language.JAPANESE has no entry yet.
+lang_to_locale: Dict[Language, Locale] = {
+    Language.CHINESE: Locale.zh_CN,
+    Language.CROATIAN: Locale.hr_HR,
+    Language.DANISH: Locale.da_DK,
+    Language.DUTCH: Locale.nl_NL,
+    Language.ENGLISH: Locale.en_GB,
+    Language.FINNISH: Locale.fi_FI,
+    Language.FRENCH: Locale.fr_FR,
+    Language.GERMAN: Locale.de_DE,
+    Language.HUNGARIAN: Locale.hu_HU,
+    Language.ITALIAN: Locale.it_IT,
+    Language.KOREAN: Locale.ko_KR,
+    Language.LATIN: Locale.la_LA,
+    Language.MALAY: Locale.ms_MY,
+    Language.PERSIAN: Locale.fa_IR,
+    Language.POLISH: Locale.pl_PL,
+    Language.PORTUGUESE: Locale.pt_PT,
+    Language.ROMANIAN: Locale.ro_RO,
+    Language.RUSSIAN: Locale.ru_RU,
+    Language.SLOVENE: Locale.sl_SI,
+    Language.SPANISH: Locale.es_ES,
+    Language.SWAHILI: Locale.sw_KE,
+    Language.SWEDISH: Locale.sv_SE,
+    Language.TAGALOG: Locale.tl_PH,
+    Language.TURKISH: Locale.tr_TR,
+    Language.UKRAINIAN: Locale.uk_UA,
+    Language.VIETNAMESE: Locale.vi_VN,
+    Language.YORUBA: Locale.yo_NG,
+}
diff --git a/app/pipeline.py b/app/pipeline.py
index 96ba96f..7ea9cce 100644
--- a/app/pipeline.py
+++ b/app/pipeline.py
@@ -3,8 +3,10 @@ import os.path
 from typing import Any, List, Tuple
 
 import aiohttp
+from lingua import LanguageDetector
 
 from app.database import Row, RowKey, upsert_row
+from app.locale import Locale
 from app.types import Site, Title
 
 
@@ -94,8 +96,9 @@ def _insert(row: Row, key: RowKey, value: Any):
 
 
 class Extractor:
-    def __init__(self, fetcher: Fetcher, username: str):
+    def __init__(self, fetcher: Fetcher, detector: LanguageDetector, username: str):
         self.fetcher = fetcher
+        self.detector = detector
         self.username = username
 
     def get_name(self) -> str | None:
@@ -107,7 +110,7 @@ class Extractor:
     def get_title(self) -> Title | None:
         raise NotImplementedError()
 
-    def get_languages(self) -> List[str] | None:
+    def get_languages(self) -> List[Locale] | None:
         raise NotImplementedError()
 
     def get_rapid(self) -> int | None:
@@ -157,10 +160,14 @@ class Pipeline:
     def get_fetcher(self, session: aiohttp.ClientSession) -> Fetcher:
         raise NotImplementedError()
 
-    def get_extractor(self, fetcher: Fetcher, username: str) -> Extractor:
+    def get_extractor(
+        self, fetcher: Fetcher, detector: LanguageDetector, username: str
+    ) -> Extractor:
         raise NotImplementedError()
 
-    async def process(self, conn, session: aiohttp.ClientSession):
+    async def process(
+        self, conn, detector: LanguageDetector, session: aiohttp.ClientSession
+    ):
         fetcher = self.get_fetcher(session)
 
         queue: asyncio.Queue = asyncio.Queue()
@@ -180,7 +187,7 @@ class Pipeline:
             page_no += 1
             for username in usernames or []:
                 await fetcher._download_user_files(username)
-                extractor = self.get_extractor(fetcher, username)
+                extractor = self.get_extractor(fetcher, detector, username)
                 queue.put_nowait((conn, extractor))
 
         # Wait until the queue is fully processed.
diff --git a/app/types.py b/app/types.py
index a53797b..e9410bd 100644
--- a/app/types.py
+++ b/app/types.py
@@ -1,5 +1,4 @@
 import enum
-from collections import OrderedDict
 
 
 class Site(enum.Enum):
@@ -18,115 +17,3 @@ class Title(enum.Enum):
     WFM = "WFM"
     WCM = "WCM"
     WNM = "WNM"
-
-
-class Language(enum.Enum):
-    en_GB = "English"
-    af_ZA = "Afrikaans"
-    an_ES = "Aragonés"
-    ar_SA = "العربية"
-    as_IN = "অসমীয়া"
-    av_DA = "авар мацӀ"
-    az_AZ = "Azərbaycanca"
-    be_BY = "Беларуская"
-    bg_BG = "български език"
-    bn_BD = "বাংলা"
-    br_FR = "Brezhoneg"
-    bs_BA = "Bosanski"
-    ca_ES = "Català, valencià"
-    ckb_IR = "کوردی سۆرانی"
-    co_FR = "Corsu"
-    cs_CZ = "Čeština"
-    cv_CU = "чӑваш чӗлхи"
-    cy_GB = "Cymraeg"
-    da_DK = "Dansk"
-    de_DE = "Deutsch"
-    el_GR = "Ελληνικά"
-    en_US = "English (US)"
-    eo_UY = "Esperanto"
-    es_ES = "Español"
-    et_EE = "Eesti keel"
-    eu_ES = "Euskara"
-    fa_IR = "فارسی"
-    fi_FI = "Suomen kieli"
-    fo_FO = "Føroyskt"
-    fr_FR = "Français"
-    frp_IT = "Arpitan"
-    fy_NL = "Frysk"
-    ga_IE = "Gaeilge"
-    gd_GB = "Gàidhlig"
-    gl_ES = "Galego"
-    gsw_CH = "Schwizerdütsch"
-    gu_IN = "ગુજરાતી"
-    he_IL = "עִבְרִית"
-    hi_IN = "हिन्दी, हिंदी"
-    hr_HR = "Hrvatski"
-    hu_HU = "Magyar"
-    hy_AM = "Հայերեն"
-    ia_IA = "Interlingua"
-    id_ID = "Bahasa Indonesia"
-    io_EN = "Ido"
-    is_IS = "Íslenska"
-    it_IT = "Italiano"
-    ja_JP = "日本語"
-    jbo_EN = "Lojban"
-    jv_ID = "Basa Jawa"
-    ka_GE = "ქართული"
-    kab_DZ = "Taqvaylit"
-    kk_KZ = "қазақша"
-    kmr_TR = "Kurdî (Kurmancî)"
-    kn_IN = "ಕನ್ನಡ"
-    ko_KR = "한국어"
-    ky_KG = "кыргызча"
-    la_LA = "Lingua Latina"
-    lb_LU = "Lëtzebuergesch"
-    lt_LT = "Lietuvių kalba"
-    lv_LV = "Latviešu valoda"
-    mg_MG = "Fiteny malagasy"
-    mk_MK = "македонски јази"
-    ml_IN = "മലയാളം"
-    mn_MN = "монгол"
-    mr_IN = "मराठी"
-    nb_NO = "Norsk bokmål"
-    ne_NP = "नेपाली"
-    nl_NL = "Nederlands"
-    nn_NO = "Norsk nynorsk"
-    pi_IN = "पालि"
-    pl_PL = "Polski"
-    ps_AF = "پښتو"
-    pt_PT = "Português"
-    pt_BR = "Português (BR)"
-    ro_RO = "Română"
-    ru_RU = "русский язык"
-    ry_UA = "Русинська бисїда"
-    sa_IN = "संस्कृत"
-    sk_SK = "Slovenčina"
-    sl_SI = "Slovenščina"
-    sq_AL = "Shqip"
-    sr_SP = "Српски језик"
-    sv_SE = "Svenska"
-    sw_KE = "Kiswahili"
-    ta_IN = "தமிழ்"
-    tg_TJ = "тоҷикӣ"
-    th_TH = "ไทย"
-    tk_TM = "Türkmençe"
-    tl_PH = "Tagalog"
-    tp_TP = "Toki pona"
-    tr_TR = "Türkçe"
-    uk_UA = "українська"
-    ur_PK = "اُردُو"
-    uz_UZ = "oʻzbekcha"
-    vi_VN = "Tiếng Việt"
-    yo_NG = "Yorùbá"
-    zh_CN = "中文"
-    zh_TW = "繁體中文"
-    zu_ZA = "isiZulu"
-
-
-code_to_lang = OrderedDict(
-    [(lang.name.replace("_", "-"), lang.value) for lang in Language]
-)
-
-lang_to_code = OrderedDict(
-    [(lang.value, lang.name.replace("_", "-")) for lang in Language]
-)
diff --git a/poetry.lock b/poetry.lock
index a8abcc9..8e3c65a 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -226,6 +226,68 @@ files = [
     {file = "idna-3.6.tar.gz", hash = "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca"},
 ]
 
+[[package]]
+name = "lingua-language-detector"
+version = "2.0.1"
+description = "An accurate natural language detection library, suitable for short text and mixed-language text"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "lingua_language_detector-2.0.1-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:6a8d473de427e7eff54a5eb68fd38d75053dfc7b59e256a7233cc7409435d8ce"},
+    {file = "lingua_language_detector-2.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1b4d67769fe25d903dbc41a63a5d1707913ddd3e574b072c84001cef8472ead0"},
+    {file = "lingua_language_detector-2.0.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:bcb9c8d273637ca69fa242184c8525c4bc56075d4b174d4adc68d81b11b814be"},
+    {file = "lingua_language_detector-2.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0c42e66f3b20e97f4e4b442b3f2977b5aefb04968535fc0a78ccd4137db5ef34"},
+    {file = "lingua_language_detector-2.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9563cc878cf9b45d78d9effe9b3b101cb3098b932f2921023e92f984bd5f5120"},
+    {file = "lingua_language_detector-2.0.1-cp310-none-win32.whl", hash = "sha256:2cb95b8e8abb40703b0705321cd3394033812812bc18d559a9a26604b241a663"},
+    {file = "lingua_language_detector-2.0.1-cp310-none-win_amd64.whl", hash = "sha256:b89d995ac9974b9190036585cbd0b70e6117a2d09f2150a898b332abd1a57636"},
+    {file = "lingua_language_detector-2.0.1-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:06db94dbb5b492924a536dbf97c247b228a3fcb00fe5bef9ca83b6b1aa959ca8"},
+    {file = "lingua_language_detector-2.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4d9dcf69a304819d5d2716943de980cccf140168e7d0243925bb98dd0c661600"},
+    {file = "lingua_language_detector-2.0.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:7f66ed5cd572d07a1a853017f41bfd94e84e3081cc39690188adfa97337b199f"},
+    {file = "lingua_language_detector-2.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e89c39a2a324d265c44f8ee7c3ffc499506d6307bb484ab1d9565f2d5857697b"},
+    {file = "lingua_language_detector-2.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ae66cae403e36134558e929f8ba0d076be45e968f9fc7bab63869b19b34ddac1"},
+    {file = "lingua_language_detector-2.0.1-cp311-none-win32.whl", hash = "sha256:d92f1e0b30233dc1091cb28fe00e0dba8255be609b2337c0dab7f71a2f9b5086"},
+    {file = "lingua_language_detector-2.0.1-cp311-none-win_amd64.whl", hash = "sha256:60c1d0136c242097c58874a74d55b26e0c98ed81e44724d6426411b4bf585566"},
+    {file = "lingua_language_detector-2.0.1-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:5531fbc6881149ce905e511e278ac97ed0e92d64b91b99910204058abe057769"},
+    {file = "lingua_language_detector-2.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6193a0eac1534593427548d03cbcd82bcac040b3344a2abe67654c15a023c196"},
+    {file = "lingua_language_detector-2.0.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:32644b03f107b9cee4a0884096acc21f1900dbec7951ede7c154d19d2a1a6f04"},
+    {file = "lingua_language_detector-2.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c9890f5227a7a767253248dd64fb651e4405256d8376f39216a6ff6e2702a0ee"},
+    {file = "lingua_language_detector-2.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d286e14105794300217513326e2d8e82911c5efe773c6a37991336fbd63f9e2"},
+    {file = "lingua_language_detector-2.0.1-cp312-none-win32.whl", hash = "sha256:ec88318a16467fea3457208ff1052dbd72cc486f9f07caeb4325fa76cab9044c"},
+    {file = "lingua_language_detector-2.0.1-cp312-none-win_amd64.whl", hash = "sha256:5fcd53f1a2dc84a00a79c56ca59e4580cfbbb829e5e56249835d31444cf1f9ea"},
+    {file = "lingua_language_detector-2.0.1-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:8efa10f188c40e10016e4f0d2a34d743e1555ddf4bd47553f6dd420f673c0e78"},
+    {file = "lingua_language_detector-2.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ca3f11f372658cda18b309998596de6ffceaf6461e376da9c2861ac9d8b7efa3"},
+    {file = "lingua_language_detector-2.0.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:5599752ded097132d2a59b2764287240e72e2e9859bb69f43b2957f12d69ac6f"},
+    {file = "lingua_language_detector-2.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27c1e422e5355d6931f82a1da52f822048be68c5f74d8b997c7d9f9617002e6a"},
+    {file = "lingua_language_detector-2.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e489cb648684c5bb5a0d6cab2f602fe0bda7e72921f327ba7350e30c60edaa43"},
+    {file = "lingua_language_detector-2.0.1-cp38-none-win32.whl", hash = "sha256:5d9f7f4b47c5bde5ff85089bcc4625f2f1a17e7677ec15dadb272b6e4b42c274"},
+    {file = "lingua_language_detector-2.0.1-cp38-none-win_amd64.whl", hash = "sha256:493908b45ff237c8c776d1d2b688b113999667b841f284bae862af5f7f526b4f"},
+    {file = "lingua_language_detector-2.0.1-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:e0c269a6099e2a637e2b8a312870792be2bb047abc6e7646122e498a159cc0b4"},
+    {file = "lingua_language_detector-2.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7fcd61ec1c771f452346bc57d756fd079a89e9134b93ef94f5f166e27cda6385"},
+    {file = "lingua_language_detector-2.0.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:3728012ace5b8e799df27d1e9272396bc5829a609f1aa799109a22e07e1034c2"},
+    {file = "lingua_language_detector-2.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0619216bed8746501f0c3c4294ebe1bd55a2d9d72083e68dc0d954f9af5ab12e"},
+    {file = "lingua_language_detector-2.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cde1651655cac14a3b9ea9c3319a700de66a1f73cd07d63e9e653fc84d02f11e"},
+    {file = "lingua_language_detector-2.0.1-cp39-none-win32.whl", hash = "sha256:dc1cfcaea24d71d657cf75fb54b593d3db14cf3a19aef6cd1017d0982a407b4e"},
+    {file = "lingua_language_detector-2.0.1-cp39-none-win_amd64.whl", hash = "sha256:71d0662458e025aae7a6cbb0cc4e8169ea75e5b615a85e3310964748449896dd"},
+    {file = "lingua_language_detector-2.0.1-pp310-pypy310_pp73-macosx_10_7_x86_64.whl", hash = "sha256:42907729b8a343e91fd3d6e61b6add116c513354b6a88072245a4a1d21a18fb9"},
+    {file = "lingua_language_detector-2.0.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:ed60cb9b9fead792ec97544a3888e7aeda2ae34503c6252a92df599e006d7253"},
+    {file = "lingua_language_detector-2.0.1-pp310-pypy310_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:fa1a6ab7bec65695650e801e23648742e92651315cf1443e9002a1b88ea2ac41"},
+    {file = "lingua_language_detector-2.0.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be1bf4faa1e645876c8dfc29b37f8f567451b48d43f45c916aba13b946d5069c"},
+    {file = "lingua_language_detector-2.0.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31c7e52f4e0ca47b787fb5e05560a9e3d7a6bc10488a35a31248f82647314957"},
+    {file = "lingua_language_detector-2.0.1-pp38-pypy38_pp73-macosx_10_7_x86_64.whl", hash = "sha256:c91fedd566b07ac1dc6c091bde6d69dae5c12c90e3e3c1d74fe29f76852e775a"},
+    {file = "lingua_language_detector-2.0.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:16f5fecb6eba86bc094a734ba4bd0603b5bcc580a70c07d659ac2aec14f018ac"},
+    {file = "lingua_language_detector-2.0.1-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:f21abb598efa1de408275e640a22e8b967a43a9fbb0f32302a206efabf3ca0bc"},
+    {file = "lingua_language_detector-2.0.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8906262b3866ae4473b5d5f08703addf0b6b12bc9c9aefcf2e2d855c1496d47a"},
+    {file = "lingua_language_detector-2.0.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc9a6d87e766bf6d2d014986fda13fb8aa6c5602811e856f5555dd8128bd4f2e"},
+    {file = "lingua_language_detector-2.0.1-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:650a6df0c1c7c7d650c9872be1f8e4b6ba32ff363d8184f60ee80441cffad779"},
+    {file = "lingua_language_detector-2.0.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:62d7e29b9e85289bdf80a02a6ef9fcc7b943a5d123eaafa313aad4cfe7b48767"},
+    {file = "lingua_language_detector-2.0.1-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:ee213e9c0c4a256c2bbe2c1f074c9ac122073045049b6a7f999b507da185dea3"},
+    {file = "lingua_language_detector-2.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:06787b555ad9ddae613c0850b2aff991ea3e87c1ef714166d9691f90d1ad366c"},
+    {file = "lingua_language_detector-2.0.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d983db8f8f0afbf2b4a796e6c8790490868c024492e59ec8b2fca30599c84760"},
+]
+
+[package.extras]
+test = ["pytest (==7.4.3)"]
+
 [[package]]
 name = "lxml"
 version = "4.9.3"
@@ -555,4 +617,4 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "4a667c2246018c259f23d7fe07102b3b680693beccbc685467b1e29c2c3e2db6"
+content-hash = "d6e80d9fee5ef164d85b7b32db698755af31847470468e3c35d655d5f44f95b4"
diff --git a/pyproject.toml b/pyproject.toml
index 6ea0d68..a162dbd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,6 +11,7 @@ beautifulsoup4 = "^4.12.2"
 aiohttp = "^3.8.6"
 lxml = "^4.9.3"
 psycopg2 = "^2.9.9"
+lingua-language-detector = "^2.0.1"
 
 [build-system]
 requires = ["poetry-core"]
@@ -20,5 +21,5 @@ build-backend = "poetry.core.masonry.api"
 app = "app.__main__:main"
 
 [[tool.mypy.overrides]]
-module = "aiohttp"
+module = ["aiohttp", "lingua"]
 ignore_missing_imports = true