coach-scraper/coach_scraper/database.py

177 lines
4.9 KiB
Python
Raw Permalink Normal View History

2023-12-07 15:08:53 +00:00
import random
import sys
from datetime import datetime
from typing import List, Literal
import psycopg2
from typing_extensions import TypedDict
2023-12-09 00:48:54 +00:00
from coach_scraper.locale import Locale, locale_to_str, native_to_locale
from coach_scraper.types import Site, Title
2023-12-05 19:54:12 +00:00
SCHEMA_NAME = "coach_scraper"
2023-12-05 22:15:42 +00:00
MAIN_TABLE_NAME = "export"
LANG_TABLE_NAME = "languages"
2023-12-05 19:54:12 +00:00
RowKey = (
Literal["site"]
| Literal["username"]
| Literal["name"]
| Literal["image_url"]
2023-12-07 02:52:40 +00:00
| Literal["title"]
| Literal["languages"]
2023-12-05 19:54:12 +00:00
| Literal["rapid"]
| Literal["blitz"]
| Literal["bullet"]
)
class Row(TypedDict, total=False):
"""Representation of a row of the export table.
The (site, username) make up a unique key for each coach.
"""
# Website the given coach was sourced from.
2023-12-05 19:54:12 +00:00
site: Site
# Username used on the source site.
username: str
# Real name.
name: str
# Profile image used on the source site.
image_url: str
2023-12-07 02:52:40 +00:00
# The FIDE title assigned to the coach on the source siste.
title: Title
# The list of languages the coach is fluent in.
languages: List[Locale]
# Rapid rating relative to the site they were sourced from.
rapid: int
# Blitz rating relative to the site they were sourced from.
blitz: int
# Bullet rating relative to the site they were sourced from.
bullet: int
def load_languages(conn: psycopg2._psycopg.connection):
2023-12-05 22:15:42 +00:00
"""Load all known languages into the languages table."""
cursor = None
try:
cursor = conn.cursor()
for pos, (name, loc) in enumerate(list(native_to_locale.items())):
2023-12-05 22:15:42 +00:00
cursor.execute(
f"""
INSERT INTO {SCHEMA_NAME}.{LANG_TABLE_NAME}
2023-12-05 23:06:04 +00:00
(code, name, pos)
2023-12-05 22:15:42 +00:00
VALUES
2023-12-05 23:06:04 +00:00
(%s, %s, %s)
2023-12-05 22:15:42 +00:00
ON CONFLICT
(code)
DO UPDATE SET
name = EXCLUDED.name;
""",
[locale_to_str(loc), name, pos],
2023-12-05 22:15:42 +00:00
)
conn.commit()
finally:
if cursor:
cursor.close()
def backup_database(conn: psycopg2._psycopg.connection):
"""Creates a backup of the export table.
Simply copies the table at time of invocation into another table with a
`_%t` suffix, where %t denotes the number of seconds since the Unix epoch.
"""
cursor = None
try:
cursor = conn.cursor()
timestamp = int((datetime.now() - datetime(1970, 1, 1)).total_seconds())
2023-12-05 22:15:42 +00:00
for table_name in [MAIN_TABLE_NAME, LANG_TABLE_NAME]:
cursor.execute(
f"""
SELECT 1
FROM information_schema.tables
WHERE table_schema = '{SCHEMA_NAME}'
AND table_name = '{table_name}';
"""
)
result = cursor.fetchone()
if result is None:
print(f"Missing `{SCHEMA_NAME}.{table_name}` table.", file=sys.stderr)
sys.exit(1)
cursor.execute(
f"""
CREATE TABLE {SCHEMA_NAME}.{table_name}_{timestamp}
AS TABLE {SCHEMA_NAME}.{table_name}
"""
)
finally:
if cursor:
cursor.close()
def upsert_row(conn: psycopg2._psycopg.connection, row: Row):
"""Upsert the specified `Row` into the database table."""
cursor = None
try:
cursor = conn.cursor()
cursor.execute(
f"""
2023-12-05 22:15:42 +00:00
INSERT INTO {SCHEMA_NAME}.{MAIN_TABLE_NAME}
( site
, username
, name
, image_url
2023-12-07 02:52:40 +00:00
, title
, languages
, rapid
, blitz
, bullet
2023-12-07 15:08:53 +00:00
, position
)
VALUES
( %s
, %s
, %s
, %s
, %s
, %s
, %s
, %s
2023-12-07 02:52:40 +00:00
, %s
2023-12-07 15:08:53 +00:00
, %s
)
ON CONFLICT
(site, username)
DO UPDATE SET
name = EXCLUDED.name,
image_url = EXCLUDED.image_url,
2023-12-07 02:52:40 +00:00
title = EXCLUDED.title,
languages = EXCLUDED.languages,
rapid = EXCLUDED.rapid,
blitz = EXCLUDED.blitz,
2023-12-07 15:08:53 +00:00
bullet = EXCLUDED.bullet,
position = EXCLUDED.position;
""",
[
row["site"].value,
row["username"],
row.get("name"),
row.get("image_url"),
2023-12-07 02:52:40 +00:00
row["title"].value if "title" in row else None,
list(map(locale_to_str, row.get("languages", []))),
row.get("rapid"),
row.get("blitz"),
row.get("bullet"),
2023-12-07 15:08:53 +00:00
random.randint(0, 1000000),
],
)
conn.commit()
finally:
2023-12-05 19:54:12 +00:00
if cursor:
cursor.close()