Transition to a CSV; Postgres can handle that better.
parent ec94a16140
commit 63764a22c4

README.md (27 lines changed)
@@ -29,17 +29,19 @@ data
├── ...
```

-## Usage
+## Quickstart

If you have nix available, run:
```bash
-$ nix run . -- --user-agent <your-email> -s <site>
+$ nix run . -- --user-agent <your-email> -s <site> [-s <site> ...]
```
If not, ensure you have [poetry](https://python-poetry.org/) on your machine and
instead run the following:
```bash
-$ poetry run python3 -m app -u <your-email> -s <site>
+$ poetry run python3 -m app -u <your-email> -s <site> [-s <site> ...]
```
+After running (this may take several hours), a new CSV will be generated at
+`data/export.csv` containing all scraped content from the specified `<site>`s.
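For reference, each row of `data/export.csv` follows the column order the exporter writes (site, username, name, image_url, rapid, blitz, bullet); a purely hypothetical row might look like:

```
chesscom,examplecoach,Example Coach,https://images.chesscomfiles.com/...,1850,1790,1720
```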

## Database

@@ -70,13 +72,16 @@ To load all exported coach data into a local Postgres instance, use the provided
```bash
$ psql -h @scraper -f sql/init.sql
```
-Next, concatenate all exported content and dump into the newly created table:
+Next, dump exported data into the newly created table:
```bash
-$ cat data/{chesscom,lichess}/export.json > data/export.json
-$ psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.json'"
+$ psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.csv'"
```
Re-running the `sql/export.sql` script will create a backup of the
`coach_scraper.export` table. It will then upsert the scraped data. You can view
all backups from the `psql` console like so:
```
postgres=# \dt coach_scraper.export*
```
Re-running will automatically create backups and replace the coach data found
in `coach_scraper.export`.

### E2E
@@ -85,10 +90,8 @@ necessary to scrape coach data from our chess website and dump the results into
the database in one fell swoop. Assuming our database is open with a socket
connection available at `@scraper`:
```bash
-nix run . -- --user-agent <your-email> -s chesscom -s lichess
-cat data/{chesscom,lichess}/export.json > data/export.json
-psql -h @scraper -f sql/init.sql
-psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.json'"
+$ nix run . -- --user-agent <your-email> -s chesscom -s lichess
+$ psql -h @scraper -f sql/init.sql -f sql/export.sql -v export="'$PWD/data/export.csv'"
```
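To sanity-check the load afterwards, a quick count against the new table should work (again assuming the `@scraper` socket):

```bash
$ psql -h @scraper -c 'SELECT COUNT(*) FROM coach_scraper.export;'
```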

## Development

@@ -1,5 +1,6 @@
import argparse
import asyncio
+import csv
import json

import aiohttp
@@ -32,6 +33,8 @@ async def run():
    async with aiohttp.ClientSession(
        headers={"User-Agent": f"BoardWise coach-scraper ({args.user_agent})"}
    ) as session:
+        with open("data/export.csv", "w") as f:
+            writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
            for site in set(args.site):
                scraper, exporter_cls = None, None
@@ -42,15 +45,22 @@ async def run():
                    scraper = LichessScraper(session)
                    exporter_cls = LichessExporter

-            # Write out each coach data into NDJSON file.
-            dump = []
                usernames = await scraper.scrape()
                for username in usernames:
                    export = exporter_cls(username).export()
-                dump.append(f"{json.dumps(export)}\n")
-
-            with open(scraper.path_site_file("export.json"), "w") as f:
-                f.writelines(dump)
+                    writer.writerow(
+                        [
+                            # This should match the order data is loaded in the
+                            # sql/export.sql script.
+                            export["site"],
+                            export["username"],
+                            export.get("name", ""),
+                            export.get("image_url", ""),
+                            export.get("rapid", ""),
+                            export.get("blitz", ""),
+                            export.get("bullet", ""),
+                        ]
+                    )


def main():
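A note on the `csv.QUOTE_MINIMAL` choice in the hunk above: it only quotes fields that contain the delimiter, the quote character, or a newline, which keeps the output compatible with Postgres' `COPY ... WITH (FORMAT CSV)`. A minimal illustration, with made-up values:

```python
import csv
import io

buf = io.StringIO()
writer = csv.writer(buf, quoting=csv.QUOTE_MINIMAL)
# Only the field containing a comma gets wrapped in quotes.
writer.writerow(["lichess", "examplecoach", "Doe, Jane", "", 2100])
print(buf.getvalue(), end="")  # lichess,examplecoach,"Doe, Jane",,2100
```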

@@ -5,7 +5,7 @@ import os.path
from typing import List, Union

import aiohttp
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, SoupStrainer

from app.exporter import BaseExporter
from app.repo import AnsiColor, Site
@@ -172,10 +172,27 @@ class Scraper(BaseScraper):
        return True


+def _profile_filter(elem, attrs):
+    """Includes only relevant segments of the `{username}.html` file."""
+    if "profile-header-info" in attrs.get("class", ""):
+        return True
+    if "profile-card-info" in attrs.get("class", ""):
+        return True
+
+
class Exporter(BaseExporter):
    def __init__(self, username: str):
        super().__init__(site=Site.CHESSCOM.value, username=username)

+        self.profile_soup = None
+        try:
+            with open(self.path_coach_file(username, f"{username}.html"), "r") as f:
+                self.profile_soup = BeautifulSoup(
+                    f.read(), "lxml", parse_only=SoupStrainer(_profile_filter)
+                )
+        except FileNotFoundError:
+            pass
+
        self.stats_json = {}
        try:
            with open(self.path_coach_file(username, "stats.json"), "r") as f:
@@ -185,6 +202,22 @@ class Exporter(BaseExporter):
        except FileNotFoundError:
            pass

+    def export_name(self) -> Union[str, None]:
+        try:
+            name = self.profile_soup.find("div", class_="profile-card-name")
+            return name.get_text().strip()
+        except AttributeError:
+            return None
+
+    def export_image_url(self) -> Union[str, None]:
+        try:
+            div = self.profile_soup.find("div", class_="profile-header-avatar")
+            src = div.find("img").get("src", "")
+            if "images.chesscomfiles.com" in src:
+                return src
+        except AttributeError:
+            return None
+
    def export_rapid(self) -> Union[int, None]:
        return self.stats_json.get("rapid", {}).get("rating")

@@ -1,4 +1,4 @@
-from typing import Union
+from typing import Any, Union

from typing_extensions import TypedDict
@@ -6,6 +6,13 @@ from app.repo import AnsiColor, Repo


class Export(TypedDict, total=False):
+    # The (site, username) make up a unique key for each coach.
+    site: str
+    username: str
+    # The coach's real name.
+    name: str
+    # The profile image used on the source site.
+    image_url: str
    # The coach's rapid rating relative to the site they were sourced from.
    rapid: int
    # The coach's blitz rating relative to the site they were sourced from.
@@ -14,9 +21,8 @@ class Export(TypedDict, total=False):
    bullet: int


-def _insert(export: Export, key: str, value: any):
-    if value is None:
-        return
+def _insert(export: Export, key: str, value: Any):
+    if value is not None:
        export[key] = value
@@ -25,6 +31,12 @@ class BaseExporter(Repo):
        super().__init__(site)
        self.username = username

+    def export_name(self) -> Union[str, None]:
+        raise NotImplementedError()
+
+    def export_image_url(self) -> Union[str, None]:
+        raise NotImplementedError()
+
    def export_rapid(self) -> Union[int, None]:
        raise NotImplementedError()
@@ -40,6 +52,8 @@ class BaseExporter(Repo):

        _insert(export, "site", self.site)
        _insert(export, "username", self.username)
+        _insert(export, "name", self.export_name())
+        _insert(export, "image_url", self.export_image_url())
        _insert(export, "rapid", self.export_rapid())
        _insert(export, "blitz", self.export_blitz())
        _insert(export, "bullet", self.export_bullet())

@@ -1,7 +1,7 @@
import asyncio
import os
import os.path
-from typing import List
+from typing import List, Union

import aiohttp
from bs4 import BeautifulSoup, SoupStrainer
@@ -97,9 +97,9 @@ class Scraper(BaseScraper):
        soup = BeautifulSoup(response, "lxml")
        members = soup.find_all("article", class_="coach-widget")
        for member in members:
-            anchor = member.find("a", class_="overlay")
-            if anchor:
-                href = anchor.get("href")
+            a = member.find("a", class_="overlay")
+            if a:
+                href = a.get("href")
                username = href[len("/coach/") :]
                usernames.append(username)
@@ -163,8 +163,16 @@ class Scraper(BaseScraper):
        return True


+def _profile_filter(elem, attrs):
+    """Includes only relevant segments of the `{username}.html` file."""
+    if "coach-widget" in attrs.get("class", ""):
+        return True
+
+
def _stats_filter(elem, attrs):
    """Includes only relevant segments of the `stats.html` file."""
+    if "profile-side" in attrs.get("class", ""):
+        return True
    if "sub-ratings" in attrs.get("class", ""):
        return True
@@ -173,43 +181,59 @@ class Exporter(BaseExporter):
    def __init__(self, username: str):
        super().__init__(site=Site.LICHESS.value, username=username)

-        self.stats_soup = None
+        self.profile_soup = None
        try:
-            with open(self.path_coach_file(username, "stats.html"), "r") as f:
-                stats_strainer = SoupStrainer(_stats_filter)
-                self.stats_soup = BeautifulSoup(
-                    f.read(), "lxml", parse_only=stats_strainer
+            with open(self.path_coach_file(username, f"{username}.html"), "r") as f:
+                self.profile_soup = BeautifulSoup(
+                    f.read(), "lxml", parse_only=SoupStrainer(_profile_filter)
                )
        except FileNotFoundError:
            pass

-    def export_rapid(self):
+        self.stats_soup = None
+        try:
+            with open(self.path_coach_file(username, "stats.html"), "r") as f:
+                self.stats_soup = BeautifulSoup(
+                    f.read(), "lxml", parse_only=SoupStrainer(_stats_filter)
+                )
+        except FileNotFoundError:
+            pass
+
+    def export_name(self) -> Union[str, None]:
+        try:
+            profile_side = self.stats_soup.find("div", class_="profile-side")
+            user_infos = profile_side.find("div", class_="user-infos")
+            name = user_infos.find("strong", class_="name")
+            return name.get_text().strip()
+        except AttributeError:
+            return None
+
+    def export_image_url(self) -> Union[str, None]:
+        try:
+            picture = self.profile_soup.find("img", class_="picture")
+            src = picture.get("src", "")
+            if "image.lichess1.org" in src:
+                return src
+        except AttributeError:
+            return None
+
+    def export_rapid(self) -> Union[int, None]:
        return self._find_rating("rapid")

-    def export_blitz(self):
+    def export_blitz(self) -> Union[int, None]:
        return self._find_rating("blitz")

-    def export_bullet(self):
+    def export_bullet(self) -> Union[int, None]:
        return self._find_rating("bullet")

-    def _find_rating(self, name):
-        if self.stats_soup is None:
-            return None
-
-        anchor = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
-        if anchor is None:
-            return None
-        rating = anchor.find("rating")
-        if rating is None:
-            return None
+    def _find_rating(self, name) -> Union[int, None]:
+        try:
+            a = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
+            rating = a.find("rating")
            strong = rating.find("strong")
            if strong is None:
                return None
            value = strong.get_text()
            if value[-1] == "?":
                value = value[:-1]
-
-        try:
            return int(value)
-        except ValueError:
+        except (AttributeError, ValueError):
            return None

@@ -7,31 +7,47 @@ DO $$
END;
$$ LANGUAGE plpgsql;

-CREATE TEMPORARY TABLE pg_temp.coach_scraper_export (data JSONB);
+-- This should match the order data is written in the app/__main__.py
+-- script.
+CREATE TEMPORARY TABLE pg_temp.coach_scraper_export
+  ( site TEXT
+  , username TEXT
+  , name TEXT
+  , image_url TEXT
+  , rapid TEXT
+  , blitz TEXT
+  , bullet TEXT
+  );

SELECT format(
-  $$COPY pg_temp.coach_scraper_export (data) from %L$$,
+  $$COPY pg_temp.coach_scraper_export FROM %L WITH (FORMAT CSV)$$,
  :export
) \gexec

INSERT INTO coach_scraper.export
-  ( username
-  , site
+  ( site
+  , username
+  , name
+  , image_url
  , rapid
  , blitz
  , bullet
  )
SELECT
-  data->>'username',
-  data->>'site',
-  (data->>'rapid')::INT,
-  (data->>'blitz')::INT,
-  (data->>'bullet')::INT
+  site,
+  username,
+  name,
+  image_url,
+  rapid::INT,
+  blitz::INT,
+  bullet::INT
FROM
  pg_temp.coach_scraper_export
ON CONFLICT
  (site, username)
DO UPDATE SET
+  name = EXCLUDED.name,
+  image_url = EXCLUDED.image_url,
  rapid = EXCLUDED.rapid,
  blitz = EXCLUDED.blitz,
  bullet = EXCLUDED.bullet;
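For context on the `SELECT format(...) \gexec` step above: psql substitutes the `:export` variable (set on the command line via `-v export=...`) into the `COPY` template and executes the generated statement, which ends up looking roughly like the following (the path shown is hypothetical):

```sql
COPY pg_temp.coach_scraper_export FROM '/path/to/data/export.csv' WITH (FORMAT CSV);
```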
sql/init.sql (25 lines changed)

@@ -1,9 +1,13 @@
CREATE SCHEMA IF NOT EXISTS coach_scraper;

-CREATE TABLE IF NOT EXISTS coach_scraper.export
+DROP TABLE IF EXISTS coach_scraper.export;
+
+CREATE TABLE coach_scraper.export
  ( id SERIAL PRIMARY KEY
  , site VARCHAR(16) NOT NULL
  , username VARCHAR(255) NOT NULL
+  , name VARCHAR(255)
+  , image_url TEXT
  , rapid INT
  , blitz INT
  , bullet INT
@@ -15,22 +19,3 @@ ON
  coach_scraper.export
USING
  BTREE (site, username);
-
-DO $$
-BEGIN
-  IF NOT EXISTS (
-    SELECT 1
-    FROM information_schema.constraint_column_usage
-    WHERE table_schema = 'coach_scraper'
-    AND table_name = 'export'
-    AND constraint_name = 'site_username_unique'
-  ) THEN
-    EXECUTE 'ALTER TABLE
-      coach_scraper.export
-    ADD CONSTRAINT
-      site_username_unique
-    UNIQUE USING INDEX
-      site_username_unique';
-  END IF;
-END;
-$$ LANGUAGE plpgsql;
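Once `sql/init.sql` and `sql/export.sql` have both run, the data can be queried like any other table; for example, a hypothetical look at the strongest blitz coaches:

```sql
SELECT site, username, blitz
FROM coach_scraper.export
ORDER BY blitz DESC NULLS LAST
LIMIT 10;
```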