Transition to a CSV; Postgres can handle that better.
parent ec94a16140
commit 63764a22c4
README.md (27 lines changed)
@@ -29,17 +29,19 @@ data
 ├── ...
 ```

-## Usage
+## Quickstart

 If you have nix available, run:
 ```bash
-$ nix run . -- --user-agent <your-email> -s <site>
+$ nix run . -- --user-agent <your-email> -s <site> [-s <site> ...]
 ```
 If not, ensure you have [poetry](https://python-poetry.org/) on your machine and
 instead run the following:
 ```bash
-$ poetry run python3 -m app -u <your-email> -s <site>
+$ poetry run python3 -m app -u <your-email> -s <site> [-s <site> ...]
 ```
+After running (this may take several hours), a new CSV will be generated at
+`data/export.csv` containing all scraped content from the specified `<site>`s.

 ## Database

@@ -70,13 +72,16 @@ To load all exported coach data into a local Postgres instance, use the provided
 ```bash
 $ psql -h @scraper -f sql/init.sql
 ```
-Next, concatenate all exported content and dump into the newly created table:
+Next, dump exported data into the newly created table:
 ```bash
-$ cat data/{chesscom,lichess}/export.json > data/export.json
-$ psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.json'"
+$ psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.csv'"
 ```
+Re-running the `sql/export.sql` script will create a backup of the
+`coach_scraper.export` table. It will then upsert the scraped data. You can view
+all backups from the `psql` console like so:
+```
+postgres=# \dt coach_scraper.export*
+```
-Re-running will automatically create backups and replace the coach data found
-in `coach_scraper.export`.

 ### E2E

@@ -85,10 +90,8 @@ necessary to scrape coach data from our chess website and dump the results into
 the database in one fell swoop. Assuming our database is open with a socket
 connection available at `@scraper`:
 ```bash
-nix run . -- --user-agent <your-email> -s chesscom -s lichess
-cat data/{chesscom,lichess}/export.json > data/export.json
-psql -h @scraper -f sql/init.sql
-psql -h @scraper -f sql/export.sql -v export="'$PWD/data/export.json'"
+$ nix run . -- --user-agent <your-email> -s chesscom -s lichess
+$ psql -h @scraper -f sql/init.sql -f sql/export.sql -v export="'$PWD/data/export.csv'"
 ```

 ## Development
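The new pipeline writes a single `data/export.csv` instead of per-site NDJSON files. As a quick way to sanity-check the artifact, here is a small sketch (not part of the commit) that peeks at the first few rows from Python; the column order matches what `app/__main__.py` writes below:

```python
import csv

# Columns, in order: site, username, name, image_url, rapid, blitz, bullet.
with open("data/export.csv", newline="") as f:
    for i, row in enumerate(csv.reader(f)):
        print(row)
        if i == 4:  # only peek at the first five rows
            break
```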
app/__main__.py

@@ -1,5 +1,6 @@
 import argparse
 import asyncio
+import csv
 import json

 import aiohttp
@@ -32,25 +33,34 @@ async def run():
     async with aiohttp.ClientSession(
         headers={"User-Agent": f"BoardWise coach-scraper ({args.user_agent})"}
     ) as session:
-        for site in set(args.site):
-            scraper, exporter_cls = None, None
+        with open("data/export.csv", "w") as f:
+            writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
+            for site in set(args.site):
+                scraper, exporter_cls = None, None

                 if site == Site.CHESSCOM.value:
                     scraper = ChesscomScraper(session)
                     exporter_cls = ChesscomExporter
                 elif site == Site.LICHESS.value:
                     scraper = LichessScraper(session)
                     exporter_cls = LichessExporter

-            # Write out each coach data into NDJSON file.
-            usernames = await scraper.scrape()
-            dump = []
-            for username in usernames:
-                export = exporter_cls(username).export()
-                dump.append(f"{json.dumps(export)}\n")
-
-            with open(scraper.path_site_file("export.json"), "w") as f:
-                f.writelines(dump)
+                usernames = await scraper.scrape()
+                for username in usernames:
+                    export = exporter_cls(username).export()
+                    writer.writerow(
+                        [
+                            # This should match the order data is loaded in the
+                            # sql/export.sql script.
+                            export["site"],
+                            export["username"],
+                            export.get("name", ""),
+                            export.get("image_url", ""),
+                            export.get("rapid", ""),
+                            export.get("blitz", ""),
+                            export.get("bullet", ""),
+                        ]
+                    )


 def main():
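A note on the CLI shape this loop assumes: `set(args.site)` and the README's `[-s <site> ...]` imply `-s/--site` is a repeatable flag, presumably registered with `action="append"`. A hedged sketch of that argparse setup (names taken from the README; the actual parser code is not part of this diff):

```python
import argparse

parser = argparse.ArgumentParser(prog="app")
parser.add_argument("-u", "--user-agent", required=True)
parser.add_argument("-s", "--site", action="append", default=[])

args = parser.parse_args(["-u", "me@example.com", "-s", "chesscom", "-s", "chesscom"])
print(set(args.site))  # {'chesscom'}; set() collapses repeated sites
```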
app/chesscom.py

@@ -5,7 +5,7 @@ import os.path
 from typing import List, Union

 import aiohttp
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, SoupStrainer

 from app.exporter import BaseExporter
 from app.repo import AnsiColor, Site
@@ -172,10 +172,27 @@ class Scraper(BaseScraper):
         return True


+def _profile_filter(elem, attrs):
+    """Includes only relevant segments of the `{username}.html` file."""
+    if "profile-header-info" in attrs.get("class", ""):
+        return True
+    if "profile-card-info" in attrs.get("class", ""):
+        return True
+
+
 class Exporter(BaseExporter):
     def __init__(self, username: str):
         super().__init__(site=Site.CHESSCOM.value, username=username)

+        self.profile_soup = None
+        try:
+            with open(self.path_coach_file(username, f"{username}.html"), "r") as f:
+                self.profile_soup = BeautifulSoup(
+                    f.read(), "lxml", parse_only=SoupStrainer(_profile_filter)
+                )
+        except FileNotFoundError:
+            pass
+
         self.stats_json = {}
         try:
             with open(self.path_coach_file(username, "stats.json"), "r") as f:
@@ -185,6 +202,22 @@ class Exporter(BaseExporter):
         except FileNotFoundError:
             pass

+    def export_name(self) -> Union[str, None]:
+        try:
+            name = self.profile_soup.find("div", class_="profile-card-name")
+            return name.get_text().strip()
+        except AttributeError:
+            return None
+
+    def export_image_url(self) -> Union[str, None]:
+        try:
+            div = self.profile_soup.find("div", class_="profile-header-avatar")
+            src = div.find("img").get("src", "")
+            if "images.chesscomfiles.com" in src:
+                return src
+        except AttributeError:
+            return None
+
     def export_rapid(self) -> Union[int, None]:
         return self.stats_json.get("rapid", {}).get("rating")
app/exporter.py

@@ -1,4 +1,4 @@
-from typing import Union
+from typing import Any, Union

 from typing_extensions import TypedDict

@@ -6,6 +6,13 @@ from app.repo import AnsiColor, Repo


 class Export(TypedDict, total=False):
+    # The (site, username) make up a unique key for each coach.
+    site: str
+    username: str
+    # The coach's real name.
+    name: str
+    # The profile image used on the source site.
+    image_url: str
     # The coach's rapid rating relative to the site they were sourced from.
     rapid: int
     # The coach's blitz rating relative to the site they were sourced from.
@@ -14,10 +21,9 @@ class Export(TypedDict, total=False):
     bullet: int


-def _insert(export: Export, key: str, value: any):
-    if value is None:
-        return
-    export[key] = value
+def _insert(export: Export, key: str, value: Any):
+    if value is not None:
+        export[key] = value


 class BaseExporter(Repo):
@@ -25,6 +31,12 @@ class BaseExporter(Repo):
         super().__init__(site)
         self.username = username

+    def export_name(self) -> Union[str, None]:
+        raise NotImplementedError()
+
+    def export_image_url(self) -> Union[str, None]:
+        raise NotImplementedError()
+
     def export_rapid(self) -> Union[int, None]:
         raise NotImplementedError()

@@ -40,6 +52,8 @@ class BaseExporter(Repo):

         _insert(export, "site", self.site)
         _insert(export, "username", self.username)
+        _insert(export, "name", self.export_name())
+        _insert(export, "image_url", self.export_image_url())
         _insert(export, "rapid", self.export_rapid())
         _insert(export, "blitz", self.export_blitz())
         _insert(export, "bullet", self.export_bullet())
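To make the `_insert` change concrete: `any` is the builtin function while `Any` is the typing construct, and the inverted condition keeps the behavior of skipping `None` values, so absent fields are simply missing keys that `__main__` later defaults to empty strings. A trimmed, runnable sketch:

```python
from typing import Any

from typing_extensions import TypedDict


class Export(TypedDict, total=False):
    site: str
    username: str
    rapid: int


def _insert(export: Export, key: str, value: Any):
    if value is not None:
        export[key] = value


export: Export = {}
_insert(export, "site", "lichess")
_insert(export, "username", "some-coach")
_insert(export, "rapid", None)  # skipped; no "rapid" key is created
print(export)  # {'site': 'lichess', 'username': 'some-coach'}
```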
app/lichess.py

@@ -1,7 +1,7 @@
 import asyncio
 import os
 import os.path
-from typing import List
+from typing import List, Union

 import aiohttp
 from bs4 import BeautifulSoup, SoupStrainer
@@ -97,9 +97,9 @@ class Scraper(BaseScraper):
         soup = BeautifulSoup(response, "lxml")
         members = soup.find_all("article", class_="coach-widget")
         for member in members:
-            anchor = member.find("a", class_="overlay")
-            if anchor:
-                href = anchor.get("href")
+            a = member.find("a", class_="overlay")
+            if a:
+                href = a.get("href")
                 username = href[len("/coach/") :]
                 usernames.append(username)

@@ -163,8 +163,16 @@ class Scraper(BaseScraper):
         return True


+def _profile_filter(elem, attrs):
+    """Includes only relevant segments of the `{username}.html` file."""
+    if "coach-widget" in attrs.get("class", ""):
+        return True
+
+
 def _stats_filter(elem, attrs):
     """Includes only relevant segments of the `stats.html` file."""
+    if "profile-side" in attrs.get("class", ""):
+        return True
     if "sub-ratings" in attrs.get("class", ""):
         return True

@@ -173,43 +181,59 @@ class Exporter(BaseExporter):
     def __init__(self, username: str):
         super().__init__(site=Site.LICHESS.value, username=username)

-        self.stats_soup = None
+        self.profile_soup = None
         try:
-            with open(self.path_coach_file(username, "stats.html"), "r") as f:
-                stats_strainer = SoupStrainer(_stats_filter)
-                self.stats_soup = BeautifulSoup(
-                    f.read(), "lxml", parse_only=stats_strainer
+            with open(self.path_coach_file(username, f"{username}.html"), "r") as f:
+                self.profile_soup = BeautifulSoup(
+                    f.read(), "lxml", parse_only=SoupStrainer(_profile_filter)
                 )
         except FileNotFoundError:
             pass

-    def export_rapid(self):
+        self.stats_soup = None
+        try:
+            with open(self.path_coach_file(username, "stats.html"), "r") as f:
+                self.stats_soup = BeautifulSoup(
+                    f.read(), "lxml", parse_only=SoupStrainer(_stats_filter)
+                )
+        except FileNotFoundError:
+            pass
+
+    def export_name(self) -> Union[str, None]:
+        try:
+            profile_side = self.stats_soup.find("div", class_="profile-side")
+            user_infos = profile_side.find("div", class_="user-infos")
+            name = user_infos.find("strong", class_="name")
+            return name.get_text().strip()
+        except AttributeError:
+            return None
+
+    def export_image_url(self) -> Union[str, None]:
+        try:
+            picture = self.profile_soup.find("img", class_="picture")
+            src = picture.get("src", "")
+            if "image.lichess1.org" in src:
+                return src
+        except AttributeError:
+            return None
+
+    def export_rapid(self) -> Union[int, None]:
         return self._find_rating("rapid")

-    def export_blitz(self):
+    def export_blitz(self) -> Union[int, None]:
         return self._find_rating("blitz")

-    def export_bullet(self):
+    def export_bullet(self) -> Union[int, None]:
         return self._find_rating("bullet")

-    def _find_rating(self, name):
-        if self.stats_soup is None:
-            return None
-
-        anchor = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
-        if anchor is None:
-            return None
-        rating = anchor.find("rating")
-        if rating is None:
-            return None
-        strong = rating.find("strong")
-        if strong is None:
-            return None
-        value = strong.get_text()
-        if value[-1] == "?":
-            value = value[:-1]
-
-        try:
-            return int(value)
-        except ValueError:
+    def _find_rating(self, name) -> Union[int, None]:
+        try:
+            a = self.stats_soup.find("a", href=f"/@/{self.username}/perf/{name}")
+            rating = a.find("rating")
+            strong = rating.find("strong")
+            value = strong.get_text()
+            if value[-1] == "?":
+                value = value[:-1]
+            return int(value)
+        except (AttributeError, ValueError):
             return None
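The `_find_rating` rewrite swaps step-by-step `None` checks for EAFP: if any `find` returns `None`, the following attribute access raises `AttributeError`, which is caught alongside the `int()` `ValueError`. A standalone sketch against made-up markup:

```python
from bs4 import BeautifulSoup


def find_rating(soup, username, name):
    try:
        a = soup.find("a", href=f"/@/{username}/perf/{name}")
        value = a.find("rating").find("strong").get_text()
        if value[-1] == "?":  # provisional ratings end in "?"
            value = value[:-1]
        return int(value)
    except (AttributeError, ValueError):
        return None


html = '<a href="/@/some-coach/perf/rapid"><rating><strong>2105?</strong></rating></a>'
soup = BeautifulSoup(html, "lxml")
print(find_rating(soup, "some-coach", "rapid"))   # 2105
print(find_rating(soup, "some-coach", "bullet"))  # None; nothing matches
```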
sql/export.sql

@@ -7,31 +7,47 @@ DO $$
 END;
 $$ LANGUAGE plpgsql;

-CREATE TEMPORARY TABLE pg_temp.coach_scraper_export (data JSONB);
+-- This should match the order data is written in the app/__main__.py
+-- script.
+CREATE TEMPORARY TABLE pg_temp.coach_scraper_export
+  ( site TEXT
+  , username TEXT
+  , name TEXT
+  , image_url TEXT
+  , rapid TEXT
+  , blitz TEXT
+  , bullet TEXT
+  );

 SELECT format(
-  $$COPY pg_temp.coach_scraper_export (data) from %L$$,
+  $$COPY pg_temp.coach_scraper_export FROM %L WITH (FORMAT CSV)$$,
   :export
 ) \gexec

 INSERT INTO coach_scraper.export
-  ( username
-  , site
+  ( site
+  , username
+  , name
+  , image_url
   , rapid
   , blitz
   , bullet
   )
 SELECT
-  data->>'username',
-  data->>'site',
-  (data->>'rapid')::INT,
-  (data->>'blitz')::INT,
-  (data->>'bullet')::INT
+  site,
+  username,
+  name,
+  image_url,
+  rapid::INT,
+  blitz::INT,
+  bullet::INT
 FROM
   pg_temp.coach_scraper_export
 ON CONFLICT
   (site, username)
 DO UPDATE SET
+  name = EXCLUDED.name,
+  image_url = EXCLUDED.image_url,
   rapid = EXCLUDED.rapid,
   blitz = EXCLUDED.blitz,
   bullet = EXCLUDED.bullet;
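One subtlety that makes the `::INT` casts safe (my reading; the diff does not spell it out): missing ratings are written as empty, unquoted CSV fields, and `COPY ... WITH (FORMAT CSV)` treats an unquoted empty field as NULL by default, so `rapid::INT` yields NULL instead of failing to parse `''`. The writer side, sketched:

```python
import csv
import io

# A coach with no recorded bullet rating: export.get("bullet", "") is "".
buf = io.StringIO()
csv.writer(buf, quoting=csv.QUOTE_MINIMAL).writerow(
    ["lichess", "some-coach", "Jane Doe", "", 2101, 2245, ""]
)
print(buf.getvalue().strip())
# lichess,some-coach,Jane Doe,,2101,2245,
# COPY's CSV mode reads those empty fields as NULL, and NULL::INT is NULL.
```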
sql/init.sql (25 lines changed)

@@ -1,9 +1,13 @@
 CREATE SCHEMA IF NOT EXISTS coach_scraper;

-CREATE TABLE IF NOT EXISTS coach_scraper.export
+DROP TABLE IF EXISTS coach_scraper.export;
+
+CREATE TABLE coach_scraper.export
   ( id SERIAL PRIMARY KEY
   , site VARCHAR(16) NOT NULL
   , username VARCHAR(255) NOT NULL
+  , name VARCHAR(255)
+  , image_url TEXT
   , rapid INT
   , blitz INT
   , bullet INT
@@ -15,22 +19,3 @@ ON
   coach_scraper.export
 USING
   BTREE (site, username);
-
-DO $$
-BEGIN
-  IF NOT EXISTS (
-    SELECT 1
-    FROM information_schema.constraint_column_usage
-    WHERE table_schema = 'coach_scraper'
-    AND table_name = 'export'
-    AND constraint_name = 'site_username_unique'
-  ) THEN
-    EXECUTE 'ALTER TABLE
-      coach_scraper.export
-    ADD CONSTRAINT
-      site_username_unique
-    UNIQUE USING INDEX
-      site_username_unique';
-  END IF;
-END;
-$$ LANGUAGE plpgsql;