parent 10801b560c
commit bc2ffeae9d

README.md
@@ -1,25 +1,29 @@
-# chesscom-scraper
+# coach-scraper
 
 **Caution! Be careful running this script.**
 
-We intentionally delay each batch of requests by 3 seconds. Make sure any
-adjustments to this script appropriately rate-limit.
+We intentionally delay each batch of requests. Make sure any adjustments to this
+script appropriately rate-limit.
 
 ## Overview
 
-This is a simple web scraper for [chess.com](https://www.chess.com/coaches)
-coaches. The program searches for all listed coaches as well as specific
-information about each of them (their profile, recent activity, and stats). The
-result will be found in a newly created `data` directory with the following
-structure:
+This is a simple web scraper for coaches listed on:
+
+* [chess.com](https://www.chess.com/coaches)
+* [lichess.org](https://www.lichess.org/coach)
+
+The program searches for coach usernames as well as specific information about
+each of them (their profile, recent activity, and stats). The result will be
+found in a newly created `data` directory with the following structure:
 ```
 data
-├── coach
-│   ├── <username>
-│   │   ├── <username>.html
-│   │   ├── activity.json
-│   │   └── stats.json
-│   ├── ...
+└── <site>
+│   ├── coaches
+│   │   ├── <username>
+│   │   │   ├── <username>.html
+│   │   │   ├── export.json
+│   │   │   └── ...
+│   │   ├── ...
 └── pages
     ├── <n>.txt
     ├── ...
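The README now leaves the exact delay unspecified; in this commit the Lichess scraper sleeps `SLEEP_SECS = 5` seconds after each page download. Below is a minimal sketch of the same throttling pattern, assuming nothing beyond standard `aiohttp`/`asyncio`; the URL list, session settings, and `fetch_all` name are placeholders rather than project code.

```
import asyncio
from typing import List

import aiohttp

SLEEP_SECS = 5  # matches the Lichess scraper's delay; adjust with care


async def fetch_all(urls: List[str]) -> List[str]:
    """Fetch each URL in turn, sleeping between requests to stay rate-limited."""
    pages = []
    async with aiohttp.ClientSession() as session:
        for url in urls:
            async with session.get(url) as response:
                pages.append(await response.text())
            await asyncio.sleep(SLEEP_SECS)  # deliberate delay between requests
    return pages
```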
@@ -3,6 +3,7 @@ import argparse
 import asyncio
 
 from app.chesscom import Scraper as ChesscomScraper
+from app.lichess import Scraper as LichessScraper
 from app.scraper import Site
 
 
@@ -18,6 +19,7 @@ async def run():
         required=True,
         choices=[
             Site.CHESSCOM.value,
+            Site.LICHESS.value,
         ],
     )
     args = parser.parse_args()
@@ -27,6 +29,8 @@ async def run():
     ) as session:
         if args.site == Site.CHESSCOM.value:
             scraper = ChesscomScraper(session)
+        elif args.site == Site.LICHESS.value:
+            scraper = LichessScraper(session)
 
         await scraper.scrape()
 
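The dispatch added above also makes the new scraper usable outside the `run()` entry point. A minimal sketch of driving the `app.lichess` Scraper directly, assuming a bare `aiohttp.ClientSession` (the session arguments that `run()` passes are elided here, and `run_lichess` is an illustrative name only):

```
import asyncio

import aiohttp

from app.lichess import Scraper as LichessScraper


async def run_lichess() -> None:
    # Mirror run(): build the scraper around a shared HTTP session, then scrape.
    async with aiohttp.ClientSession() as session:
        scraper = LichessScraper(session)
        await scraper.scrape()


if __name__ == "__main__":
    asyncio.run(run_lichess())
```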
@@ -179,15 +179,12 @@ class Scraper(BaseScraper):
 
     async def export(self, username: str) -> Export:
         """Transform coach-specific data into uniform format."""
-        stat_export: Export = {}
+        export: Export = {}
 
         try:
             with open(self.path_coach_file(username, "stats.json"), "r") as f:
-                stat_export = self._load_stats_json(json.load(f))
+                export.update(self._load_stats_json(json.load(f)))
         except FileNotFoundError:
             pass
-        export: Export = {
-            "fide_rapid": None,
-        }
-        export.update(stat_export)
+
         return export
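Beyond the rename, the hunk above changes behavior slightly: the export no longer pre-seeds `fide_rapid` with `None`, so fields missing from `stats.json` are simply absent from the result. A small illustration of the difference using plain dicts (the empty `stats` value stands in for a sparse or missing stats file):

```
stats: dict = {}  # e.g. what loading an empty or missing stats.json yields

old_style: dict = {"fide_rapid": None}
old_style.update(stats)
print(old_style)  # {'fide_rapid': None} -- the key was always present

new_style: dict = {}
new_style.update(stats)
print(new_style)  # {} -- keys appear only when the stats provide them
```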
@@ -0,0 +1,128 @@
+import aiohttp
+import asyncio
+import os
+import os.path
+
+from app.scraper import AnsiColor, BaseScraper, Export, Site
+from bs4 import BeautifulSoup
+from typing import List
+
+
+# The number of pages we will at most iterate through. This number was
+# determined by going to https://lichess.org/coach/all/all/alphabetical
+# and traversing to the last page.
+MAX_PAGES = 162
+
+# How long to wait between each network request.
+SLEEP_SECS = 5
+
+
+class Scraper(BaseScraper):
+    def __init__(self, session: aiohttp.ClientSession):
+        super().__init__(site=Site.LICHESS.value, session=session)
+
+    async def download_usernames(self) -> List[str]:
+        """Scan through lichess.org/coach for all coaches' usernames.
+
+        @return
+            The complete list of scraped usernames across every coach listing
+            page.
+        """
+        usernames = []
+        for page_no in range(1, MAX_PAGES + 1):
+            filepath = self.path_page_file(page_no)
+            try:
+                with open(filepath, "r") as f:
+                    self.log(
+                        [
+                            (AnsiColor.INFO, "[INFO]"),
+                            (None, ": Reading file "),
+                            (AnsiColor.DATA, filepath),
+                        ]
+                    )
+                    usernames.extend([line.strip() for line in f.readlines()])
+            except FileNotFoundError:
+                page_usernames = await self._scrape_page(page_no)
+                if not page_usernames:
+                    self.log(
+                        [
+                            (AnsiColor.ERROR, "[ERROR]"),
+                            (None, ": Could not scrape page "),
+                            (AnsiColor.DATA, str(page_no)),
+                        ]
+                    )
+                    continue
+                with open(filepath, "w") as f:
+                    for username in page_usernames:
+                        f.write(f"{username}\n")
+                usernames.extend(page_usernames)
+                self.log(
+                    [
+                        (AnsiColor.INFO, "[INFO]"),
+                        (None, ": Downloaded page "),
+                        (AnsiColor.DATA, filepath),
+                    ]
+                )
+                await asyncio.sleep(SLEEP_SECS)
+
+        return usernames
+
+    async def _scrape_page(self, page_no: int):
+        """Scan through lichess.org/coach/.../?page=<n> for all coaches'
+        usernames.
+
+        @param page_no
+            The page consisting of at most 10 coaches (at the time of writing)
+            whose usernames are to be scraped.
+        @return
+            The list of scraped usernames on the specified coach listing page.
+        """
+        url = f"https://lichess.org/coach/all/all/alphabetical?page={page_no}"
+        response, status_code = await self.request(url)
+        if response is None:
+            self.log(
+                [
+                    (AnsiColor.ERROR, "[ERROR]"),
+                    (None, ": Received status "),
+                    (AnsiColor.DATA, f"{status_code} "),
+                    (None, "when downloading page "),
+                    (AnsiColor.DATA, str(page_no)),
+                ]
+            )
+            return
+
+        usernames = []
+        soup = BeautifulSoup(response, "html.parser")
+        members = soup.find_all("article", class_="coach-widget")
+        for member in members:
+            anchor = member.find("a", class_="overlay")
+            if anchor:
+                href = anchor.get("href")
+                username = href[len("/coach/") :]
+                usernames.append(username)
+
+        return usernames
+
+    async def download_profile(self, username: str):
+        """For each coach, download coach-specific data.
+
+        @param username
+            The coach username corresponding to the downloaded files.
+        """
+        filepath = self.path_coach_file(username, f"{username}.html")
+        if os.path.isfile(filepath):
+            return False
+
+        response, _unused_status = await self.request(
+            url=f"https://lichess.org/coach/{username}"
+        )
+        if response is not None:
+            with open(filepath, "w") as f:
+                f.write(response)
+
+        return True
+
+    async def export(self, username: str) -> Export:
+        """Transform coach-specific data into uniform format."""
+        export: Export = {}
+        return export
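`_scrape_page` in the new `app.lichess` module above assumes each coach card on the listing page is an `<article class="coach-widget">` containing an `<a class="overlay">` whose `href` starts with `/coach/`. A standalone illustration of that extraction; the markup below is a simplified stand-in, not a captured lichess.org response:

```
from bs4 import BeautifulSoup

# Simplified stand-in for a single coach card on the listing page.
sample_html = """
<article class="coach-widget">
  <a class="overlay" href="/coach/example-coach"></a>
</article>
"""

soup = BeautifulSoup(sample_html, "html.parser")
for member in soup.find_all("article", class_="coach-widget"):
    anchor = member.find("a", class_="overlay")
    if anchor:
        # Strip the "/coach/" prefix to recover the bare username.
        print(anchor.get("href")[len("/coach/"):])  # example-coach
```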