commit 1710e1aefa46bf1366924a5c7a7387964d97e330 Author: Joshua Potter Date: Mon Nov 27 13:09:40 2023 -0700 Initial commit. diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..817939c --- /dev/null +++ b/.envrc @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +if command -v git > /dev/null && on_git_branch; then + git config --local core.hooksPath .githooks/ +fi + +use flake diff --git a/.githooks/pre-commit b/.githooks/pre-commit new file mode 100755 index 0000000..64996ae --- /dev/null +++ b/.githooks/pre-commit @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +set -e + +filesToFormat=$( + git --no-pager diff --name-status --no-color --cached | \ + awk '$1 != "D" && $2 ~ /\.py$/ {print $NF}' +) + +for path in $filesToFormat +do + black --quiet "$path" + git add "$path" +done; diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6123955 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +# Directory used by `direnv` to hold `use flake`-generated profiles. +.direnv/ + +# A symlink produced by default when running `nix build`. +result diff --git a/README.md b/README.md new file mode 100644 index 0000000..8d2aa3f --- /dev/null +++ b/README.md @@ -0,0 +1,29 @@ +# Poetry Flake Template + +This is a template for constructing a working environment for Python (version +3.11.6) development. Packaging and dependency management rely on [poetry](https://python-poetry.org/) +(version 1.7.0). [direnv](https://direnv.net/) can be used to launch a dev +shell upon entering this directory (refer to `.envrc`). Otherwise run via: +```bash +$> nix develop +``` + +## Language Server + +The [python-lsp-server](https://github.com/python-lsp/python-lsp-server) +(version v1.9.0) is included in this flake, along with the [python-lsp-black](https://github.com/python-lsp/python-lsp-black) +plugin for formatting purposes. 
`pylsp` is expected to be configured to use +[McCabe](https://github.com/PyCQA/mccabe), [pycodestyle](https://pycodestyle.pycqa.org/en/latest/), +and [pyflakes](https://github.com/PyCQA/pyflakes). Refer to your editor for +configuration details. + +## Formatting + +Formatting depends on the [black](https://black.readthedocs.io/en/stable/index.html) +(version 23.9.1) tool. A `pre-commit` hook is included in `.githooks` that can +be used to format all `*.py` files prior to commit. Install via: +```bash +$> git config --local core.hooksPath .githooks/ +``` +If running [direnv](https://direnv.net/), this hook is installed automatically +when entering the directory. diff --git a/default.nix b/default.nix new file mode 100644 index 0000000..f620865 --- /dev/null +++ b/default.nix @@ -0,0 +1,10 @@ +(import + ( + let lock = builtins.fromJSON (builtins.readFile ./flake.lock); in + fetchTarball { + url = lock.nodes.flake-compat.locked.url or "https://github.com/edolstra/flake-compat/archive/${lock.nodes.flake-compat.locked.rev}.tar.gz"; + sha256 = lock.nodes.flake-compat.locked.narHash; + } + ) + { src = ./.; } +).defaultNix diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..247f171 --- /dev/null +++ b/flake.lock @@ -0,0 +1,190 @@ +{ + "nodes": { + "flake-compat": { + "locked": { + "lastModified": 1696426674, + "narHash": "sha256-kvjfFW7WAETZlt09AgDn1MrtKzP7t90Vf7vypd3OL1U=", + "rev": "0f9255e01c2351cc7d116c072cb317785dd33b33", + "revCount": 57, + "type": "tarball", + "url": "https://api.flakehub.com/f/pinned/edolstra/flake-compat/1.0.1/018afb31-abd1-7bff-a5e4-cff7e18efb7a/source.tar.gz" + }, + "original": { + "type": "tarball", + "url": "https://flakehub.com/f/edolstra/flake-compat/1.tar.gz" + } + }, + "flake-utils": { + "inputs": { + "systems": "systems" + }, + "locked": { + "lastModified": 1694529238, + "narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=", + "owner": "numtide", + "repo": "flake-utils", + "rev": 
"ff7b65b44d01cf9ba6a71320833626af21126384", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "flake-utils_2": { + "inputs": { + "systems": "systems_2" + }, + "locked": { + "lastModified": 1694529238, + "narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "ff7b65b44d01cf9ba6a71320833626af21126384", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "nix-github-actions": { + "inputs": { + "nixpkgs": [ + "poetry2nix", + "nixpkgs" + ] + }, + "locked": { + "lastModified": 1698974481, + "narHash": "sha256-yPncV9Ohdz1zPZxYHQf47S8S0VrnhV7nNhCawY46hDA=", + "owner": "nix-community", + "repo": "nix-github-actions", + "rev": "4bb5e752616262457bc7ca5882192a564c0472d2", + "type": "github" + }, + "original": { + "owner": "nix-community", + "repo": "nix-github-actions", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1700794826, + "narHash": "sha256-RyJTnTNKhO0yqRpDISk03I/4A67/dp96YRxc86YOPgU=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "5a09cb4b393d58f9ed0d9ca1555016a8543c2ac8", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "poetry2nix": { + "inputs": { + "flake-utils": "flake-utils_2", + "nix-github-actions": "nix-github-actions", + "nixpkgs": [ + "nixpkgs" + ], + "systems": "systems_3", + "treefmt-nix": "treefmt-nix" + }, + "locked": { + "lastModified": 1701035916, + "narHash": "sha256-Cbe/3H9/z7vIXAeMr9m9iXs1gRxpE3w1mrx9aaxWtGU=", + "owner": "nix-community", + "repo": "poetry2nix", + "rev": "2bee2516bc054458b0cbca10b18e2ec63cea8726", + "type": "github" + }, + "original": { + "owner": "nix-community", + "repo": "poetry2nix", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-compat": "flake-compat", + "flake-utils": "flake-utils", + 
"nixpkgs": "nixpkgs", + "poetry2nix": "poetry2nix" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + }, + "systems_2": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + }, + "systems_3": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "id": "systems", + "type": "indirect" + } + }, + "treefmt-nix": { + "inputs": { + "nixpkgs": [ + "poetry2nix", + "nixpkgs" + ] + }, + "locked": { + "lastModified": 1699786194, + "narHash": "sha256-3h3EH1FXQkIeAuzaWB+nK0XK54uSD46pp+dMD3gAcB4=", + "owner": "numtide", + "repo": "treefmt-nix", + "rev": "e82f32aa7f06bbbd56d7b12186d555223dc399d1", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "treefmt-nix", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..d4d4f8e --- /dev/null +++ b/flake.nix @@ -0,0 +1,89 @@ +{ + description = '' + An opinionated poetry flake. 
+ + This flake has been adapted from the `app` template found in: + https://github.com/nix-community/poetry2nix + + To generate a copy of this template elsewhere, run: + $> bootstrap poetry + ''; + + inputs = { + flake-compat.url = "https://flakehub.com/f/edolstra/flake-compat/1.tar.gz"; + flake-utils.url = "github:numtide/flake-utils"; + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + poetry2nix = { + url = "github:nix-community/poetry2nix"; + inputs.nixpkgs.follows = "nixpkgs"; + }; + }; + + outputs = { self, nixpkgs, flake-utils, poetry2nix, ... }: + flake-utils.lib.eachDefaultSystem (system: + let + # See https://github.com/nix-community/poetry2nix/tree/master#api for + # more functions and examples. + pkgs = nixpkgs.legacyPackages.${system}; + + inherit + (poetry2nix.lib.mkPoetry2Nix { inherit pkgs; }) + mkPoetryApplication + defaultPoetryOverrides; + + # https://github.com/nix-community/poetry2nix/blob/ec4364021900f8e0d425d901b6e6ff03cf201efb/docs/edgecases.md + # `poetry2nix`, by default, prefers building from source. To build + # certain dependencies, we need to augment its build dependencies by + # adding the corresponding build backend (e.g. `setuptools`). + # + # For example, you can write: + # ```nix + # pypkgs-build-requirements = { + # ... + # coolname = [ "setuptools" ]; + # ... 
+ # }; + # ``` + # after encountering a build error like: + # + # > ModuleNotFoundError: No module named 'setuptools' + pypkgs-build-requirements = {}; + poetry2nix-overrides = defaultPoetryOverrides.extend (self: super: + builtins.mapAttrs (package: build-requirements: + (builtins.getAttr package super).overridePythonAttrs (old: { + buildInputs = + (old.buildInputs or []) ++ + (builtins.map (pkg: + if builtins.isString pkg then + builtins.getAttr pkg super + else + pkg) build-requirements); + }) + ) pypkgs-build-requirements + ); + in + { + packages = { + tmpl-app = mkPoetryApplication { + projectDir = ./.; + overrides = poetry2nix-overrides; + }; + default = self.packages.${system}.tmpl-app; + }; + + devShells.default = pkgs.mkShell { + inputsFrom = [ self.packages.${system}.tmpl-app ]; + packages = [ + pkgs.poetry + ] ++ (with pkgs.python311Packages; [ + black + debugpy + mccabe + pycodestyle + pyflakes + python-lsp-black + python-lsp-server + ]); + }; + }); +} diff --git a/main.py b/main.py new file mode 100644 index 0000000..a77c3f1 --- /dev/null +++ b/main.py @@ -0,0 +1,99 @@ +import aiohttp +import asyncio +import os +import os.path +import random + +from bs4 import BeautifulSoup + + +# References to paths we use to save any scraped content. 
+DATA_COACH_LINKS = "data/coach_links.txt"
+DATA_COACH_DIR = "data/coach/{}/{}"
+
+# Prefix that `main` expects every scraped member link to start with.
+MEMBER_URL_PREFIX = "https://www.chess.com/member/"
+
+
+async def scrape_coach_links(page_no):
+    """Scrape a single coach page listing.
+
+    @param page_no: Page number substituted into the `page` query parameter
+    of the chess.com/coaches listing URL.
+    @return: A list of the `href` values found on the page. The list is
+    empty if the request did not respond with a 200.
+    """
+    href = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
+    async with aiohttp.ClientSession() as session:
+        async with session.get(href) as response:
+            if response.status != 200:
+                print(f"Encountered {response.status} when retrieving {href}")
+                # Return an empty list rather than `None` so callers can
+                # always iterate over the result.
+                return []
+            html = await response.text()
+
+    soup = BeautifulSoup(html, "html.parser")
+    members = soup.find_all("a", class_="members-categories-username")
+    # Anchors without an `href` would otherwise be recorded as "None".
+    return [link for link in (m.get("href") for m in members) if link]
+
+
+async def scrape_all_coach_links(max_pages=62):
+    """Scans through chess.com/coaches for all member links.
+
+    If `DATA_COACH_LINKS` already exists, its contents are returned and no
+    requests are made. Otherwise every page is scraped and the links are
+    appended to `DATA_COACH_LINKS` as they are found.
+
+    @param max_pages: The number of listing pages to scan, starting at 1.
+    @return: A list of every member link found, without trailing newlines.
+    """
+    if os.path.isfile(DATA_COACH_LINKS):
+        with open(DATA_COACH_LINKS, "r") as f:
+            # Strip newlines so cached and freshly scraped results agree.
+            return [line.strip() for line in f if line.strip()]
+
+    os.makedirs(os.path.dirname(DATA_COACH_LINKS), exist_ok=True)
+    all_links = []
+    for i in range(1, max_pages + 1):
+        links = await scrape_coach_links(i)
+        # Reopen the file per page so I can `tail -f` the file.
+        with open(DATA_COACH_LINKS, "a") as f:
+            for link in links:
+                f.write(f"{link}\n")
+        # Accumulate across pages; returning `links` alone would only
+        # report the final page.
+        all_links.extend(links)
+        await asyncio.sleep(random.randint(2, 5))
+    return all_links
+
+
+async def download_member_info(member_name, filename, href):
+    """Download member-specific content.
+
+    @param member_name: Name of the directory under `data/coach` to save to.
+    @param filename: Name of the file to create within that directory.
+    @param href: The URL to download.
+    @return: True if we downloaded content. False if the results already
+    existed locally or the request did not respond with a 200.
+    """
+    target = DATA_COACH_DIR.format(member_name, filename)
+    if os.path.isfile(target):
+        return False
+
+    async with aiohttp.ClientSession() as session:
+        async with session.get(href) as response:
+            if response.status != 200:
+                print(f"Encountered {response.status} when retrieving {href}")
+                return False
+            content = await response.text()
+
+    # Only touch the filesystem once we have content. Opening the file
+    # before the request leaves an empty file behind on failure, which a
+    # later run would mistake for an already downloaded result.
+    os.makedirs(os.path.dirname(target), exist_ok=True)
+    with open(target, "w") as f:
+        f.write(content)
+    return True
+
+
+async def main():
+    """Download the profile, activity, and stats of every scraped coach."""
+    links = await scrape_all_coach_links()
+    for link in links:
+        href = link.strip()
+        if not href.startswith(MEMBER_URL_PREFIX):
+            # Slicing the prefix off of a blank or unexpected link would
+            # yield a bogus member name and bogus request URLs.
+            print(f"Skipping unrecognized link {href!r}")
+            continue
+        member_name = href.removeprefix(MEMBER_URL_PREFIX)
+        downloaded = await asyncio.gather(
+            download_member_info(
+                member_name,
+                f"{member_name}.html",
+                href,
+            ),
+            download_member_info(
+                member_name,
+                "activity.json",
+                f"https://www.chess.com/callback/member/activity/{member_name}?page=1",
+            ),
+            download_member_info(
+                member_name,
+                "stats.json",
+                f"https://www.chess.com/callback/member/stats/{member_name}",
+            ),
+        )
+        # Only want to sleep if the files didn't already exist.
+        if any(downloaded):
+            await asyncio.sleep(random.randint(2, 5))
+            print(f"Downloaded {member_name}")
+        else:
+            print(f"Skipping {member_name}")
+
+
+if __name__ == "__main__":
+    os.makedirs("data/coach", exist_ok=True)
+    asyncio.run(main())
diff --git a/poetry.lock b/poetry.lock
new file mode 100644
index 0000000..5dfbaad
--- /dev/null
+++ b/poetry.lock
@@ -0,0 +1,7 @@
+# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
+package = [] + +[metadata] +lock-version = "2.0" +python-versions = "^3.11" +content-hash = "81b2fa642d7f2d1219cf80112ace12d689d053d81be7f7addb98144d56fc0fb2" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..7e8633f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,15 @@ +[tool.poetry] +name = "chesscom-scraper" +version = "0.1.0" +description = "" +authors = ["jrpotter2112@gmail.com"] +readme = "README.md" + +[tool.poetry.dependencies] +python = "^3.11" +beautifulsoup4 = "^4.12.2" +aiohttp = "^3.8.6" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api"