From 1710e1aefa46bf1366924a5c7a7387964d97e330 Mon Sep 17 00:00:00 2001 From: Joshua Potter Date: Mon, 27 Nov 2023 13:09:40 -0700 Subject: [PATCH] Initial commit. --- .envrc | 7 ++ .githooks/pre-commit | 13 +++ .gitignore | 5 ++ README.md | 29 +++++++ default.nix | 10 +++ flake.lock | 190 +++++++++++++++++++++++++++++++++++++++++++ flake.nix | 89 ++++++++++++++++++++ main.py | 99 ++++++++++++++++++++++ poetry.lock | 7 ++ pyproject.toml | 15 ++++ 10 files changed, 464 insertions(+) create mode 100644 .envrc create mode 100755 .githooks/pre-commit create mode 100644 .gitignore create mode 100644 README.md create mode 100644 default.nix create mode 100644 flake.lock create mode 100644 flake.nix create mode 100644 main.py create mode 100644 poetry.lock create mode 100644 pyproject.toml diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..817939c --- /dev/null +++ b/.envrc @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +if command -v git > /dev/null && on_git_branch; then + git config --local core.hooksPath .githooks/ +fi + +use flake diff --git a/.githooks/pre-commit b/.githooks/pre-commit new file mode 100755 index 0000000..64996ae --- /dev/null +++ b/.githooks/pre-commit @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +set -e + +filesToFormat=$( + git --no-pager diff --name-status --no-color --cached | \ + awk '$1 != "D" && $2 ~ /\.py$/ {print $NF}' +) + +for path in $filesToFormat +do + black --quiet "$path" + git add "$path" +done; diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6123955 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +# Directory used by `direnv` to hold `use flake`-generated profiles. +.direnv/ + +# A symlink produced by default when running `nix build`. +result diff --git a/README.md b/README.md new file mode 100644 index 0000000..8d2aa3f --- /dev/null +++ b/README.md @@ -0,0 +1,29 @@ +# Poetry Flake Template + +This is a template for constructing a working environment for Python (version +3.11.6) development. 
Packaging and dependency management relies on [poetry](https://python-poetry.org/) +(version 1.7.0). [direnv](https://direnv.net/) can be used to launch a dev +shell upon entering this directory (refer to `.envrc`). Otherwise run via: +```bash +$> nix develop +``` + +## Language Server + +The [python-lsp-server](https://github.com/python-lsp/python-lsp-server) +(version 1.9.0) is included in this flake, along with the [python-lsp-black](https://github.com/python-lsp/python-lsp-black) +plugin for formatting purposes. `pylsp` is expected to be configured to use +[McCabe](https://github.com/PyCQA/mccabe), [pycodestyle](https://pycodestyle.pycqa.org/en/latest/), +and [pyflakes](https://github.com/PyCQA/pyflakes). Refer to your editor for +configuration details. + +## Formatting + +Formatting depends on the [black](https://black.readthedocs.io/en/stable/index.html) +(version 23.9.1) tool. A `pre-commit` hook is included in `.githooks` that can +be used to format all staged `*.py` files prior to commit. Install via: +```bash +$> git config --local core.hooksPath .githooks/ +``` +If running [direnv](https://direnv.net/), this hook is installed automatically +when entering the directory. 
diff --git a/default.nix b/default.nix new file mode 100644 index 0000000..f620865 --- /dev/null +++ b/default.nix @@ -0,0 +1,10 @@ +(import + ( + let lock = builtins.fromJSON (builtins.readFile ./flake.lock); in + fetchTarball { + url = lock.nodes.flake-compat.locked.url or "https://github.com/edolstra/flake-compat/archive/${lock.nodes.flake-compat.locked.rev}.tar.gz"; + sha256 = lock.nodes.flake-compat.locked.narHash; + } + ) + { src = ./.; } +).defaultNix diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..247f171 --- /dev/null +++ b/flake.lock @@ -0,0 +1,190 @@ +{ + "nodes": { + "flake-compat": { + "locked": { + "lastModified": 1696426674, + "narHash": "sha256-kvjfFW7WAETZlt09AgDn1MrtKzP7t90Vf7vypd3OL1U=", + "rev": "0f9255e01c2351cc7d116c072cb317785dd33b33", + "revCount": 57, + "type": "tarball", + "url": "https://api.flakehub.com/f/pinned/edolstra/flake-compat/1.0.1/018afb31-abd1-7bff-a5e4-cff7e18efb7a/source.tar.gz" + }, + "original": { + "type": "tarball", + "url": "https://flakehub.com/f/edolstra/flake-compat/1.tar.gz" + } + }, + "flake-utils": { + "inputs": { + "systems": "systems" + }, + "locked": { + "lastModified": 1694529238, + "narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "ff7b65b44d01cf9ba6a71320833626af21126384", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "flake-utils_2": { + "inputs": { + "systems": "systems_2" + }, + "locked": { + "lastModified": 1694529238, + "narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "ff7b65b44d01cf9ba6a71320833626af21126384", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "nix-github-actions": { + "inputs": { + "nixpkgs": [ + "poetry2nix", + "nixpkgs" + ] + }, + "locked": { + "lastModified": 1698974481, + 
"narHash": "sha256-yPncV9Ohdz1zPZxYHQf47S8S0VrnhV7nNhCawY46hDA=", + "owner": "nix-community", + "repo": "nix-github-actions", + "rev": "4bb5e752616262457bc7ca5882192a564c0472d2", + "type": "github" + }, + "original": { + "owner": "nix-community", + "repo": "nix-github-actions", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1700794826, + "narHash": "sha256-RyJTnTNKhO0yqRpDISk03I/4A67/dp96YRxc86YOPgU=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "5a09cb4b393d58f9ed0d9ca1555016a8543c2ac8", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "poetry2nix": { + "inputs": { + "flake-utils": "flake-utils_2", + "nix-github-actions": "nix-github-actions", + "nixpkgs": [ + "nixpkgs" + ], + "systems": "systems_3", + "treefmt-nix": "treefmt-nix" + }, + "locked": { + "lastModified": 1701035916, + "narHash": "sha256-Cbe/3H9/z7vIXAeMr9m9iXs1gRxpE3w1mrx9aaxWtGU=", + "owner": "nix-community", + "repo": "poetry2nix", + "rev": "2bee2516bc054458b0cbca10b18e2ec63cea8726", + "type": "github" + }, + "original": { + "owner": "nix-community", + "repo": "poetry2nix", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-compat": "flake-compat", + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs", + "poetry2nix": "poetry2nix" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + }, + "systems_2": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + 
"repo": "default", + "type": "github" + } + }, + "systems_3": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "id": "systems", + "type": "indirect" + } + }, + "treefmt-nix": { + "inputs": { + "nixpkgs": [ + "poetry2nix", + "nixpkgs" + ] + }, + "locked": { + "lastModified": 1699786194, + "narHash": "sha256-3h3EH1FXQkIeAuzaWB+nK0XK54uSD46pp+dMD3gAcB4=", + "owner": "numtide", + "repo": "treefmt-nix", + "rev": "e82f32aa7f06bbbd56d7b12186d555223dc399d1", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "treefmt-nix", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..d4d4f8e --- /dev/null +++ b/flake.nix @@ -0,0 +1,89 @@ +{ + description = '' + An opinionated poetry flake. + + This flake has been adapted from the `app` template found in: + https://github.com/nix-community/poetry2nix + + To generate a copy of this template elsewhere, run: + $> bootstrap poetry + ''; + + inputs = { + flake-compat.url = "https://flakehub.com/f/edolstra/flake-compat/1.tar.gz"; + flake-utils.url = "github:numtide/flake-utils"; + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + poetry2nix = { + url = "github:nix-community/poetry2nix"; + inputs.nixpkgs.follows = "nixpkgs"; + }; + }; + + outputs = { self, nixpkgs, flake-utils, poetry2nix, ... }: + flake-utils.lib.eachDefaultSystem (system: + let + # See https://github.com/nix-community/poetry2nix/tree/master#api for + # more functions and examples. 
+ pkgs = nixpkgs.legacyPackages.${system}; + + inherit + (poetry2nix.lib.mkPoetry2Nix { inherit pkgs; }) + mkPoetryApplication + defaultPoetryOverrides; + + # https://github.com/nix-community/poetry2nix/blob/ec4364021900f8e0d425d901b6e6ff03cf201efb/docs/edgecases.md + # `poetry2nix`, by default, prefers building from source. To build + # certain dependencies, we need to augment its build dependencies by + # adding the corresponding build backend (e.g. `setuptools`). + # + # For example, you can write: + # ```nix + # pypkgs-build-requirements = { + # ... + # coolname = [ "setuptools" ]; + # ... + # }; + # ``` + # after encountering a build error like: + # + # > ModuleNotFoundError: No module named 'setuptools' + pypkgs-build-requirements = {}; + poetry2nix-overrides = defaultPoetryOverrides.extend (self: super: + builtins.mapAttrs (package: build-requirements: + (builtins.getAttr package super).overridePythonAttrs (old: { + buildInputs = + (old.buildInputs or []) ++ + (builtins.map (pkg: + if builtins.isString pkg then + builtins.getAttr pkg super + else + pkg) build-requirements); + }) + ) pypkgs-build-requirements + ); + in + { + packages = { + tmpl-app = mkPoetryApplication { + projectDir = ./.; + overrides = poetry2nix-overrides; + }; + default = self.packages.${system}.tmpl-app; + }; + + devShells.default = pkgs.mkShell { + inputsFrom = [ self.packages.${system}.tmpl-app ]; + packages = [ + pkgs.poetry + ] ++ (with pkgs.python311Packages; [ + black + debugpy + mccabe + pycodestyle + pyflakes + python-lsp-black + python-lsp-server + ]); + }; + }); +} diff --git a/main.py b/main.py new file mode 100644 index 0000000..a77c3f1 --- /dev/null +++ b/main.py @@ -0,0 +1,99 @@ +import aiohttp +import asyncio +import os +import os.path +import random + +from bs4 import BeautifulSoup + + +# References to paths we use to save any scraped content. 
+DATA_COACH_LINKS = "data/coach_links.txt"
+DATA_COACH_DIR = "data/coach/{}/{}"
+
+# Prefix that `main` strips from a scraped profile link to obtain the member
+# name used in the per-member directory and callback URLs.
+MEMBER_URL_PREFIX = "https://www.chess.com/member/"
+
+
+async def scrape_coach_links(page_no):
+    """Scrape a single coach page listing.
+
+    @param page_no: Page number substituted into the `page` query parameter.
+    @return: A list with the `href` of every `members-categories-username`
+    anchor on the page, or None if the response status was not 200.
+    """
+    href = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
+    async with aiohttp.ClientSession() as session:
+        async with session.get(href) as response:
+            if response.status != 200:
+                print(f"Encountered {response.status} when retrieving {href}")
+                return None
+            html = await response.text()
+
+    soup = BeautifulSoup(html, "html.parser")
+    members = soup.find_all("a", class_="members-categories-username")
+    # Drop anchors without an `href`; they would otherwise end up in the links
+    # file as the literal string "None".
+    return [m.get("href") for m in members if m.get("href")]
+
+
+async def scrape_all_coach_links(max_pages=62):
+    """Scans through chess.com/coaches for all member links.
+
+    If `DATA_COACH_LINKS` already exists its contents are returned and no
+    requests are made. Otherwise pages 1 through `max_pages` are scraped and
+    every link found is appended to that file.
+
+    @param max_pages: Number of the last listing page to request.
+    @return: A list of every link found (or read back from the file), with
+    surrounding whitespace removed.
+    """
+    if os.path.isfile(DATA_COACH_LINKS):
+        with open(DATA_COACH_LINKS, "r", encoding="utf-8") as f:
+            return [line.strip() for line in f if line.strip()]
+
+    os.makedirs(os.path.dirname(DATA_COACH_LINKS) or ".", exist_ok=True)
+
+    all_links = []
+    for i in range(1, max_pages + 1):
+        # Fetch first so the file handle is not held open across the request.
+        links = await scrape_coach_links(i)
+        # `links` is None when the page could not be retrieved; skip that page
+        # rather than abort the whole scan.
+        if links:
+            # Reopen the file for every page so it can be `tail -f`ed while
+            # the scan is still running.
+            with open(DATA_COACH_LINKS, "a", encoding="utf-8") as f:
+                for link in links:
+                    f.write(f"{link}\n")
+            all_links.extend(links)
+        # Pause between requests; nothing follows the final page.
+        if i < max_pages:
+            await asyncio.sleep(random.randint(2, 5))
+    return all_links
+
+
+async def download_member_info(member_name, filename, href):
+    """Download member-specific content.
+
+    @param member_name: Name of the directory under `data/coach` to save into.
+    @param filename: Name of the file to create inside that directory.
+    @param href: URL whose response body is written to the file.
+    @return: True if we downloaded content. False if the results already
+    existed locally or the response status was not 200.
+    """
+    target = DATA_COACH_DIR.format(member_name, filename)
+    if os.path.isfile(target):
+        return False
+
+    async with aiohttp.ClientSession() as session:
+        async with session.get(href) as response:
+            if response.status != 200:
+                print(f"Encountered {response.status} when retrieving {href}")
+                return False
+            content = await response.text()
+
+    # Create the file only once the body is in hand; an empty file left by a
+    # failed request would be mistaken for a finished download on the next run.
+    os.makedirs(os.path.dirname(target), exist_ok=True)
+    with open(target, "w", encoding="utf-8") as f:
+        f.write(content)
+    return True
+
+
+async def main():
+    """Save the profile page, activity, and stats responses of every link."""
+    links = await scrape_all_coach_links()
+    for link in links:
+        href = link.strip()
+        if not href.startswith(MEMBER_URL_PREFIX):
+            # Covers blank lines as well as links of an unexpected shape, for
+            # which no member name can be derived.
+            if href:
+                print(f"Skipping unrecognized link {href}")
+            continue
+        member_name = href[len(MEMBER_URL_PREFIX) :]
+        downloaded = await asyncio.gather(
+            download_member_info(
+                member_name,
+                f"{member_name}.html",
+                href,
+            ),
+            download_member_info(
+                member_name,
+                "activity.json",
+                f"https://www.chess.com/callback/member/activity/{member_name}?page=1",
+            ),
+            download_member_info(
+                member_name,
+                "stats.json",
+                f"https://www.chess.com/callback/member/stats/{member_name}",
+            ),
+        )
+        # Only want to sleep if the files didn't already exist.
+        if any(downloaded):
+            await asyncio.sleep(random.randint(2, 5))
+            print(f"Downloaded {member_name}")
+        else:
+            print(f"Skipping {member_name}")
+
+
+if __name__ == "__main__":
+    os.makedirs("data/coach", exist_ok=True)
+    asyncio.run(main())
diff --git a/poetry.lock b/poetry.lock
new file mode 100644
index 0000000..5dfbaad
--- /dev/null
+++ b/poetry.lock
@@ -0,0 +1,7 @@
+# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
+package = [] + +[metadata] +lock-version = "2.0" +python-versions = "^3.11" +content-hash = "81b2fa642d7f2d1219cf80112ace12d689d053d81be7f7addb98144d56fc0fb2" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..7e8633f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,15 @@ +[tool.poetry] +name = "chesscom-scraper" +version = "0.1.0" +description = "" +authors = ["Joshua Potter <jrpotter2112@gmail.com>"] +readme = "README.md" + +[tool.poetry.dependencies] +python = "^3.11" +beautifulsoup4 = "^4.12.2" +aiohttp = "^3.8.6" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api"