From 7308c47eb5b43df333525c544e9569b3a534cef8 Mon Sep 17 00:00:00 2001
From: Joshua Potter
Date: Mon, 27 Nov 2023 14:28:10 -0700
Subject: [PATCH] Have requests actually use the `chesscom_request` function.

---
 main.py | 61 +++++++++++++++++++++++++++------------------------------
 1 file changed, 29 insertions(+), 32 deletions(-)

diff --git a/main.py b/main.py
index 18d50dd..92c29e7 100644
--- a/main.py
+++ b/main.py
@@ -12,34 +12,33 @@
 DATA_COACH_LIST = "data/pages/{page_no}.txt"
 DATA_COACH_DIR = "data/coach/{member_name}"
 DATA_COACH_FILE = "data/coach/{member_name}/{filename}"
+USER_AGENT = "BoardWise (https://github.com/BoardWiseGG/chesscom-scraper)"
 
-async def chesscom_requeset(url):
-    async with aiohttp.ClientSession() as session:
+
+async def chesscom_request(url):
+    body = None
+    async with aiohttp.ClientSession(headers={"User-Agent": USER_AGENT}) as session:
         async with session.get(url) as response:
             if response.status != 200:
                 print(f"Encountered {response.status} when retrieving {url}.")
-            return response.text()
-            pass
+            else:
+                body = await response.text()
+
+    return body
 
 
 async def scrape_coach_links(page_no):
     """Scrape a single coach page listing."""
-    links = []
     url = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
-    async with aiohttp.ClientSession(
-        headers={
-            "User-Agent": "BoardWise (https://github.com/BoardWiseGG/chesscom-scraper)",
-        }
-    ) as session:
-        async with session.get(url) as response:
-            if response.status != 200:
-                print(f"Encountered {response.status} when retrieving {url}.")
-                return
-            html = await response.text()
-            soup = BeautifulSoup(html, "html.parser")
-            members = soup.find_all("a", class_="members-categories-username")
-            for member in members:
-                links.append(member.get("href"))
+    response = await chesscom_request(url)
+    if response is None:
+        return
+
+    links = []
+    soup = BeautifulSoup(response, "html.parser")
+    members = soup.find_all("a", class_="members-categories-username")
+    for member in members:
+        links.append(member.get("href"))
 
     return links
 
@@ -59,7 +58,7 @@ async def scrape_all_coach_links(max_pages=64):
             for link in links:
                 f.write(f"{link}\n")
         print(f"Downloaded page {i} of coach list.")
-        await asyncio.sleep(random.randint(3, 7))
+        await asyncio.sleep(random.randint(10, 15))
 
     return links
 
@@ -67,19 +66,17 @@ async def scrape_all_coach_links(max_pages=64):
 async def download_member_info(member_name, filename, url):
     """Download member-specific content.
 
-    @return: True if we downloaded content. False if the download already
-    exists locally.
+    @return: True if we make a network request. False otherwise.
     """
     filepath = DATA_COACH_FILE.format(member_name=member_name, filename=filename)
     if os.path.isfile(filepath):
         return False
-    async with aiohttp.ClientSession() as session:
-        async with session.get(url) as response:
-            if response.status != 200:
-                print(f"Encountered {response.status} when retrieving {url}")
-                return
-            with open(filepath, "w") as f:
-                f.write(await response.text())
+
+    response = await chesscom_request(url)
+    if response is not None:
+        with open(filepath, "w") as f:
+            f.write(response)
+
     return True
 
 
@@ -88,7 +85,7 @@ async def main():
     for url in [link.strip() for link in links]:
         member_name = url[len("https://www.chess.com/member/") :]
         os.makedirs(DATA_COACH_DIR.format(member_name=member_name), exist_ok=True)
-        downloaded = await asyncio.gather(
+        made_network_request = await asyncio.gather(
             download_member_info(
                 member_name,
                 f"{member_name}.html",
@@ -105,8 +102,8 @@ async def main():
                 f"https://www.chess.com/callback/member/stats/{member_name}",
             ),
         )
-        if any(downloaded):
-            await asyncio.sleep(random.randint(3, 7))
+        if any(made_network_request):
+            await asyncio.sleep(random.randint(10, 15))
             print(f"Downloaded {member_name} info.")
         else:
             print(f"Skipping {member_name} download.")