Have requests actually use the `chesscom_request` function.
parent
97cdb3b9cd
commit
7308c47eb5
61
main.py
61
main.py
|
@ -12,34 +12,33 @@ DATA_COACH_LIST = "data/pages/{page_no}.txt"
|
|||
DATA_COACH_DIR = "data/coach/{member_name}"
|
||||
DATA_COACH_FILE = "data/coach/{member_name}/{filename}"
|
||||
|
||||
USER_AGENT = "BoardWise (https://github.com/BoardWiseGG/chesscom-scraper)"
|
||||
|
||||
async def chesscom_requeset(url):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
|
||||
async def chesscom_request(url):
|
||||
body = None
|
||||
async with aiohttp.ClientSession(headers={"User-Agent": USER_AGENT}) as session:
|
||||
async with session.get(url) as response:
|
||||
if response.status != 200:
|
||||
print(f"Encountered {response.status} when retrieving {url}.")
|
||||
return response.text()
|
||||
pass
|
||||
else:
|
||||
body = await response.text()
|
||||
|
||||
return body
|
||||
|
||||
|
||||
async def scrape_coach_links(page_no):
|
||||
"""Scrape a single coach page listing."""
|
||||
links = []
|
||||
url = f"https://www.chess.com/coaches?sortBy=alphabetical&page={page_no}"
|
||||
async with aiohttp.ClientSession(
|
||||
headers={
|
||||
"User-Agent": "BoardWise (https://github.com/BoardWiseGG/chesscom-scraper)",
|
||||
}
|
||||
) as session:
|
||||
async with session.get(url) as response:
|
||||
if response.status != 200:
|
||||
print(f"Encountered {response.status} when retrieving {url}.")
|
||||
return
|
||||
html = await response.text()
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
members = soup.find_all("a", class_="members-categories-username")
|
||||
for member in members:
|
||||
links.append(member.get("href"))
|
||||
response = await chesscom_request(url)
|
||||
if response is None:
|
||||
return
|
||||
|
||||
links = []
|
||||
soup = BeautifulSoup(response, "html.parser")
|
||||
members = soup.find_all("a", class_="members-categories-username")
|
||||
for member in members:
|
||||
links.append(member.get("href"))
|
||||
|
||||
return links
|
||||
|
||||
|
@ -59,7 +58,7 @@ async def scrape_all_coach_links(max_pages=64):
|
|||
for link in links:
|
||||
f.write(f"{link}\n")
|
||||
print(f"Downloaded page {i} of coach list.")
|
||||
await asyncio.sleep(random.randint(3, 7))
|
||||
await asyncio.sleep(random.randint(10, 15))
|
||||
|
||||
return links
|
||||
|
||||
|
@ -67,19 +66,17 @@ async def scrape_all_coach_links(max_pages=64):
|
|||
async def download_member_info(member_name, filename, url):
|
||||
"""Download member-specific content.
|
||||
|
||||
@return: True if we downloaded content. False if the download already
|
||||
exists locally.
|
||||
@return: True if we make a network request. False otherwise.
|
||||
"""
|
||||
filepath = DATA_COACH_FILE.format(member_name=member_name, filename=filename)
|
||||
if os.path.isfile(filepath):
|
||||
return False
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(url) as response:
|
||||
if response.status != 200:
|
||||
print(f"Encountered {response.status} when retrieving {url}")
|
||||
return
|
||||
with open(filepath, "w") as f:
|
||||
f.write(await response.text())
|
||||
|
||||
response = await chesscom_request(url)
|
||||
if response is not None:
|
||||
with open(filepath, "w") as f:
|
||||
f.write(response)
|
||||
|
||||
return True
|
||||
|
||||
|
||||
|
@ -88,7 +85,7 @@ async def main():
|
|||
for url in [link.strip() for link in links]:
|
||||
member_name = url[len("https://www.chess.com/member/") :]
|
||||
os.makedirs(DATA_COACH_DIR.format(member_name=member_name), exist_ok=True)
|
||||
downloaded = await asyncio.gather(
|
||||
made_network_request = await asyncio.gather(
|
||||
download_member_info(
|
||||
member_name,
|
||||
f"{member_name}.html",
|
||||
|
@ -105,8 +102,8 @@ async def main():
|
|||
f"https://www.chess.com/callback/member/stats/{member_name}",
|
||||
),
|
||||
)
|
||||
if any(downloaded):
|
||||
await asyncio.sleep(random.randint(3, 7))
|
||||
if any(made_network_request):
|
||||
await asyncio.sleep(random.randint(10, 15))
|
||||
print(f"Downloaded {member_name} info.")
|
||||
else:
|
||||
print(f"Skipping {member_name} download.")
|
||||
|
|
Loading…
Reference in New Issue