From bcd06fb05d364f609e6021c35b999b7e25eb8433 Mon Sep 17 00:00:00 2001
From: Kate
Date: Wed, 7 Jun 2023 04:47:20 +0100
Subject: [PATCH] stop after 500 pages of player IDs

- this only applies to the 14 biggest groups
- this is needed to stay under the 16MB document limit in MongoDB
---
 .gitignore     |  3 ++-
 Pipfile        |  2 +-
 group_info.py  | 39 ++++++++++++++++++++++++++++-----------
 player_info.py | 24 +++++++++++++-----------
 4 files changed, 44 insertions(+), 24 deletions(-)

diff --git a/.gitignore b/.gitignore
index 1504fcf..f4b2742 100644
--- a/.gitignore
+++ b/.gitignore
@@ -165,4 +165,5 @@
 config.py
 #i dont care if this is not recommended
 Pipfile.lock
-.DS_Store
\ No newline at end of file
+.DS_Store
+.vscode
\ No newline at end of file
diff --git a/Pipfile b/Pipfile
index aae7ae8..6a4b2f9 100644
--- a/Pipfile
+++ b/Pipfile
@@ -13,4 +13,4 @@ mongoengine = "*"
 [dev-packages]

 [requires]
-python_version = "3.11"
+python_version = "3.9"
diff --git a/group_info.py b/group_info.py
index 228e5f4..3f8c41a 100644
--- a/group_info.py
+++ b/group_info.py
@@ -3,9 +3,12 @@
 from bs4 import BeautifulSoup
 from ratelimit import limits, sleep_and_retry

-# Set up rate limiter, one request per second
-CALLS = 5
-RATE_LIMIT = 60
+# Set up rate limiter: one request every ten seconds
+CALLS = 1
+RATE_LIMIT = 10
+
+@sleep_and_retry
+@limits(calls=CALLS, period=RATE_LIMIT)
 def make_request(url):
     try:
         response = requests.get(url)
@@ -21,7 +24,8 @@
     except Exception as err:
         print(f"Other error occurred: {err}")
         return None
-
+
+
 def get_group_details(group_url_name):
     # Regular group page URL
     group_url = f"https://steamcommunity.com/groups/{group_url_name}"
@@ -41,9 +45,14 @@
     all_members = []

     # Start with the first page
-    next_page_url = f"https://steamcommunity.com/groups/{group_url_name}/memberslistxml/?xml=1"
+    next_page_url = (
+        f"https://steamcommunity.com/groups/{group_url_name}/memberslistxml/?xml=1"
+    )

-    while next_page_url:
+    # Initialize a counter for pages
+    page_counter = 0
+
+    while next_page_url and page_counter < 500:
         # Group details XML page URL
         group_details_url = next_page_url

@@ -54,7 +63,9 @@
         try:
             print(f"[*] Getting page {next_page_url}...")
-            group_details_soup = BeautifulSoup(group_details_response.content, "lxml-xml")
+            group_details_soup = BeautifulSoup(
+                group_details_response.content, "lxml-xml"
+            )

             # Group Name
             group_name = group_details_soup.find("groupName").text

@@ -63,13 +74,18 @@
             group_id64 = group_details_soup.find("groupID64").text

             # Member List
-            members = [member.text for member in group_details_soup.find_all("steamID64")]
+            members = [
+                member.text for member in group_details_soup.find_all("steamID64")
+            ]
             all_members.extend(members)

             # Get the URL for the next page, if there is one
-            next_page_link = group_details_soup.find('nextPageLink')
+            next_page_link = group_details_soup.find("nextPageLink")
             next_page_url = next_page_link.text if next_page_link else None

+            # Increment page counter
+            page_counter += 1
+
         except Exception as err:
             print(f"Error occurred during parsing of group details XML page: {err}")

@@ -81,6 +97,7 @@
         "members": all_members,
     }

+
 if __name__ == "__main__":
-    # Replace 'ilovebloop' with the desired group URL name
-    print(get_group_details("steamworks"))
\ No newline at end of file
+    # Replace 'steamworks' with the desired group URL name
+    print(get_group_details("steamworks"))
diff --git a/player_info.py b/player_info.py
index 44c7794..210c526 100644
--- a/player_info.py
+++ b/player_info.py
@@ -3,26 +3,28 @@
 from bs4 import BeautifulSoup
 from ratelimit import limits, sleep_and_retry

-# Set up rate limiter, one request per second
-CALLS = 5
-RATE_LIMIT = 60
+# Set up rate limiter: one request every ten seconds
+CALLS = 1
+RATE_LIMIT = 10
+
 @sleep_and_retry
 @limits(calls=CALLS, period=RATE_LIMIT)
 def get_group_links(user_id):
-    url = f'https://steamcommunity.com/profiles/{user_id}/groups/'
+    url = f"https://steamcommunity.com/profiles/{user_id}/groups/"
     response = requests.get(url)
-    soup = BeautifulSoup(response.text, 'lxml')
+    soup = BeautifulSoup(response.text, "lxml")

-    group_blocks = soup.find_all('div', class_='group_block')
+    group_blocks = soup.find_all("div", class_="group_block")
     group_links = []

     for block in group_blocks:
-        link_element = block.find('a', class_='linkTitle')
+        link_element = block.find("a", class_="linkTitle")
         if link_element:
-            group_links.append(link_element['href'].split('/')[-1])
-
+            group_links.append(link_element["href"].split("/")[-1])
+
     return group_links

+
 if __name__ == "__main__":
-    group_ids = get_group_links('76561198084483014')
-    print(group_ids)
\ No newline at end of file
+    group_ids = get_group_links("76561198084483014")
+    print(group_ids)
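
Why the cap is 500 pages: each memberslistxml page carries on the order of
1,000 SteamID64s, so 500 pages is roughly 500,000 17-digit ID strings stored
in a single group document. Below is a minimal back-of-envelope sketch of the
resulting BSON size, not part of the patch itself: the ~1,000-per-page figure
and the field names are illustrative assumptions, and the bson package comes
from pymongo, which mongoengine already pulls in.

    # Back-of-envelope check of the 500-page cap against MongoDB's 16MB
    # BSON document limit. Assumes ~1000 SteamID64s per XML page; the
    # field names below are placeholders, not the script's actual schema.
    import bson  # provided by pymongo, a mongoengine dependency

    PAGES = 500
    IDS_PER_PAGE = 1000  # assumed memberslistxml page size

    doc = {
        "name": "example group",
        "id64": "103582791429521412",  # illustrative group ID64
        "members": [str(76561198000000000 + i) for i in range(PAGES * IDS_PER_PAGE)],
    }

    size_mb = len(bson.encode(doc)) / (1024 * 1024)
    print(f"{size_mb:.1f} MB")  # roughly 14 MB, under the 16 MB cap

Each array element costs about 30 bytes (type tag, stringified index key, and
length-prefixed value), so 500,000 IDs land around 14 MB, which leaves some
headroom under the 16 MB ceiling; a higher page cap would not.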