From 22661fd148833cb877eca245eb1a2b1835dd21f7 Mon Sep 17 00:00:00 2001 From: Kate Date: Mon, 5 Jun 2023 07:07:46 +0100 Subject: [PATCH] Crawl one step. Save to database. --- Pipfile | 1 + group_info.py | 103 +++++++++++++++++++++++++++-------------------- main.py | 57 ++++++++++++++++++++++---- models/group.py | 11 +++++ models/player.py | 9 +++++ player_info.py | 4 +- 6 files changed, 132 insertions(+), 53 deletions(-) create mode 100644 models/group.py create mode 100644 models/player.py diff --git a/Pipfile b/Pipfile index 9f60daf..aae7ae8 100644 --- a/Pipfile +++ b/Pipfile @@ -8,6 +8,7 @@ requests = "*" beautifulsoup4 = "*" lxml = "*" ratelimit = "*" +mongoengine = "*" [dev-packages] diff --git a/group_info.py b/group_info.py index 89856e2..228e5f4 100644 --- a/group_info.py +++ b/group_info.py @@ -1,25 +1,34 @@ -import requests +import requests, time from bs4 import BeautifulSoup from ratelimit import limits, sleep_and_retry # Set up rate limiter, one request per second -CALLS = 1 -RATE_LIMIT = 1 +CALLS = 5 +RATE_LIMIT = 60 -@sleep_and_retry -@limits(calls=CALLS, period=RATE_LIMIT) +def make_request(url): + try: + response = requests.get(url) + if response.status_code == 429: + print("HTTP 429 Too Many Requests received. Pausing for 30 seconds.") + time.sleep(30) + return make_request(url) + response.raise_for_status() + return response + except requests.HTTPError as http_err: + print(f"HTTP error occurred: {http_err}") + return None + except Exception as err: + print(f"Other error occurred: {err}") + return None + def get_group_details(group_url_name): # Regular group page URL group_url = f"https://steamcommunity.com/groups/{group_url_name}" - try: - group_page_response = requests.get(group_url) - group_page_response.raise_for_status() - except requests.HTTPError as http_err: - print(f"HTTP error occurred: {http_err}") - return - except Exception as err: - print(f"Other error occurred: {err}") + group_page_response = make_request(group_url) + + if not group_page_response: return group_page_soup = BeautifulSoup(group_page_response.text, "lxml") @@ -28,44 +37,50 @@ def get_group_details(group_url_name): tag_span = group_page_soup.find("span", {"class": "grouppage_header_abbrev"}) tag = tag_span.text.strip() if tag_span else "No tag" - # Group details XML page URL - group_details_url = ( - f"https://steamcommunity.com/groups/{group_url_name}/memberslistxml/?xml=1" - ) + # Initialize an empty list to store all members + all_members = [] - try: - group_details_response = requests.get(group_details_url) - group_details_response.raise_for_status() - except requests.HTTPError as http_err: - print(f"HTTP error occurred: {http_err}") - return - except Exception as err: - print(f"Other error occurred: {err}") - return + # Start with the first page + next_page_url = f"https://steamcommunity.com/groups/{group_url_name}/memberslistxml/?xml=1" - try: - group_details_soup = BeautifulSoup(group_details_response.content, "lxml-xml") + while next_page_url: + # Group details XML page URL + group_details_url = next_page_url - # Group Name - group_name = group_details_soup.find("groupName").text + group_details_response = make_request(group_details_url) - # Group ID64 - group_id64 = group_details_soup.find("groupID64").text + if not group_details_response: + return - # Member List - members = [member.text for member in group_details_soup.find_all("steamID64")] + try: + print(f"[*] Getting page {next_page_url}...") + group_details_soup = BeautifulSoup(group_details_response.content, "lxml-xml") - return { - "group_id64": group_id64, - "group_name": group_name, - "group_url": group_url, - "tag": tag, - "members": members, - } + # Group Name + group_name = group_details_soup.find("groupName").text - except Exception as err: - print(f"Error occurred during parsing of group details XML page: {err}") + # Group ID64 + group_id64 = group_details_soup.find("groupID64").text + + # Member List + members = [member.text for member in group_details_soup.find_all("steamID64")] + all_members.extend(members) + + # Get the URL for the next page, if there is one + next_page_link = group_details_soup.find('nextPageLink') + next_page_url = next_page_link.text if next_page_link else None + + except Exception as err: + print(f"Error occurred during parsing of group details XML page: {err}") + + return { + "id64": group_id64, + "name": group_name, + "url": group_url, + "tag": tag, + "members": all_members, + } if __name__ == "__main__": # Replace 'ilovebloop' with the desired group URL name - print(get_group_details("CheeseFraud")) + print(get_group_details("steamworks")) \ No newline at end of file diff --git a/main.py b/main.py index 8fe5d72..1bea2fe 100644 --- a/main.py +++ b/main.py @@ -1,10 +1,9 @@ -from ratelimit import limits, sleep_and_retry +from models.group import Group +from models.player import Player import group_info, player_info +import datetime -# Set up rate limiter, one request per second -CALLS = 1 -RATE_LIMIT = 1 - +# Set the starting group starting_group = "ilovebloop" # Get the members of the starting group @@ -12,13 +11,57 @@ print("[*] Getting members of starting group...") starting_group_details = group_info.get_group_details(starting_group) starting_group_members = starting_group_details["members"] +# Check if the starting group already exists in the database +if not Group.objects(id64=starting_group_details["id64"]).first(): + # If not, create it + Group( + id64=starting_group_details["id64"], + name=starting_group_details["name"], + tag=starting_group_details["tag"], + members=starting_group_members, + link=starting_group, + last_updated=datetime.datetime.now() + ).save() + # Get the groups of the starting group members groups_of_starting_group_members = [] for member in starting_group_members: print(f"[*] Getting groups for member {member}...") - groups_of_starting_group_members.extend(player_info.get_group_links(member)) + member_group_links = player_info.get_group_links(member) + + # Check if the member already exists in the database + if not Player.objects(id64=member).first(): + # If not, create it + Player( + id64=member, + groups=member_group_links, + link=f"https://steamcommunity.com/profiles/{member}", + last_updated=datetime.datetime.now() + ).save() + + groups_of_starting_group_members.extend(member_group_links) # Remove duplicates groups_of_starting_group_members = list(set(groups_of_starting_group_members)) -print(groups_of_starting_group_members) \ No newline at end of file +print(groups_of_starting_group_members) + +# Update or create each group in the database +for group_link in groups_of_starting_group_members: + print(f"[*] Getting group details for group {group_link}...") + + # Check if the group already exists in the database + if not Group.objects(link=group_link).first(): + # If not, create it + group_details = group_info.get_group_details(group_link) + Group( + id64=group_details["id64"], + name=group_details["name"], + tag=group_details["tag"], + members=group_details["members"], + link=group_link, + last_updated=datetime.datetime.now() + ).save() + print(f"[*] Got group details for group {group_details['name']}") + else: + print(f"[*] Group {group_link} already exists in the database. Skipping...") diff --git a/models/group.py b/models/group.py new file mode 100644 index 0000000..9568868 --- /dev/null +++ b/models/group.py @@ -0,0 +1,11 @@ +from mongoengine import connect, Document, StringField, ListField, DateTimeField + +connect(db='steam-group-crawler', host='localhost', port=27017) + +class Group(Document): + id64 = StringField(required=True, unique=True) + name = StringField(required=True) + tag = StringField(required=True) + members = ListField(StringField()) + last_updated = DateTimeField(required=True) + link = StringField(required=True) \ No newline at end of file diff --git a/models/player.py b/models/player.py new file mode 100644 index 0000000..a6c698a --- /dev/null +++ b/models/player.py @@ -0,0 +1,9 @@ +from mongoengine import connect, Document, StringField, ListField, DateTimeField + +connect(db='steam-group-crawler', host='localhost', port=27017) + +class Player(Document): + id64 = StringField(required=True, unique=True) + groups = ListField(StringField()) + last_updated = DateTimeField(required=True) + link = StringField(required=True) \ No newline at end of file diff --git a/player_info.py b/player_info.py index b0e8a76..44c7794 100644 --- a/player_info.py +++ b/player_info.py @@ -3,8 +3,8 @@ from bs4 import BeautifulSoup from ratelimit import limits, sleep_and_retry # Set up rate limiter, one request per second -CALLS = 1 -RATE_LIMIT = 1 +CALLS = 5 +RATE_LIMIT = 60 @sleep_and_retry @limits(calls=CALLS, period=RATE_LIMIT)