Crawl one step. Save to database.

This commit is contained in:
Kate 2023-06-05 07:07:46 +01:00
parent e8ef1fe6c4
commit 22661fd148
6 changed files with 132 additions and 53 deletions

View file

@ -8,6 +8,7 @@ requests = "*"
beautifulsoup4 = "*" beautifulsoup4 = "*"
lxml = "*" lxml = "*"
ratelimit = "*" ratelimit = "*"
mongoengine = "*"
[dev-packages] [dev-packages]

View file

@ -1,25 +1,34 @@
import requests import requests, time
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from ratelimit import limits, sleep_and_retry from ratelimit import limits, sleep_and_retry
# Set up rate limiter: at most CALLS requests per RATE_LIMIT seconds
CALLS = 5
RATE_LIMIT = 60


@sleep_and_retry
@limits(calls=CALLS, period=RATE_LIMIT)
def make_request(url):
    """GET *url* with rate limiting and basic 429 back-off.

    Returns the successful ``requests.Response``, or ``None`` when the
    request fails with an HTTP error or any other exception.

    BUG FIX: the CALLS/RATE_LIMIT constants and the ``ratelimit`` import
    were kept but the decorators had been dropped, so no rate limiting
    was actually applied. Reinstated ``@sleep_and_retry`` +
    ``@limits(...)`` so the configured limit takes effect.
    """
    try:
        response = requests.get(url)
        if response.status_code == 429:
            # Steam is throttling us: wait, then retry (the recursive call
            # goes through the decorated function, so it is rate-limited too).
            print("HTTP 429 Too Many Requests received. Pausing for 30 seconds.")
            time.sleep(30)
            return make_request(url)
        response.raise_for_status()
        return response
    except requests.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        return None
    except Exception as err:
        print(f"Other error occurred: {err}")
        return None
def get_group_details(group_url_name):
    """Scrape a Steam group's details and its full (paginated) member list.

    Parameters
    ----------
    group_url_name : str
        The group's URL name, e.g. ``"ilovebloop"``.

    Returns
    -------
    dict | None
        ``{"id64", "name", "url", "tag", "members"}`` on success, or
        ``None`` when any request fails or no XML page could be parsed.
    """
    # Regular group page URL (HTML; used only for the header tag)
    group_url = f"https://steamcommunity.com/groups/{group_url_name}"

    group_page_response = make_request(group_url)
    if not group_page_response:
        return None

    group_page_soup = BeautifulSoup(group_page_response.text, "lxml")

    # Group tag (header abbreviation); absent for some groups
    tag_span = group_page_soup.find("span", {"class": "grouppage_header_abbrev"})
    tag = tag_span.text.strip() if tag_span else "No tag"

    # Accumulate members across all pages of the members-list XML
    all_members = []
    # BUG FIX: initialise so a parse failure on the very first page
    # returns None instead of raising NameError at the final return.
    group_name = None
    group_id64 = None

    # Start with the first page
    next_page_url = f"https://steamcommunity.com/groups/{group_url_name}/memberslistxml/?xml=1"

    while next_page_url:
        group_details_response = make_request(next_page_url)
        if not group_details_response:
            return None

        try:
            print(f"[*] Getting page {next_page_url}...")
            group_details_soup = BeautifulSoup(group_details_response.content, "lxml-xml")

            # Group Name
            group_name = group_details_soup.find("groupName").text
            # Group ID64
            group_id64 = group_details_soup.find("groupID64").text

            # Member List
            members = [member.text for member in group_details_soup.find_all("steamID64")]
            all_members.extend(members)

            # Get the URL for the next page, if there is one
            next_page_link = group_details_soup.find('nextPageLink')
            next_page_url = next_page_link.text if next_page_link else None
        except Exception as err:
            print(f"Error occurred during parsing of group details XML page: {err}")
            # BUG FIX: previously next_page_url was left unchanged here,
            # so the loop re-fetched the same page forever. Stop instead.
            break

    if group_id64 is None:
        # No XML page was parsed successfully
        return None

    return {
        "id64": group_id64,
        "name": group_name,
        "url": group_url,
        "tag": tag,
        "members": all_members,
    }
if __name__ == "__main__": if __name__ == "__main__":
# Replace 'ilovebloop' with the desired group URL name # Replace 'ilovebloop' with the desired group URL name
print(get_group_details("CheeseFraud")) print(get_group_details("steamworks"))

57
main.py
View file

@ -1,10 +1,9 @@
from models.group import Group
from models.player import Player
import group_info, player_info
import datetime

# Set the starting group
starting_group = "ilovebloop"

# Get the members of the starting group
print("[*] Getting members of starting group...")
starting_group_details = group_info.get_group_details(starting_group)
if not starting_group_details:
    # BUG FIX: get_group_details returns None on a failed request/parse;
    # subscripting it crashed the crawl with a TypeError.
    raise SystemExit(f"[!] Could not fetch details for starting group {starting_group!r}")
starting_group_members = starting_group_details["members"]

# Check if the starting group already exists in the database
if not Group.objects(id64=starting_group_details["id64"]).first():
    # If not, create it
    Group(
        id64=starting_group_details["id64"],
        name=starting_group_details["name"],
        tag=starting_group_details["tag"],
        members=starting_group_members,
        link=starting_group,
        last_updated=datetime.datetime.now()
    ).save()

# Get the groups of the starting group members
groups_of_starting_group_members = []
for member in starting_group_members:
    print(f"[*] Getting groups for member {member}...")
    member_group_links = player_info.get_group_links(member)
    # Check if the member already exists in the database
    if not Player.objects(id64=member).first():
        # If not, create it
        Player(
            id64=member,
            groups=member_group_links,
            link=f"https://steamcommunity.com/profiles/{member}",
            last_updated=datetime.datetime.now()
        ).save()
    groups_of_starting_group_members.extend(member_group_links)

# Remove duplicates
groups_of_starting_group_members = list(set(groups_of_starting_group_members))
print(groups_of_starting_group_members)

# Update or create each group in the database
for group_link in groups_of_starting_group_members:
    print(f"[*] Getting group details for group {group_link}...")
    # Guard clause: skip groups we already have
    if Group.objects(link=group_link).first():
        print(f"[*] Group {group_link} already exists in the database. Skipping...")
        continue
    group_details = group_info.get_group_details(group_link)
    if not group_details:
        # BUG FIX: a single failed fetch previously returned None and the
        # subscript below aborted the whole crawl. Skip and carry on.
        print(f"[!] Could not fetch details for group {group_link}. Skipping...")
        continue
    Group(
        id64=group_details["id64"],
        name=group_details["name"],
        tag=group_details["tag"],
        members=group_details["members"],
        link=group_link,
        last_updated=datetime.datetime.now()
    ).save()
    print(f"[*] Got group details for group {group_details['name']}")

11
models/group.py Normal file
View file

@ -0,0 +1,11 @@
from mongoengine import connect, Document, StringField, ListField, DateTimeField

# Module-import side effect: opens the MongoDB connection this model uses.
# NOTE(review): models/player.py issues the same connect() call; consider
# connecting once in a shared module instead — confirm before changing.
connect(db='steam-group-crawler', host='localhost', port=27017)


class Group(Document):
    """A crawled Steam community group, persisted to MongoDB."""

    # Steam 64-bit group ID; unique key
    id64 = StringField(required=True, unique=True)
    # Display name from the members-list XML (<groupName>)
    name = StringField(required=True)
    # Header abbreviation from the group page; "No tag" when absent
    tag = StringField(required=True)
    # steamID64 strings of every member, across all member-list pages
    members = ListField(StringField())
    # When this document was last crawled/saved
    last_updated = DateTimeField(required=True)
    # Group link / URL name as supplied by the crawler — presumably the
    # steamcommunity.com URL name; verify against player_info.get_group_links
    link = StringField(required=True)

9
models/player.py Normal file
View file

@ -0,0 +1,9 @@
from mongoengine import connect, Document, StringField, ListField, DateTimeField

# Module-import side effect: opens the MongoDB connection this model uses.
connect(db='steam-group-crawler', host='localhost', port=27017)


class Player(Document):
    """A crawled Steam player profile, persisted to MongoDB."""

    # Steam 64-bit account ID; unique key
    id64 = StringField(required=True, unique=True)
    # Group links this player belongs to (from player_info.get_group_links)
    groups = ListField(StringField())
    # When this document was last crawled/saved
    last_updated = DateTimeField(required=True)
    # Profile URL: https://steamcommunity.com/profiles/<id64>
    link = StringField(required=True)

View file

@ -3,8 +3,8 @@ from bs4 import BeautifulSoup
from ratelimit import limits, sleep_and_retry from ratelimit import limits, sleep_and_retry
# Set up rate limiter, one request per second # Set up rate limiter, one request per second
CALLS = 1 CALLS = 5
RATE_LIMIT = 1 RATE_LIMIT = 60
@sleep_and_retry @sleep_and_retry
@limits(calls=CALLS, period=RATE_LIMIT) @limits(calls=CALLS, period=RATE_LIMIT)