Crawl one step. Save to database.
This commit is contained in:
parent
e8ef1fe6c4
commit
22661fd148
6 changed files with 132 additions and 53 deletions
1
Pipfile
1
Pipfile
|
@ -8,6 +8,7 @@ requests = "*"
|
||||||
beautifulsoup4 = "*"
|
beautifulsoup4 = "*"
|
||||||
lxml = "*"
|
lxml = "*"
|
||||||
ratelimit = "*"
|
ratelimit = "*"
|
||||||
|
mongoengine = "*"
|
||||||
|
|
||||||
[dev-packages]
|
[dev-packages]
|
||||||
|
|
||||||
|
|
103
group_info.py
103
group_info.py
|
@ -1,25 +1,34 @@
|
||||||
import requests
|
import requests, time
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from ratelimit import limits, sleep_and_retry
|
from ratelimit import limits, sleep_and_retry
|
||||||
|
|
||||||
# Set up rate limiter, one request per second
|
# Set up rate limiter: at most 5 requests per 60 seconds
|
||||||
CALLS = 1
|
CALLS = 5
|
||||||
RATE_LIMIT = 1
|
RATE_LIMIT = 60
|
||||||
|
|
||||||
@sleep_and_retry
|
def make_request(url):
|
||||||
@limits(calls=CALLS, period=RATE_LIMIT)
|
try:
|
||||||
|
response = requests.get(url)
|
||||||
|
if response.status_code == 429:
|
||||||
|
print("HTTP 429 Too Many Requests received. Pausing for 30 seconds.")
|
||||||
|
time.sleep(30)
|
||||||
|
return make_request(url)
|
||||||
|
response.raise_for_status()
|
||||||
|
return response
|
||||||
|
except requests.HTTPError as http_err:
|
||||||
|
print(f"HTTP error occurred: {http_err}")
|
||||||
|
return None
|
||||||
|
except Exception as err:
|
||||||
|
print(f"Other error occurred: {err}")
|
||||||
|
return None
|
||||||
|
|
||||||
def get_group_details(group_url_name):
|
def get_group_details(group_url_name):
|
||||||
# Regular group page URL
|
# Regular group page URL
|
||||||
group_url = f"https://steamcommunity.com/groups/{group_url_name}"
|
group_url = f"https://steamcommunity.com/groups/{group_url_name}"
|
||||||
|
|
||||||
try:
|
group_page_response = make_request(group_url)
|
||||||
group_page_response = requests.get(group_url)
|
|
||||||
group_page_response.raise_for_status()
|
if not group_page_response:
|
||||||
except requests.HTTPError as http_err:
|
|
||||||
print(f"HTTP error occurred: {http_err}")
|
|
||||||
return
|
|
||||||
except Exception as err:
|
|
||||||
print(f"Other error occurred: {err}")
|
|
||||||
return
|
return
|
||||||
|
|
||||||
group_page_soup = BeautifulSoup(group_page_response.text, "lxml")
|
group_page_soup = BeautifulSoup(group_page_response.text, "lxml")
|
||||||
|
@ -28,44 +37,50 @@ def get_group_details(group_url_name):
|
||||||
tag_span = group_page_soup.find("span", {"class": "grouppage_header_abbrev"})
|
tag_span = group_page_soup.find("span", {"class": "grouppage_header_abbrev"})
|
||||||
tag = tag_span.text.strip() if tag_span else "No tag"
|
tag = tag_span.text.strip() if tag_span else "No tag"
|
||||||
|
|
||||||
# Group details XML page URL
|
# Initialize an empty list to store all members
|
||||||
group_details_url = (
|
all_members = []
|
||||||
f"https://steamcommunity.com/groups/{group_url_name}/memberslistxml/?xml=1"
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
# Start with the first page
|
||||||
group_details_response = requests.get(group_details_url)
|
next_page_url = f"https://steamcommunity.com/groups/{group_url_name}/memberslistxml/?xml=1"
|
||||||
group_details_response.raise_for_status()
|
|
||||||
except requests.HTTPError as http_err:
|
|
||||||
print(f"HTTP error occurred: {http_err}")
|
|
||||||
return
|
|
||||||
except Exception as err:
|
|
||||||
print(f"Other error occurred: {err}")
|
|
||||||
return
|
|
||||||
|
|
||||||
try:
|
while next_page_url:
|
||||||
group_details_soup = BeautifulSoup(group_details_response.content, "lxml-xml")
|
# Group details XML page URL
|
||||||
|
group_details_url = next_page_url
|
||||||
|
|
||||||
# Group Name
|
group_details_response = make_request(group_details_url)
|
||||||
group_name = group_details_soup.find("groupName").text
|
|
||||||
|
|
||||||
# Group ID64
|
if not group_details_response:
|
||||||
group_id64 = group_details_soup.find("groupID64").text
|
return
|
||||||
|
|
||||||
# Member List
|
try:
|
||||||
members = [member.text for member in group_details_soup.find_all("steamID64")]
|
print(f"[*] Getting page {next_page_url}...")
|
||||||
|
group_details_soup = BeautifulSoup(group_details_response.content, "lxml-xml")
|
||||||
|
|
||||||
return {
|
# Group Name
|
||||||
"group_id64": group_id64,
|
group_name = group_details_soup.find("groupName").text
|
||||||
"group_name": group_name,
|
|
||||||
"group_url": group_url,
|
|
||||||
"tag": tag,
|
|
||||||
"members": members,
|
|
||||||
}
|
|
||||||
|
|
||||||
except Exception as err:
|
# Group ID64
|
||||||
print(f"Error occurred during parsing of group details XML page: {err}")
|
group_id64 = group_details_soup.find("groupID64").text
|
||||||
|
|
||||||
|
# Member List
|
||||||
|
members = [member.text for member in group_details_soup.find_all("steamID64")]
|
||||||
|
all_members.extend(members)
|
||||||
|
|
||||||
|
# Get the URL for the next page, if there is one
|
||||||
|
next_page_link = group_details_soup.find('nextPageLink')
|
||||||
|
next_page_url = next_page_link.text if next_page_link else None
|
||||||
|
|
||||||
|
except Exception as err:
|
||||||
|
print(f"Error occurred during parsing of group details XML page: {err}")
|
||||||
|
|
||||||
|
return {
|
||||||
|
"id64": group_id64,
|
||||||
|
"name": group_name,
|
||||||
|
"url": group_url,
|
||||||
|
"tag": tag,
|
||||||
|
"members": all_members,
|
||||||
|
}
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# Replace 'ilovebloop' with the desired group URL name
|
# Replace 'steamworks' with the desired group URL name
|
||||||
print(get_group_details("CheeseFraud"))
|
print(get_group_details("steamworks"))
|
57
main.py
57
main.py
|
@ -1,10 +1,9 @@
|
||||||
from ratelimit import limits, sleep_and_retry
|
from models.group import Group
|
||||||
|
from models.player import Player
|
||||||
import group_info, player_info
|
import group_info, player_info
|
||||||
|
import datetime
|
||||||
|
|
||||||
# Set up rate limiter, one request per second
|
# Set the starting group
|
||||||
CALLS = 1
|
|
||||||
RATE_LIMIT = 1
|
|
||||||
|
|
||||||
starting_group = "ilovebloop"
|
starting_group = "ilovebloop"
|
||||||
|
|
||||||
# Get the members of the starting group
|
# Get the members of the starting group
|
||||||
|
@ -12,13 +11,57 @@ print("[*] Getting members of starting group...")
|
||||||
starting_group_details = group_info.get_group_details(starting_group)
|
starting_group_details = group_info.get_group_details(starting_group)
|
||||||
starting_group_members = starting_group_details["members"]
|
starting_group_members = starting_group_details["members"]
|
||||||
|
|
||||||
|
# Check if the starting group already exists in the database
|
||||||
|
if not Group.objects(id64=starting_group_details["id64"]).first():
|
||||||
|
# If not, create it
|
||||||
|
Group(
|
||||||
|
id64=starting_group_details["id64"],
|
||||||
|
name=starting_group_details["name"],
|
||||||
|
tag=starting_group_details["tag"],
|
||||||
|
members=starting_group_members,
|
||||||
|
link=starting_group,
|
||||||
|
last_updated=datetime.datetime.now()
|
||||||
|
).save()
|
||||||
|
|
||||||
# Get the groups of the starting group members
|
# Get the groups of the starting group members
|
||||||
groups_of_starting_group_members = []
|
groups_of_starting_group_members = []
|
||||||
for member in starting_group_members:
|
for member in starting_group_members:
|
||||||
print(f"[*] Getting groups for member {member}...")
|
print(f"[*] Getting groups for member {member}...")
|
||||||
groups_of_starting_group_members.extend(player_info.get_group_links(member))
|
member_group_links = player_info.get_group_links(member)
|
||||||
|
|
||||||
|
# Check if the member already exists in the database
|
||||||
|
if not Player.objects(id64=member).first():
|
||||||
|
# If not, create it
|
||||||
|
Player(
|
||||||
|
id64=member,
|
||||||
|
groups=member_group_links,
|
||||||
|
link=f"https://steamcommunity.com/profiles/{member}",
|
||||||
|
last_updated=datetime.datetime.now()
|
||||||
|
).save()
|
||||||
|
|
||||||
|
groups_of_starting_group_members.extend(member_group_links)
|
||||||
|
|
||||||
# Remove duplicates
|
# Remove duplicates
|
||||||
groups_of_starting_group_members = list(set(groups_of_starting_group_members))
|
groups_of_starting_group_members = list(set(groups_of_starting_group_members))
|
||||||
|
|
||||||
print(groups_of_starting_group_members)
|
print(groups_of_starting_group_members)
|
||||||
|
|
||||||
|
# Update or create each group in the database
|
||||||
|
for group_link in groups_of_starting_group_members:
|
||||||
|
print(f"[*] Getting group details for group {group_link}...")
|
||||||
|
|
||||||
|
# Check if the group already exists in the database
|
||||||
|
if not Group.objects(link=group_link).first():
|
||||||
|
# If not, create it
|
||||||
|
group_details = group_info.get_group_details(group_link)
|
||||||
|
Group(
|
||||||
|
id64=group_details["id64"],
|
||||||
|
name=group_details["name"],
|
||||||
|
tag=group_details["tag"],
|
||||||
|
members=group_details["members"],
|
||||||
|
link=group_link,
|
||||||
|
last_updated=datetime.datetime.now()
|
||||||
|
).save()
|
||||||
|
print(f"[*] Got group details for group {group_details['name']}")
|
||||||
|
else:
|
||||||
|
print(f"[*] Group {group_link} already exists in the database. Skipping...")
|
||||||
|
|
11
models/group.py
Normal file
11
models/group.py
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
from mongoengine import connect, Document, StringField, ListField, DateTimeField
|
||||||
|
|
||||||
|
connect(db='steam-group-crawler', host='localhost', port=27017)
|
||||||
|
|
||||||
|
class Group(Document):
|
||||||
|
id64 = StringField(required=True, unique=True)
|
||||||
|
name = StringField(required=True)
|
||||||
|
tag = StringField(required=True)
|
||||||
|
members = ListField(StringField())
|
||||||
|
last_updated = DateTimeField(required=True)
|
||||||
|
link = StringField(required=True)
|
9
models/player.py
Normal file
9
models/player.py
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
from mongoengine import connect, Document, StringField, ListField, DateTimeField
|
||||||
|
|
||||||
|
connect(db='steam-group-crawler', host='localhost', port=27017)
|
||||||
|
|
||||||
|
class Player(Document):
|
||||||
|
id64 = StringField(required=True, unique=True)
|
||||||
|
groups = ListField(StringField())
|
||||||
|
last_updated = DateTimeField(required=True)
|
||||||
|
link = StringField(required=True)
|
|
@ -3,8 +3,8 @@ from bs4 import BeautifulSoup
|
||||||
from ratelimit import limits, sleep_and_retry
|
from ratelimit import limits, sleep_and_retry
|
||||||
|
|
||||||
# Set up rate limiter, one request per second
|
# Set up rate limiter: at most 5 requests per 60 seconds
|
||||||
CALLS = 1
|
CALLS = 5
|
||||||
RATE_LIMIT = 1
|
RATE_LIMIT = 60
|
||||||
|
|
||||||
@sleep_and_retry
|
@sleep_and_retry
|
||||||
@limits(calls=CALLS, period=RATE_LIMIT)
|
@limits(calls=CALLS, period=RATE_LIMIT)
|
||||||
|
|
Loading…
Add table
Reference in a new issue