stop after 500 pages of player IDs

- this only applies to the 14 biggest groups
- this is needed to stay under the 16 MB document limit in MongoDB (rough size estimate below)
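
Rough arithmetic behind the 500-page cap, as a sketch with assumed figures: the memberslistxml endpoint is assumed to return about 1,000 member IDs per page, and each stored ID is assumed to cost about 30 bytes as a BSON array element. Neither number comes from this commit.

MAX_PAGES = 500
MEMBERS_PER_PAGE = 1_000  # assumed page size of the memberslistxml endpoint
BYTES_PER_MEMBER = 30     # assumed: 17-char SteamID64 string plus rough BSON element overhead

estimate_mb = MAX_PAGES * MEMBERS_PER_PAGE * BYTES_PER_MEMBER / (1024 * 1024)
print(f"~{estimate_mb:.1f} MB")  # ~14.3 MB, just under MongoDB's 16 MB document cap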
This commit is contained in:
Kate 2023-06-07 04:47:20 +01:00
parent 22661fd148
commit bcd06fb05d
4 changed files with 41 additions and 21 deletions

.gitignore

@@ -165,4 +165,5 @@ config.py
 #i dont care if this is not recommended
 Pipfile.lock
-.DS_Store
+.DS_Store
+.vscode

Pipfile

@@ -13,4 +13,4 @@ mongoengine = "*"
 [dev-packages]
 [requires]
-python_version = "3.11"
+python_version = "3.9"

(group member scraper; filename not shown)

@@ -3,9 +3,12 @@ from bs4 import BeautifulSoup
 from ratelimit import limits, sleep_and_retry
 # Set up rate limiter, one request per second
-CALLS = 5
-RATE_LIMIT = 60
+CALLS = 1
+RATE_LIMIT = 10
 @sleep_and_retry
 @limits(calls=CALLS, period=RATE_LIMIT)
 def make_request(url):
     try:
         response = requests.get(url)
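
The new limiter values work out to one request every ten seconds; the inline "one request per second" comment predates both the old and new values. A minimal standalone sketch of the behaviour, assuming the ratelimit package works as documented:

import time

from ratelimit import limits, sleep_and_retry


@sleep_and_retry             # sleep until the window resets instead of raising RateLimitException
@limits(calls=1, period=10)  # at most one call per 10-second window, matching the new CALLS / RATE_LIMIT
def ping():
    return time.time()


first, second = ping(), ping()
print(f"second call was delayed ~{second - first:.0f}s")  # roughly 10 seconds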
@@ -21,7 +24,8 @@ def make_request(url):
     except Exception as err:
         print(f"Other error occurred: {err}")
         return None
 def get_group_details(group_url_name):
     # Regular group page URL
     group_url = f"https://steamcommunity.com/groups/{group_url_name}"
@@ -41,9 +45,14 @@ def get_group_details(group_url_name):
     all_members = []
     # Start with the first page
-    next_page_url = f"https://steamcommunity.com/groups/{group_url_name}/memberslistxml/?xml=1"
+    next_page_url = (
+        f"https://steamcommunity.com/groups/{group_url_name}/memberslistxml/?xml=1"
+    )
-    while next_page_url:
+    # Initialize a counter for pages
+    page_counter = 0
+    while next_page_url and page_counter < 500:
         # Group details XML page URL
         group_details_url = next_page_url
@@ -54,7 +63,9 @@ def get_group_details(group_url_name):
         try:
             print(f"[*] Getting page {next_page_url}...")
-            group_details_soup = BeautifulSoup(group_details_response.content, "lxml-xml")
+            group_details_soup = BeautifulSoup(
+                group_details_response.content, "lxml-xml"
+            )
             # Group Name
             group_name = group_details_soup.find("groupName").text
@@ -63,13 +74,18 @@ def get_group_details(group_url_name):
             group_id64 = group_details_soup.find("groupID64").text
             # Member List
-            members = [member.text for member in group_details_soup.find_all("steamID64")]
+            members = [
+                member.text for member in group_details_soup.find_all("steamID64")
+            ]
             all_members.extend(members)
             # Get the URL for the next page, if there is one
-            next_page_link = group_details_soup.find('nextPageLink')
+            next_page_link = group_details_soup.find("nextPageLink")
             next_page_url = next_page_link.text if next_page_link else None
+            # Increment page counter
+            page_counter += 1
         except Exception as err:
             print(f"Error occurred during parsing of group details XML page: {err}")
@@ -81,6 +97,7 @@ def get_group_details(group_url_name):
         "members": all_members,
     }
 if __name__ == "__main__":
     # Replace 'ilovebloop' with the desired group URL name
-    print(get_group_details("steamworks"))
+    print(get_group_details("steamworks"))
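
The Pipfile above pulls in mongoengine, and the 16 MB limit in the commit message applies to whatever single document the "members" list ends up inside. A minimal sketch of one way that storage might look; the SteamGroup model, its field names, and the database name are assumptions, not part of this commit:

from mongoengine import Document, ListField, StringField, connect

connect("steam")  # assumed database name


class SteamGroup(Document):
    # Hypothetical model: the whole member list lives in one document,
    # which is why MongoDB's 16 MB BSON document limit forces the page cap.
    group_url_name = StringField()
    members = ListField(StringField())


details = get_group_details("steamworks")  # get_group_details is defined in the file above
SteamGroup(group_url_name="steamworks", members=details["members"]).save()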

(user group-list scraper; filename not shown)

@@ -3,26 +3,28 @@ from bs4 import BeautifulSoup
 from ratelimit import limits, sleep_and_retry
 # Set up rate limiter, one request per second
-CALLS = 5
-RATE_LIMIT = 60
+CALLS = 1
+RATE_LIMIT = 10
 @sleep_and_retry
 @limits(calls=CALLS, period=RATE_LIMIT)
 def get_group_links(user_id):
-    url = f'https://steamcommunity.com/profiles/{user_id}/groups/'
+    url = f"https://steamcommunity.com/profiles/{user_id}/groups/"
     response = requests.get(url)
-    soup = BeautifulSoup(response.text, 'lxml')
+    soup = BeautifulSoup(response.text, "lxml")
-    group_blocks = soup.find_all('div', class_='group_block')
+    group_blocks = soup.find_all("div", class_="group_block")
     group_links = []
     for block in group_blocks:
-        link_element = block.find('a', class_='linkTitle')
+        link_element = block.find("a", class_="linkTitle")
         if link_element:
-            group_links.append(link_element['href'].split('/')[-1])
+            group_links.append(link_element["href"].split("/")[-1])
     return group_links
 if __name__ == "__main__":
-    group_ids = get_group_links('76561198084483014')
-    print(group_ids)
+    group_ids = get_group_links("76561198084483014")
+    print(group_ids)