stop after 500 pages of player IDs
- This only applies to the 14 biggest groups; it is needed to stay under the 16 MB document limit in MongoDB.
This commit is contained in:
parent
22661fd148
commit
bcd06fb05d
4 changed files with 41 additions and 21 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -166,3 +166,4 @@ config.py
|
||||||
Pipfile.lock
|
Pipfile.lock
|
||||||
|
|
||||||
.DS_Store
|
.DS_Store
|
||||||
|
.vscode
|
2
Pipfile
2
Pipfile
|
@ -13,4 +13,4 @@ mongoengine = "*"
|
||||||
[dev-packages]
|
[dev-packages]
|
||||||
|
|
||||||
[requires]
|
[requires]
|
||||||
python_version = "3.11"
|
python_version = "3.9"
|
||||||
|
|
|
@ -3,9 +3,12 @@ from bs4 import BeautifulSoup
|
||||||
from ratelimit import limits, sleep_and_retry
|
from ratelimit import limits, sleep_and_retry
|
||||||
|
|
||||||
# Set up rate limiter, one request per second
|
# Set up rate limiter, one request per second
|
||||||
CALLS = 5
|
CALLS = 1
|
||||||
RATE_LIMIT = 60
|
RATE_LIMIT = 10
|
||||||
|
|
||||||
|
|
||||||
|
@sleep_and_retry
|
||||||
|
@limits(calls=CALLS, period=RATE_LIMIT)
|
||||||
def make_request(url):
|
def make_request(url):
|
||||||
try:
|
try:
|
||||||
response = requests.get(url)
|
response = requests.get(url)
|
||||||
|
@ -22,6 +25,7 @@ def make_request(url):
|
||||||
print(f"Other error occurred: {err}")
|
print(f"Other error occurred: {err}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def get_group_details(group_url_name):
|
def get_group_details(group_url_name):
|
||||||
# Regular group page URL
|
# Regular group page URL
|
||||||
group_url = f"https://steamcommunity.com/groups/{group_url_name}"
|
group_url = f"https://steamcommunity.com/groups/{group_url_name}"
|
||||||
|
@ -41,9 +45,14 @@ def get_group_details(group_url_name):
|
||||||
all_members = []
|
all_members = []
|
||||||
|
|
||||||
# Start with the first page
|
# Start with the first page
|
||||||
next_page_url = f"https://steamcommunity.com/groups/{group_url_name}/memberslistxml/?xml=1"
|
next_page_url = (
|
||||||
|
f"https://steamcommunity.com/groups/{group_url_name}/memberslistxml/?xml=1"
|
||||||
|
)
|
||||||
|
|
||||||
while next_page_url:
|
# Initialize a counter for pages
|
||||||
|
page_counter = 0
|
||||||
|
|
||||||
|
while next_page_url and page_counter < 500:
|
||||||
# Group details XML page URL
|
# Group details XML page URL
|
||||||
group_details_url = next_page_url
|
group_details_url = next_page_url
|
||||||
|
|
||||||
|
@ -54,7 +63,9 @@ def get_group_details(group_url_name):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
print(f"[*] Getting page {next_page_url}...")
|
print(f"[*] Getting page {next_page_url}...")
|
||||||
group_details_soup = BeautifulSoup(group_details_response.content, "lxml-xml")
|
group_details_soup = BeautifulSoup(
|
||||||
|
group_details_response.content, "lxml-xml"
|
||||||
|
)
|
||||||
|
|
||||||
# Group Name
|
# Group Name
|
||||||
group_name = group_details_soup.find("groupName").text
|
group_name = group_details_soup.find("groupName").text
|
||||||
|
@ -63,13 +74,18 @@ def get_group_details(group_url_name):
|
||||||
group_id64 = group_details_soup.find("groupID64").text
|
group_id64 = group_details_soup.find("groupID64").text
|
||||||
|
|
||||||
# Member List
|
# Member List
|
||||||
members = [member.text for member in group_details_soup.find_all("steamID64")]
|
members = [
|
||||||
|
member.text for member in group_details_soup.find_all("steamID64")
|
||||||
|
]
|
||||||
all_members.extend(members)
|
all_members.extend(members)
|
||||||
|
|
||||||
# Get the URL for the next page, if there is one
|
# Get the URL for the next page, if there is one
|
||||||
next_page_link = group_details_soup.find('nextPageLink')
|
next_page_link = group_details_soup.find("nextPageLink")
|
||||||
next_page_url = next_page_link.text if next_page_link else None
|
next_page_url = next_page_link.text if next_page_link else None
|
||||||
|
|
||||||
|
# Increment page counter
|
||||||
|
page_counter += 1
|
||||||
|
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
print(f"Error occurred during parsing of group details XML page: {err}")
|
print(f"Error occurred during parsing of group details XML page: {err}")
|
||||||
|
|
||||||
|
@ -81,6 +97,7 @@ def get_group_details(group_url_name):
|
||||||
"members": all_members,
|
"members": all_members,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# Replace 'ilovebloop' with the desired group URL name
|
# Replace 'ilovebloop' with the desired group URL name
|
||||||
print(get_group_details("steamworks"))
|
print(get_group_details("steamworks"))
|
|
@ -3,26 +3,28 @@ from bs4 import BeautifulSoup
|
||||||
from ratelimit import limits, sleep_and_retry
|
from ratelimit import limits, sleep_and_retry
|
||||||
|
|
||||||
# Set up rate limiter, one request per second
|
# Set up rate limiter, one request per second
|
||||||
CALLS = 5
|
CALLS = 1
|
||||||
RATE_LIMIT = 60
|
RATE_LIMIT = 10
|
||||||
|
|
||||||
|
|
||||||
@sleep_and_retry
|
@sleep_and_retry
|
||||||
@limits(calls=CALLS, period=RATE_LIMIT)
|
@limits(calls=CALLS, period=RATE_LIMIT)
|
||||||
def get_group_links(user_id):
|
def get_group_links(user_id):
|
||||||
url = f'https://steamcommunity.com/profiles/{user_id}/groups/'
|
url = f"https://steamcommunity.com/profiles/{user_id}/groups/"
|
||||||
response = requests.get(url)
|
response = requests.get(url)
|
||||||
|
|
||||||
soup = BeautifulSoup(response.text, 'lxml')
|
soup = BeautifulSoup(response.text, "lxml")
|
||||||
|
|
||||||
group_blocks = soup.find_all('div', class_='group_block')
|
group_blocks = soup.find_all("div", class_="group_block")
|
||||||
group_links = []
|
group_links = []
|
||||||
for block in group_blocks:
|
for block in group_blocks:
|
||||||
link_element = block.find('a', class_='linkTitle')
|
link_element = block.find("a", class_="linkTitle")
|
||||||
if link_element:
|
if link_element:
|
||||||
group_links.append(link_element['href'].split('/')[-1])
|
group_links.append(link_element["href"].split("/")[-1])
|
||||||
|
|
||||||
return group_links
|
return group_links
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
group_ids = get_group_links('76561198084483014')
|
group_ids = get_group_links("76561198084483014")
|
||||||
print(group_ids)
|
print(group_ids)
|
Loading…
Add table
Reference in a new issue