clean up code. infinite random crawl.
This commit is contained in:
parent
bcd06fb05d
commit
883f396794
4 changed files with 146 additions and 91 deletions
1
Pipfile
1
Pipfile
|
@ -9,6 +9,7 @@ beautifulsoup4 = "*"
|
||||||
lxml = "*"
|
lxml = "*"
|
||||||
ratelimit = "*"
|
ratelimit = "*"
|
||||||
mongoengine = "*"
|
mongoengine = "*"
|
||||||
|
black = "*"
|
||||||
|
|
||||||
[dev-packages]
|
[dev-packages]
|
||||||
|
|
||||||
|
|
170
main.py
170
main.py
|
@ -1,67 +1,129 @@
|
||||||
from models.group import Group
|
from models.group import Group
|
||||||
from models.player import Player
|
from models.player import Player
|
||||||
import group_info, player_info
|
from steam_api_wrapper import get_group_details, get_players_groups
|
||||||
import datetime
|
import datetime, random
|
||||||
|
|
||||||
# Set the starting group
|
|
||||||
starting_group = "ilovebloop"
|
|
||||||
|
|
||||||
# Get the members of the starting group
|
|
||||||
print("[*] Getting members of starting group...")
|
|
||||||
starting_group_details = group_info.get_group_details(starting_group)
|
|
||||||
starting_group_members = starting_group_details["members"]
|
|
||||||
|
|
||||||
# Check if the starting group already exists in the database
|
|
||||||
if not Group.objects(id64=starting_group_details["id64"]).first():
|
|
||||||
# If not, create it
|
|
||||||
Group(
|
|
||||||
id64=starting_group_details["id64"],
|
|
||||||
name=starting_group_details["name"],
|
|
||||||
tag=starting_group_details["tag"],
|
|
||||||
members=starting_group_members,
|
|
||||||
link=starting_group,
|
|
||||||
last_updated=datetime.datetime.now()
|
|
||||||
).save()
|
|
||||||
|
|
||||||
# Get the groups of the starting group members
|
|
||||||
groups_of_starting_group_members = []
|
|
||||||
for member in starting_group_members:
|
|
||||||
print(f"[*] Getting groups for member {member}...")
|
|
||||||
member_group_links = player_info.get_group_links(member)
|
|
||||||
|
|
||||||
# Check if the member already exists in the database
|
|
||||||
if not Player.objects(id64=member).first():
|
|
||||||
# If not, create it
|
|
||||||
Player(
|
|
||||||
id64=member,
|
|
||||||
groups=member_group_links,
|
|
||||||
link=f"https://steamcommunity.com/profiles/{member}",
|
|
||||||
last_updated=datetime.datetime.now()
|
|
||||||
).save()
|
|
||||||
|
|
||||||
groups_of_starting_group_members.extend(member_group_links)
|
|
||||||
|
|
||||||
# Remove duplicates
|
|
||||||
groups_of_starting_group_members = list(set(groups_of_starting_group_members))
|
|
||||||
|
|
||||||
print(groups_of_starting_group_members)
|
|
||||||
|
|
||||||
# Update or create each group in the database
|
|
||||||
for group_link in groups_of_starting_group_members:
|
|
||||||
print(f"[*] Getting group details for group {group_link}...")
|
|
||||||
|
|
||||||
|
def save_group(group_details: dict):
|
||||||
# Check if the group already exists in the database
|
# Check if the group already exists in the database
|
||||||
if not Group.objects(link=group_link).first():
|
if not Group.objects(id64=group_details["id64"]).first():
|
||||||
# If not, create it
|
# If not, create it
|
||||||
group_details = group_info.get_group_details(group_link)
|
|
||||||
Group(
|
Group(
|
||||||
id64=group_details["id64"],
|
id64=group_details["id64"],
|
||||||
name=group_details["name"],
|
name=group_details["name"],
|
||||||
tag=group_details["tag"],
|
tag=group_details["tag"],
|
||||||
members=group_details["members"],
|
members=group_details["members"],
|
||||||
link=group_link,
|
link=group_details["link"],
|
||||||
last_updated=datetime.datetime.now()
|
last_updated=datetime.datetime.now(),
|
||||||
).save()
|
).save()
|
||||||
print(f"[*] Got group details for group {group_details['name']}")
|
print(f"\r[*] Got group details for group {group_details['name']}", end="")
|
||||||
else:
|
else:
|
||||||
print(f"[*] Group {group_link} already exists in the database. Skipping...")
|
print(
|
||||||
|
f"\r[*] Group {group_details['link']} already exists in the database. Skipping...",
|
||||||
|
end="",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def save_player(player_details: dict):
    """Store a player in the database unless its id64 is already present.

    Args:
        player_details: Mapping with the keys ``id64``, ``groups`` and
            ``link`` describing the player to store.

    Progress is reported on the current terminal line (leading carriage
    return, no trailing newline) so the caller can overwrite it.
    """
    player_id = player_details["id64"]

    # Guard clause: nothing to write when the player is already known.
    if Player.objects(id64=player_id).first():
        print(
            f"\r[*] Player {player_details['id64']} already exists in the database. Skipping...",
            end="",
        )
        return

    new_player = Player(
        id64=player_id,
        groups=player_details["groups"],
        link=player_details["link"],
        last_updated=datetime.datetime.now(),
    )
    new_player.save()
    print(f"\r[*] Got player details for player {player_details['id64']}", end="")
|
||||||
|
|
||||||
|
|
||||||
|
def scan_group(group_link: str):
    """Fetch a Steam group and store it unless it is already in the database.

    Args:
        group_link: The group's URL name, i.e. the part that follows
            ``https://steamcommunity.com/groups/``.

    The outcome is printed on a freshly cleared terminal line.
    """
    # NOTE(review): save_group stores whatever get_group_details reports under
    # "link"; confirm it has this exact URL form, otherwise this lookup never
    # matches and every known group is fetched again.
    group_url = "https://steamcommunity.com/groups/" + group_link

    if Group.objects(link=group_url).first():
        print(
            f"\r\033[K[ ] Group {group_link} already exists in the database. Skipping..."
        )
        return

    group_details = get_group_details(group_link)
    if not group_details:
        # get_group_details contains a bare `return`, so it can hand back None.
        # Passing that on to save_group would raise a TypeError that the crawl
        # loop does not catch.
        print(f"\r\033[K[!] Could not fetch details for group {group_link}. Skipping...")
        return

    save_group(group_details)
    print(f"\r\033[K[+] Added group {group_link} to the database.")
|
||||||
|
|
||||||
|
|
||||||
|
def scan_player(player_id: str):
    """Fetch a player's group list and store the player if not yet known.

    Args:
        player_id: The player's 64-bit Steam ID, as used in
            ``https://steamcommunity.com/profiles/<id>``.

    The outcome is printed on a freshly cleared terminal line.
    """
    # Known players are reported and left untouched.
    if Player.objects(id64=player_id).first():
        print(
            f"\r\033[K[ ] Player {player_id} already exists in the database. Skipping..."
        )
        return

    # Assemble the record save_player expects, then persist it.
    player_record = {
        "id64": player_id,
        "groups": get_players_groups(player_id),
        "link": f"https://steamcommunity.com/profiles/{player_id}",
    }
    save_player(player_record)
    print(f"\r\033[K[+] Added player {player_id} to the database.")
|
||||||
|
|
||||||
|
|
||||||
|
def get_random_group(max_size=100):
    """Return a random stored group that has fewer than ``max_size`` members.

    Args:
        max_size: Exclusive upper bound on the number of members. Expected to
            be an integer; it is coerced with ``int()``.

    Returns:
        A ``Group`` document, or None when no stored group is small enough.
    """
    size_limit = int(max_size)
    if size_limit <= 0:
        # No array can have a negative length, so nothing can match.
        return None

    # "members.<k>" exists only when the array holds more than k elements, so
    # its absence at index size_limit - 1 means len(members) < size_limit.
    # This replaces a `$where` JavaScript predicate built by string
    # interpolation, which runs per document on the server.
    suitable_groups = Group.objects(
        __raw__={f"members.{size_limit - 1}": {"$exists": False}}
    )

    # Count on the server and fetch a single document by offset instead of
    # loading every matching group (and its member list) just to pick one.
    total = suitable_groups.count()
    if not total:
        return None

    return suitable_groups[random.randrange(total)]
|
||||||
|
|
||||||
|
|
||||||
|
def get_random_player():
    """Return a random stored player, or None when no players are stored."""
    all_players = Player.objects.all()

    # Count on the server rather than calling len() on the queryset, which
    # would load the whole (ever-growing) collection on each crawl step.
    total = all_players.count()
    if not total:
        return None

    # Indexing the queryset fetches only the document at that offset.
    return all_players[random.randrange(total)]
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Seed the database with the starting group.
    scan_group("ilovebloop")

    flippy = 0  # parity selects the mode: odd -> crawl a group, even -> crawl a player
    empty_picks = 0  # consecutive turns where the database had nothing to pick from

    while True:
        crawl_group = bool(flippy % 2)
        # Advance up front so a failed pick (e.g. a player without groups)
        # switches mode instead of retrying the same one indefinitely.
        flippy += 1

        try:
            if crawl_group:
                # Pick a random group from a random known player.
                source_player = get_random_player()
                if source_player is None:
                    empty_picks += 1
                else:
                    empty_picks = 0
                    random_group = random.choice(source_player.groups)
                    if random_group:
                        print(f"[*] Crawling group {random_group}...", end="")
                        scan_group(random_group)
            else:
                # Pick a random member from a random known group.
                source_group = get_random_group()
                if source_group is None:
                    empty_picks += 1
                else:
                    empty_picks = 0
                    random_player_id = random.choice(source_group.members)
                    print(f"[*] Crawling player {random_player_id}...", end="")
                    scan_player(random_player_id)
        except IndexError as e:
            # random.choice raises IndexError on an empty groups/members list.
            print("[E] IndexError: ", e)
            continue

        if empty_picks >= 2:
            # Neither a player nor a suitable group is available, so further
            # iterations could only spin without making progress.
            print("[E] Nothing left to crawl: no players and no suitable groups in the database.")
            break
|
||||||
|
|
|
@ -1,30 +0,0 @@
|
||||||
import requests
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from ratelimit import limits, sleep_and_retry
|
|
||||||
|
|
||||||
# Set up rate limiter, one request per second
|
|
||||||
CALLS = 1
|
|
||||||
RATE_LIMIT = 10
|
|
||||||
|
|
||||||
|
|
||||||
@sleep_and_retry
|
|
||||||
@limits(calls=CALLS, period=RATE_LIMIT)
|
|
||||||
def get_group_links(user_id):
|
|
||||||
url = f"https://steamcommunity.com/profiles/{user_id}/groups/"
|
|
||||||
response = requests.get(url)
|
|
||||||
|
|
||||||
soup = BeautifulSoup(response.text, "lxml")
|
|
||||||
|
|
||||||
group_blocks = soup.find_all("div", class_="group_block")
|
|
||||||
group_links = []
|
|
||||||
for block in group_blocks:
|
|
||||||
link_element = block.find("a", class_="linkTitle")
|
|
||||||
if link_element:
|
|
||||||
group_links.append(link_element["href"].split("/")[-1])
|
|
||||||
|
|
||||||
return group_links
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
group_ids = get_group_links("76561198084483014")
|
|
||||||
print(group_ids)
|
|
|
@ -1,10 +1,11 @@
|
||||||
import requests, time
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from ratelimit import limits, sleep_and_retry
|
from ratelimit import limits, sleep_and_retry
|
||||||
|
import time
|
||||||
|
|
||||||
# Set up rate limiter, one request per second
|
# Set up rate limiter: at most CALLS request(s) per RATE_LIMIT seconds
|
||||||
CALLS = 1
|
CALLS = 1
|
||||||
RATE_LIMIT = 10
|
RATE_LIMIT = 15
|
||||||
|
|
||||||
|
|
||||||
@sleep_and_retry
|
@sleep_and_retry
|
||||||
|
@ -13,7 +14,10 @@ def make_request(url):
|
||||||
try:
|
try:
|
||||||
response = requests.get(url)
|
response = requests.get(url)
|
||||||
if response.status_code == 429:
|
if response.status_code == 429:
|
||||||
print("HTTP 429 Too Many Requests received. Pausing for 30 seconds.")
|
print(
|
||||||
|
f"\r[*]HTTP 429 Too Many Requests received. Pausing for 30 seconds.",
|
||||||
|
end="",
|
||||||
|
)
|
||||||
time.sleep(30)
|
time.sleep(30)
|
||||||
return make_request(url)
|
return make_request(url)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
@ -62,7 +66,7 @@ def get_group_details(group_url_name):
|
||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
print(f"[*] Getting page {next_page_url}...")
|
print(f"\r[*] Getting page {next_page_url}...", end="")
|
||||||
group_details_soup = BeautifulSoup(
|
group_details_soup = BeautifulSoup(
|
||||||
group_details_response.content, "lxml-xml"
|
group_details_response.content, "lxml-xml"
|
||||||
)
|
)
|
||||||
|
@ -92,12 +96,30 @@ def get_group_details(group_url_name):
|
||||||
return {
|
return {
|
||||||
"id64": group_id64,
|
"id64": group_id64,
|
||||||
"name": group_name,
|
"name": group_name,
|
||||||
"url": group_url,
|
"link": group_url,
|
||||||
"tag": tag,
|
"tag": tag,
|
||||||
"members": all_members,
|
"members": all_members,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@sleep_and_retry
@limits(calls=CALLS, period=RATE_LIMIT)
def get_players_groups(user_id):
    """Return the URL names of the groups listed on a player's groups page.

    Args:
        user_id: The player's 64-bit Steam ID, inserted into
            ``https://steamcommunity.com/profiles/<id>/groups/``.

    Returns:
        A list with the last path segment of each group link found on the
        page, in page order. Links without a usable segment are skipped.
    """
    url = f"https://steamcommunity.com/profiles/{user_id}/groups/"
    response = make_request(url)

    soup = BeautifulSoup(response.text, "lxml")

    group_links = []
    for block in soup.find_all("div", class_="group_block"):
        link_element = block.find("a", class_="linkTitle")
        if not link_element:
            continue

        # Strip a trailing slash first; otherwise the final segment would be
        # an empty string. A missing href is treated the same way.
        slug = link_element.get("href", "").rstrip("/").split("/")[-1]
        if slug:
            group_links.append(slug)

    return group_links
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# Replace 'ilovebloop' with the desired group URL name
|
group_ids = get_players_groups("76561198084483014")
|
||||||
print(get_group_details("steamworks"))
|
print(group_ids)
|
Loading…
Add table
Reference in a new issue