# hltv-api/main.py

import re
import requests
import datetime
from bs4 import BeautifulSoup
from python_utils import converters
import time
import zoneinfo
import tzlocal

# Timezone name sent to HLTV in the ``hltvTimeZone`` cookie (see get_parsed_page).
HLTV_COOKIE_TIMEZONE = "Europe/Copenhagen"
# tzinfo object for that zone; attached to datetimes parsed from HLTV pages.
HLTV_ZONEINFO=zoneinfo.ZoneInfo(HLTV_COOKIE_TIMEZONE)
# Name of the local timezone as reported by tzlocal.
LOCAL_TIMEZONE_NAME = tzlocal.get_localzone_name()
# tzinfo object for the local zone; parsed HLTV datetimes are converted to it.
LOCAL_ZONEINFO = zoneinfo.ZoneInfo(LOCAL_TIMEZONE_NAME)

TEAM_MAP_FOR_RESULTS = []
def _get_all_teams():
    if not TEAM_MAP_FOR_RESULTS:
        teams = get_parsed_page("https://www.hltv.org/stats/teams?minMapCount=0")
        for team in teams.find_all("td", {"class": ["teamCol-teams-overview"], }):
            team = {'id': converters.to_int(team.find("a")["href"].split("/")[-2]), 'name': team.find("a").text, 'url': "https://hltv.org" + team.find("a")["href"]}
            TEAM_MAP_FOR_RESULTS.append(team)

def _findTeamId(teamName: str):
    """Return the id of the cached team whose name equals *teamName*.

    The team cache is populated first if necessary. The comparison is an
    exact string match; ``None`` is returned when no team matches.
    """
    _get_all_teams()
    return next(
        (entry['id'] for entry in TEAM_MAP_FOR_RESULTS if entry['name'] == teamName),
        None,
    )

def _padIfNeeded(numberStr: str):
    if int(numberStr) < 10:
        return str(numberStr).zfill(2)
    else:
        return str(numberStr)

def _monthNameToNumber(monthName: str):
    # Check for the input "Augu" and convert it to "August"
    # This is necessary because the input string may have been sanitized
    # by removing the "st" from the day numbers, such as "21st" -> "21"
    if monthName == "Augu":
        monthName = "August"
    return datetime.datetime.strptime(monthName, '%B').month

def get_parsed_page(url, delay=0.5, timeout=30):
    """Download *url* and return it parsed as a BeautifulSoup document.

    :param url: absolute URL of the page to fetch
    :param delay: seconds to sleep before the request is sent
    :param timeout: seconds ``requests`` waits for the server before raising
        ``requests.exceptions.Timeout``; pass ``None`` to wait indefinitely
        (the behaviour before this parameter existed)
    :return: ``BeautifulSoup`` tree built with the ``lxml`` parser
    """
    # This fixes a blocked by cloudflare error i've encountered
    headers = {
        "referer": "https://www.hltv.org/stats",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }

    # Sent so the times on the page can be read in HLTV_COOKIE_TIMEZONE.
    cookies = {
        "hltvTimeZone": HLTV_COOKIE_TIMEZONE
    }

    time.sleep(delay)

    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, headers=headers, cookies=cookies, timeout=timeout)
    return BeautifulSoup(response.text, "lxml")

def top5teams():
    """Return the teams listed in the ranking boxes of the HLTV front page.

    :return: list of dicts with the keys ``id``, ``name`` and ``url``
    """
    front_page = get_parsed_page("https://hltv.org/")
    ranked = []
    for box in front_page.find_all("div", {"class": ["col-box rank"], }):
        # The first three characters of the box text are dropped
        # (presumably the rank prefix -- verify against the live markup).
        team_name = box.text[3:]
        team_id = _findTeamId(team_name)
        # The second anchor inside the box carries the team link.
        team_href = box.find_all("a")[1]["href"]
        ranked.append({'id': team_id, 'name': team_name, 'url': "https://hltv.org" + team_href})
    return ranked


def top30teams():
    """Scrape the HLTV team ranking page.

    :return: list of dicts, one per ranked team, with the keys ``name``,
        ``rank``, ``rank-points``, ``team-id``, ``team-url``, ``stats-url``
        and ``team-players`` (a list of ``name`` / ``player-id`` / ``url``
        dicts)
    """
    ranking_page = get_parsed_page("https://www.hltv.org/ranking/teams/")
    ranking = ranking_page.find("div", {"class": "ranking"})
    ranked_teams = []
    for box in ranking.find_all("div", {"class": "ranked-team standard-box"}):
        name = box.find('div', {"class": "ranking-header"}).select('.name')[0].text.strip()
        rank = converters.to_int(box.select('.position')[0].text.strip(), regexp=True)
        points = converters.to_int(box.find('span', {'class': 'points'}).text, regexp=True)
        team_id = _findTeamId(name)
        details_href = box.find('a', {'class': 'details moreLink'})['href']

        entry = {
            'name': name,
            'rank': rank,
            'rank-points': points,
            'team-id': team_id,
            # Last path segment of the details link, followed by the team name.
            'team-url': "https://hltv.org/team/" + details_href.split('/')[-1] + "/" + name,
            'stats-url': "https://www.hltv.org" + details_href,
            'team-players': [],
        }

        for cell in box.find_all("td", {"class": "player-holder"}):
            player_name = cell.find('img', {'class': 'playerPicture'})['title']
            profile_href = cell.select('.pointer')[0]['href']
            entry['team-players'].append({
                'name': player_name,
                'player-id': converters.to_int(profile_href.split("/")[-2]),
                'url': "https://www.hltv.org" + profile_href,
            })

        ranked_teams.append(entry)
    return ranked_teams


def top_players():
    """Scrape the player boxes in the first column of the HLTV stats page.

    :return: list of dicts with the keys ``country``, ``name``, ``nickname``,
        ``rating``, ``maps-played``, ``url`` and ``id``
    """
    stats_page = get_parsed_page("https://www.hltv.org/stats")
    first_column = stats_page.find_all("div", {"class": "col"})[0]
    found = []
    for box in first_column.find_all("div", {"class": "top-x-box standard-box"}):
        country = box.find_all('img')[1]['alt']
        # The picture's alt text is split on apostrophes; pieces 0 and 2 are
        # joined into the name (presumably "First 'nick' Last" -- verify).
        alt_parts = box.find('img', {'class': 'img'})['alt'].split("'")
        full_name = alt_parts[0].rstrip() + alt_parts[2]
        profile_link = box.find('a', {'class': 'name'})
        nickname = profile_link.text
        rating = box.find('div', {'class': 'rating'}).find('span', {'class': 'bold'}).text
        maps_played = box.find('div', {'class': 'average gtSmartphone-only'}).find('span', {'class': 'bold'}).text
        profile_href = profile_link.get('href')
        found.append({
            'country': country,
            'name': full_name,
            'nickname': nickname,
            'rating': rating,
            'maps-played': maps_played,
            'url': "https://hltv.org" + profile_href,
            'id': converters.to_int(profile_href.split("/")[-2]),
        })
    return found


def get_players(teamid):
    """Return the players linked in the ``bodyshot-team`` box of a team page.

    :param teamid: team id, converted with ``str()`` before use in the URL
    :return: list of dicts with the keys ``id``, ``nickname``, ``name``
        and ``url``
    """
    team_page = get_parsed_page("https://www.hltv.org/?pageid=362&teamid=" + str(teamid))
    roster_box = team_page.find("div", {"class": "bodyshot-team"})
    return [
        {
            # The id is the third "/"-separated piece of the player link.
            'id': converters.to_int(anchor["href"].split("/")[2]),
            'nickname': anchor["title"],
            'name': anchor.find("img")['title'],
            'url': "https://hltv.org" + anchor["href"],
        }
        for anchor in roster_box.find_all("a")
    ]

def get_team_info(teamid):
    """
    Collect name, matches, lineups and headline stats for one team.

    Two pages are fetched: the team stats page (name, lineups, stats) and the
    team page (matches). When the team page contains an ``empty-state``
    element the match list is left empty.

    :param teamid: integer (or string consisting of integers)
    :return: dictionary of team

    example team id: 5378 (virtus pro)
    """
    team_id_text = str(teamid)
    stats_page = get_parsed_page("https://www.hltv.org/?pageid=179&teamid=" + team_id_text)

    team_name = stats_page.find("div", {"class": "context-item"}).text
    team_info = {
        'team-name': team_name,
        'team-id': _findTeamId(team_name),
    }

    team_page = get_parsed_page(
        "https://www.hltv.org/team/" + team_id_text + "/" + str(team_name) + "#tab-matchesBox")
    if team_page.find("div", {"class": "empty-state"}):
        team_info['matches'] = []
    else:
        team_info['matches'] = _get_matches_by_team(
            team_page.find("table", {"class": "table-container match-table"}))

    # Both lineup helpers receive the same list and slice it differently.
    teammates = stats_page.find_all("div", {"class": "col teammate"})
    team_info['current-lineup'] = _get_current_lineup(teammates)
    team_info['historical-players'] = _get_historical_lineup(teammates)

    # Map each stat label to its value; a repeated label keeps the last value.
    headline_stats = {}
    for column_group in stats_page.find_all("div", {"class": "columns"}):
        for box in column_group.find_all("div", {"class": "col standard-box big-padding"}):
            value = box.find("div", {"class": "large-strong"}).text
            label = box.find("div", {"class": "small-label-below"}).text
            headline_stats[label] = value
    team_info['stats'] = headline_stats

    team_info['url'] = "https://hltv.org/stats/team/" + team_id_text + "/" + str(team_name)

    return team_info

def _get_current_lineup(player_anchors):
    """
    helper function for function above
    :return: list of players
    """
    players = []
    for player_anchor in player_anchors[0:5]:
        player = {}
        buildName = player_anchor.find("img", {"class": "container-width"})["alt"].split('\'')
        player['country'] = player_anchor.find("div", {"class": "teammate-info standard-box"}).find("img", {"class": "flag"})["alt"]
        player['name'] = buildName[0].rstrip() + buildName[2]
        player['nickname'] = player_anchor.find("div", {"class": "teammate-info standard-box"}).find("div", {"class": "text-ellipsis"}).text
        player['maps-played'] = int(re.search(r'\d+', player_anchor.find("div", {"class": "teammate-info standard-box"}).find("span").text).group())
        player['url'] = "https://hltv.org" + player_anchor.find("div", {"class": "teammate-info standard-box"}).find("a").get("href")
        player['id'] = converters.to_int(player_anchor.find("div", {"class": "teammate-info standard-box"}).find("a").get("href").split("/")[-2])
        players.append(player)
    return players

def _get_historical_lineup(player_anchors):
    """
    helper function for function above
    :return: list of players
    """
    players = []
    for player_anchor in player_anchors[5::]:
        player = {}
        buildName = player_anchor.find("img", {"class": "container-width"})["alt"].split('\'')
        player['country'] = player_anchor.find("div", {"class": "teammate-info standard-box"}).find("img", {"class": "flag"})["alt"]
        player['name'] = buildName[0].rstrip() + buildName[2]
        player['nickname'] = player_anchor.find("div", {"class": "teammate-info standard-box"}).find("div", {"class": "text-ellipsis"}).text
        player['maps-played'] = int(re.search(r'\d+', player_anchor.find("div", {"class": "teammate-info standard-box"}).find("span").text).group())
        player['url'] = "https://hltv.org" + player_anchor.find("div", {"class": "teammate-info standard-box"}).find("a").get("href")
        player['id'] = converters.to_int(player_anchor.find("div", {"class": "teammate-info standard-box"}).find("a").get("href").split("/")[-2])
        players.append(player)
    return players

def _generate_countdown(date: str, time: str):
    """Return the time left until ``date time`` as text, or ``None`` if past.

    :param date: date formatted ``YYYY-MM-DD``
    :param time: time of day formatted ``HH:MM``
    :return: ``str()`` of the remaining ``timedelta`` when the deadline is
        later than the current local time (minute precision), else ``None``
    """
    # Current wall-clock time in the local zone, cut to whole minutes and made
    # naive so it is comparable with the naive deadline parsed below.
    now_local = datetime.datetime.now().astimezone(LOCAL_ZONEINFO).replace(
        second=0, microsecond=0, tzinfo=None)
    deadline_text = date + " " + time
    deadline = datetime.datetime.strptime(deadline_text, '%Y-%m-%d %H:%M')
    if not now_local < deadline:
        return None
    return str(deadline - now_local)

# Id of the first match seen by get_matches() that had a countdown; once set
# it is not changed by later calls.
MATCH_WITH_COUNTDOWN = None
def get_matches():
    """Scrape the upcoming matches page.

    Every returned dict has ``url``, ``match-id``, ``event``, ``team1``,
    ``team1-id``, ``team2`` and ``team2-id``. When both the day headline and
    the match time are present it also has ``date`` (``YYYY-MM-DD``),
    ``time`` (``HH:MM``) — both converted from the HLTV timezone to the local
    one — and ``countdown`` (text of the remaining time, or ``None``).

    Side effect: stores the id of the first match with a countdown in the
    module-level ``MATCH_WITH_COUNTDOWN`` if that is still unset.

    :return: list of match dicts
    """
    global MATCH_WITH_COUNTDOWN
    matches = get_parsed_page("https://www.hltv.org/matches/")
    matches_list = []

    for section in matches.find_all("div", {"class": "upcomingMatchesSection"}):
        # Look the headline up by its class. The previous call passed a dict
        # as the tag name, which BeautifulSoup treats as "any <span>"; that
        # behaviour is kept as a fallback.
        headline = section.find(class_="matchDayHeadline") or section.find("span")
        headline_words = headline.text.split() if headline else []
        # The date is the last whitespace-separated word of the headline.
        date = headline_words[-1] if headline_words else None

        for getMatch in section.find_all("div", {"class": "upcomingMatch"}):
            matchObj = {}

            match_href = getMatch.find("a", {"class": "match a-reset"}).get("href")
            match_id = converters.to_int(match_href.split("/")[-2])
            matchObj['url'] = "https://hltv.org" + match_href
            matchObj['match-id'] = match_id

            time_div = getMatch.find("div", {"class": "matchTime"})
            if date and time_div:
                timeFromHLTV = datetime.datetime.strptime(
                    date + " " + time_div.text, '%Y-%m-%d %H:%M').replace(tzinfo=HLTV_ZONEINFO)
                timeFromHLTV = timeFromHLTV.astimezone(LOCAL_ZONEINFO)
                matchObj['date'] = timeFromHLTV.strftime('%Y-%m-%d')
                matchObj['time'] = timeFromHLTV.strftime('%H:%M')

                # _generate_countdown compares against the local clock, so it
                # must receive the converted local date/time, not the raw
                # HLTV-timezone values.
                matchObj['countdown'] = _generate_countdown(matchObj['date'], matchObj['time'])

                if not MATCH_WITH_COUNTDOWN and matchObj['countdown']:
                    MATCH_WITH_COUNTDOWN = match_id

            # Prefer the event element, fall back to the placeholder element,
            # and use None when neither exists.
            event_div = (getMatch.find("div", {"class": "matchEvent"})
                         or getMatch.find("div", {"class": "matchInfoEmpty"}))
            matchObj['event'] = event_div.text.strip() if event_div else None

            if getMatch.find_all("div", {"class": "matchTeams"}):
                team_divs = getMatch.find_all("div", {"class": "matchTeam"})
                team1 = team_divs[0].text.strip()
                team2 = team_divs[1].text.strip()
                matchObj['team1'] = team1
                matchObj['team1-id'] = _findTeamId(team1)
                matchObj['team2'] = team2
                matchObj['team2-id'] = _findTeamId(team2)
            else:
                matchObj['team1'] = None
                matchObj['team1-id'] = None
                matchObj['team2'] = None
                matchObj['team2-id'] = None

            matches_list.append(matchObj)

    return matches_list

def get_results():
    """Scrape the results page.

    Each returned dict has the keys ``url``, ``match-id``, ``date``
    (``YYYY-MM-DD``), ``event``, ``team1``, ``team1score``, ``team1-id``,
    ``team2``, ``team2-id`` and ``team2score``. The team and score fields are
    ``None`` when the entry has no team cells.

    :return: list of result dicts
    """
    results = get_parsed_page("https://www.hltv.org/results/")

    results_list = []

    for holder in results.find_all("div", {"class": "results-holder"}):
        for res in holder.find_all("div", {"class": "result-con"}):
            resultObj = {}

            match_href = res.find("a", {"class": "a-reset"}).get("href")
            resultObj['url'] = "https://hltv.org" + match_href
            resultObj['match-id'] = converters.to_int(match_href.split("/")[-2])

            headline = res.parent.find("span", {"class": "standard-headline"})
            if headline:
                # Strip ordinal suffixes only when they follow a digit, so
                # "21st" becomes "21" while "August" stays intact.
                dateText = re.sub(r'(?<=\d)(?:st|nd|rd|th)\b', '',
                                  headline.text.replace("Results for ", ""))

                # Word order after stripping: month name, day, year.
                dateArr = dateText.split()

                dateTextFromArrPadded = _padIfNeeded(dateArr[2]) + "-" + _padIfNeeded(_monthNameToNumber(dateArr[0])) + "-" + _padIfNeeded(dateArr[1])
                # NOTE(review): the headline has no time of day, so the date is
                # taken as midnight in the HLTV timezone before conversion; for
                # local zones behind HLTV's this gives the previous calendar
                # day -- confirm this is intended.
                dateFromHLTV = datetime.datetime.strptime(dateTextFromArrPadded,'%Y-%m-%d').replace(tzinfo=HLTV_ZONEINFO)
                dateFromHLTV = dateFromHLTV.astimezone(LOCAL_ZONEINFO)

                resultObj['date'] = dateFromHLTV.strftime('%Y-%m-%d')
            else:
                # No headline next to this entry: fall back to today's date,
                # formatted like every other date this module returns.
                resultObj['date'] = datetime.date.today().strftime('%Y-%m-%d')

            placeholder_cell = res.find("td", {"class": "placeholder-text-cell"})
            event_cell = res.find("td", {"class": "event"})
            if placeholder_cell:
                resultObj['event'] = placeholder_cell.text
            elif event_cell:
                resultObj['event'] = event_cell.text
            else:
                resultObj['event'] = None

            team_cells = res.find_all("td", {"class": "team-cell"})
            if team_cells:
                score_spans = res.find("td", {"class": "result-score"}).find_all("span")
                team1 = team_cells[0].text.strip()
                team2 = team_cells[1].text.strip()
                resultObj['team1'] = team1
                resultObj['team1score'] = converters.to_int(score_spans[0].text.strip())
                resultObj['team1-id'] = _findTeamId(team1)
                resultObj['team2'] = team2
                resultObj['team2-id'] = _findTeamId(team2)
                resultObj['team2score'] = converters.to_int(score_spans[1].text.strip())
            else:
                resultObj['team1'] = None
                resultObj['team1-id'] = None
                resultObj['team1score'] = None
                resultObj['team2'] = None
                resultObj['team2-id'] = None
                resultObj['team2score'] = None

            results_list.append(resultObj)

    return results_list

def _get_matches_by_team(table):
    """Parse the match table of a team page into a list of match dicts.

    The i-th ``event-header-cell`` row is paired with the i-th ``<tbody>`` of
    the table. Each match dict has the keys ``date`` (``YYYY-MM-DD``),
    ``teams`` (``team_1``, ``team_1_id``, ``team_2``, ``team_2_id``; ``None``
    where a team link is missing), ``confront_name``, ``championship``,
    ``match_id``, ``url`` and ``time``.

    Note: the match page of every row is downloaded to read its time, so this
    function performs one request per match.

    :param table: the ``table-container match-table`` element
    :return: list of match dicts
    """
    events = table.find_all("tr", {"class": "event-header-cell"})
    event_matches = table.find_all("tbody")
    matches = []
    for i, event in enumerate(events):

        event_name = event.find("a", {"class": "a-reset"}).text
        rows = event_matches[i].find_all("tr", {"class": "team-row"})

        for row in rows:
            match = {}
            # The date cell is split on "/" and reassembled in reverse order
            # (pieces 2, 1, 0) to form YYYY-MM-DD.
            dateArr = row.find("td", {"class": "date-cell"}).find("span").text.split('/')

            dateTextFromArrPadded = _padIfNeeded(dateArr[2]) + "-" + _padIfNeeded(dateArr[1]) + "-" + _padIfNeeded(dateArr[0])

            dateFromHLTV = datetime.datetime.strptime(dateTextFromArrPadded,'%Y-%m-%d').replace(tzinfo=HLTV_ZONEINFO)
            dateFromHLTV = dateFromHLTV.astimezone(LOCAL_ZONEINFO)

            match['date'] = dateFromHLTV.strftime('%Y-%m-%d')
            match['teams'] = {}

            team_cell = row.find("td", {"class": "team-center-cell"})
            for side in ("1", "2"):
                team_link = team_cell.find("a", {"class": "team-name team-" + side})
                if team_link:
                    match['teams']["team_" + side] = team_link.text
                    match['teams']["team_" + side + "_id"] = _findTeamId(team_link.text)
                else:
                    match['teams']["team_" + side] = None
                    match['teams']["team_" + side + "_id"] = None

            # Parenthesised on purpose: "+" binds tighter than "or", so the
            # unparenthesised form returned only team 1's name (or raised
            # TypeError when team 1 was missing).
            match["confront_name"] = (match['teams']["team_1"] or "TBD") + \
                " X " + (match['teams']["team_2"] or "TBD")
            match["championship"] = event_name
            match_url = row.find(
                "td", {"class": "matchpage-button-cell"}).find("a")['href']
            match['match_id'] = converters.to_int(match_url.split("/")[-2])
            match['url'] = "https://www.hltv.org" + match_url
            match['time'] = get_parsed_page("https://www.hltv.org" + match_url).find(
                'div', {"class": "timeAndEvent"}).find('div', {"class": "time"}).text
            matches.append(match)

    return matches


def get_results_by_date(start_date, end_date):
    """Collect all map results between two dates from the stats pages.

    Pages are requested 50 rows at a time until the offset reaches the total
    reported by the page's ``pagination-data`` element.

    :param start_date: first day of the range, ``YYYY-MM-DD`` (ISO)
    :param end_date: last day of the range, ``YYYY-MM-DD`` (ISO)
    :return: list of dicts with the keys ``match-id``, ``team1``,
        ``team1-id``, ``team2``, ``team2-id``, ``team1score``, ``team2score``,
        ``date`` (``YYYY-MM-DD``), ``map``, ``event`` and ``url``
    """
    page_size = 50
    results_list = []
    offset = 0
    # Loop through all stats pages
    while True:
        page_url = f"https://www.hltv.org/stats/matches?startDate={start_date}&endDate={end_date}&offset={offset}"

        results = get_parsed_page(page_url)

        # Total amount of results of the query
        amount = int(results.find("span", attrs={"class": "pagination-data"}).text.split("of")[1].strip())

        # Parse each <tr> of the match table to a result dictionary
        for result in results.find("tbody").find_all("tr"):
            team_cols = result.find_all("td", {"class": "team-col"})
            t1 = team_cols[0].find("a").text
            t1_id = _findTeamId(t1)
            t2 = team_cols[1].find("a").text
            t2_id = _findTeamId(t2)
            # [1:-1] drops the first and last character around the score.
            t1_score = int(team_cols[0].find_all(attrs={"class": "score"})[0].text.strip()[1:-1])
            t2_score = int(team_cols[1].find_all(attrs={"class": "score"})[0].text.strip()[1:-1])
            map_name = result.find(attrs={"class": "statsDetail"}).find(attrs={"class": "dynamic-map-name-full"}).text
            event = result.find(attrs={"class": "event-col"}).text
            date_link = result.find(attrs={"class": "date-col"}).find("a")
            dateText = date_link.find("div").text
            match_url = "https://hltv.org" + date_link.get("href")
            match_id = converters.to_int(match_url.split("/")[-2])
            # The date text is split on "/" and reassembled in reverse order;
            # the year piece has two digits.
            dateArr = dateText.split("/")
            # TODO: yes, this shouldn't be hardcoded, but I'll be very surprised if this API is still a thing in 21XX
            startingTwoDigitsOfYear = "20"
            dateTextFromArrPadded = startingTwoDigitsOfYear + _padIfNeeded(dateArr[2]) + "-" + _padIfNeeded(dateArr[1]) + "-" + _padIfNeeded(dateArr[0])

            dateFromHLTV = datetime.datetime.strptime(dateTextFromArrPadded,'%Y-%m-%d').replace(tzinfo=HLTV_ZONEINFO)
            dateFromHLTV = dateFromHLTV.astimezone(LOCAL_ZONEINFO)

            date = dateFromHLTV.strftime('%Y-%m-%d')

            # Add this row to the result list
            results_list.append({
                "match-id": match_id, "team1": t1, "team1-id": t1_id,
                "team2": t2, "team2-id": t2_id, "team1score": t1_score,
                "team2score": t2_score, "date": date, "map": map_name,
                "event": event, "url": match_url,
            })

        # Advance first, then test: stopping only when the *old* offset had
        # reached the total always fetched one page past the end.
        offset += page_size
        if offset >= amount:
            break

    return results_list

def get_match_countdown(match_id):
    """Return the time left until the given match starts.

    The date and time are read from the match page, interpreted together in
    the HLTV timezone, converted to the local timezone and handed to
    ``_generate_countdown``.

    :param match_id: match id, converted with ``str()`` before use in the URL
    :return: text of the remaining time, or ``None`` if the start time is not
        in the future
    """
    url = "https://www.hltv.org/matches/" + str(match_id) + "/page"
    match_page = get_parsed_page(url)
    timeAndEvent = match_page.find("div", {"class": "timeAndEvent"})
    date_text = timeAndEvent.find("div", {"class": "date"}).text
    time_text = timeAndEvent.find("div", {"class": "time"}).text.strip()
    # Drop the ordinal suffix plus " of"; what remains splits into
    # day, month name, year.
    dateArr = date_text.replace("th of","").replace("rd of","").replace("st of","").replace("nd of","").split()
    dateTextFromArrPadded = _padIfNeeded(dateArr[2]) + "-" + _padIfNeeded(_monthNameToNumber(dateArr[1])) + "-" + _padIfNeeded(dateArr[0])

    # Convert date and time as one moment. Converting the date alone (at
    # midnight) and passing the time through unconverted mixed two timezones
    # and gave a wrong countdown wherever the local zone differs from HLTV's.
    startFromHLTV = datetime.datetime.strptime(
        dateTextFromArrPadded + " " + time_text, '%Y-%m-%d %H:%M').replace(tzinfo=HLTV_ZONEINFO)
    startLocal = startFromHLTV.astimezone(LOCAL_ZONEINFO)

    return _generate_countdown(startLocal.strftime('%Y-%m-%d'), startLocal.strftime('%H:%M'))

if __name__ == "__main__":
    # Manual smoke run: call every public function once and pretty-print the
    # label followed by the result. All calls hit the live site.
    import pprint
    pp = pprint.PrettyPrinter()

    # Callables are wrapped in lambdas so each one runs only after its label
    # has been printed, in the order listed.
    demos = [
        ('top5', top5teams),
        ('top30', top30teams),
        ('top_players', top_players),
        ('get_players', lambda: get_players('6665')),
        ('get_team_info', lambda: get_team_info('6665')),
        ('get_matches', get_matches),
        ('get_results', get_results),
    ]
    for label, run in demos:
        pp.pprint(label)
        pp.pprint(run())

    pp.pprint('get_results_by_date')
    today_iso = datetime.datetime.today().isoformat().split('T')[0]
    pp.pprint(get_results_by_date(today_iso, today_iso))

    pp.pprint('get_match_countdown')
    # get_matches() leaves MATCH_WITH_COUNTDOWN unset when no match had a
    # countdown; skip the lookup then instead of requesting ".../None/page".
    if MATCH_WITH_COUNTDOWN:
        pp.pprint(get_match_countdown(MATCH_WITH_COUNTDOWN))
    else:
        pp.pprint(None)