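"""Extract Steam Workshop 'Required items' IDs from a Workshop item page.

Example usage (the file name below is illustrative; use this script's actual name):

    python required_items.py "https://steamcommunity.com/sharedfiles/filedetails/?id=XXXXXXXX"
    python required_items.py "https://steamcommunity.com/sharedfiles/filedetails/?id=XXXXXXXX" --json

Prints one required item ID per line, or a JSON array with --json.
resolve_workshop_names() can additionally map those IDs to item titles.
"""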
import argparse
import json
import re
import sys
from typing import Iterable, List, Set, Dict
from urllib.parse import urlparse, parse_qs

import requests
from bs4 import BeautifulSoup


WORKSHOP_ITEM_ID_REGEX = re.compile(r"id=(\d+)")


def extract_id_from_href(href: str) -> str | None:
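    """Return the numeric Workshop ID from a filedetails href, or None.

    Accepts absolute or relative links; falls back to a regex search if URL
    parsing fails.
    """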
    if not href:
        return None

    # Accept absolute or relative Steam workshop/sharedfiles links
    if "filedetails" not in href or "id=" not in href:
        return None

    try:
        parsed = urlparse(href)
        # Handle relative URLs like "/sharedfiles/filedetails/?id=123"
        query = parsed.query or (href.split("?", 1)[1] if "?" in href else "")
        qs = parse_qs(query)
        if "id" in qs and qs["id"]:
            candidate = qs["id"][0]
            return candidate if candidate.isdigit() else None
    except Exception:
        # Last resort: pull the ID straight out of the raw string
        match = WORKSHOP_ITEM_ID_REGEX.search(href)
        if match:
            return match.group(1)
    return None


def parse_main_item_id(url: str) -> str | None:
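    """Return the numeric `id` query parameter of the given Workshop URL, or None."""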
    try:
        parsed = urlparse(url)
        qs = parse_qs(parsed.query)
        if "id" in qs and qs["id"]:
            candidate = qs["id"][0]
            return candidate if candidate.isdigit() else None
    except Exception:
        pass
    return None


def collect_ids_from_elements(elements: Iterable) -> Set[str]:
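    """Collect Workshop IDs from an iterable of anchor-like elements (e.g. bs4 Tags)."""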
    ids: Set[str] = set()
    for el in elements:
        # BeautifulSoup tags expose .get(); anything else yields an empty link
        get_attr = getattr(el, "get", None)
        if callable(get_attr):
            link = el.get("href", "")
        else:
            link = ""
        item_id = extract_id_from_href(link)
        if item_id:
            ids.add(item_id)
    return ids


def extract_required_item_ids_from_html(html: str) -> Set[str]:
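    """Extract the set of 'Required items' Workshop IDs from a Workshop page's HTML.

    Tries three strategies in order: the 'Required items' heading, Steam's
    'requires all of the following other items' hint text, and finally every
    anchor on the page.
    """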
    soup = BeautifulSoup(html, "html.parser")

    # Strategy 1: Look for a section headed "Required items" and parse links within
    section_ids: Set[str] = set()
    heading_candidates = soup.find_all(string=re.compile(r"^\s*Required\s+items\s*$", re.IGNORECASE))
    for heading in heading_candidates:
        parent = heading.parent
        if parent is None:
            continue

        # Search within nearby container siblings/descendants for links
        container = parent
        for _ in range(3):  # climb up a few levels to catch the full block
            if container is None:
                break
            links = container.find_all("a", href=True)
            section_ids |= collect_ids_from_elements(links)
            container = container.parent

    if section_ids:
        return section_ids

    # Strategy 2: Look for any block that contains the sentence used by Steam
    hint_blocks = soup.find_all(string=re.compile(r"requires\s+all\s+of\s+the\s+following\s+other\s+items", re.IGNORECASE))
    for hint in hint_blocks:
        container = hint.parent
        for _ in range(3):
            if container is None:
                break
            links = container.find_all("a", href=True)
            section_ids |= collect_ids_from_elements(links)
            container = container.parent

    if section_ids:
        return section_ids

    # Strategy 3 (fallback): scan all anchors on the page
    all_links = soup.find_all("a", href=True)
    return collect_ids_from_elements(all_links)


def fetch_page(url: str, timeout: int = 20) -> str:
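    """Fetch a page with a browser-like User-Agent and age-gate cookies; raise on HTTP errors."""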
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/126.0.0.0 Safari/537.36"
        ),
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    }
    # Steam can occasionally require a cookie for age gates. Provide innocuous defaults.
    cookies = {
        "birthtime": "568022401",  # 1988-01-01
        "lastagecheckage": "1-January-1990",
        "mature_content": "1",
    }
    resp = requests.get(url, headers=headers, cookies=cookies, timeout=timeout)
    resp.raise_for_status()
    return resp.text


def extract_required_item_ids(url: str) -> List[str]:
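    """Return the required item IDs for the Workshop item at `url`, sorted numerically.

    The item's own ID is excluded from the result.
    """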
    html = fetch_page(url)
    found_ids = extract_required_item_ids_from_html(html)

    # Remove the current page's ID if present
    current_id = parse_main_item_id(url)
    if current_id and current_id in found_ids:
        found_ids.remove(current_id)

    return sorted(found_ids, key=int)


def resolve_workshop_names(ids: List[str], timeout: int = 20) -> Dict[str, str]:
    """Resolve Workshop IDs to human-readable titles using the Steam API, with HTML fallback.

    Uses ISteamRemoteStorage.GetPublishedFileDetails, batching up to 100 IDs per call.
    Falls back to scraping each item's page if the API fails.
    """
    # De-duplicate while preserving order, keeping only numeric IDs
    id_list = list(dict.fromkeys(i for i in ids if i and i.isdigit()))
    if not id_list:
        return {}

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/126.0.0.0 Safari/537.36"
        ),
    }

    api_url = "https://api.steampowered.com/ISteamRemoteStorage/GetPublishedFileDetails/v1/"
    resolved: Dict[str, str] = {}

    try:
        session = requests.Session()
        session.headers.update(headers)
        batch_size = 100
        for start in range(0, len(id_list), batch_size):
            batch = id_list[start:start + batch_size]
            data = {"itemcount": len(batch)}
            for idx, pub_id in enumerate(batch):
                data[f"publishedfileids[{idx}]"] = pub_id
            resp = session.post(api_url, data=data, timeout=timeout)
            resp.raise_for_status()
            payload = resp.json()
            details = payload.get("response", {}).get("publishedfiledetails", [])
            for entry in details:
                if entry.get("result") == 1:
                    title = entry.get("title")
                    pub_id = str(entry.get("publishedfileid"))
                    if pub_id and title:
                        resolved[pub_id] = title
    except Exception:
        # API failure; fall back to HTML scraping below
        pass

    # Fallback for unresolved IDs: scrape the item page
    unresolved = [i for i in id_list if i not in resolved]
    for pub_id in unresolved:
        try:
            page_url = f"https://steamcommunity.com/sharedfiles/filedetails/?id={pub_id}"
            html = fetch_page(page_url, timeout=timeout)
            soup = BeautifulSoup(html, "html.parser")
            name = None
            og = soup.find("meta", attrs={"property": "og:title"})
            if og and og.get("content"):
                name = og.get("content").strip()
            if not name:
                title_div = soup.find("div", class_="workshopItemTitle")
                if title_div and title_div.text:
                    name = title_div.text.strip()
            if name:
                resolved[pub_id] = name
        except Exception:
            # Leave unresolved if both methods fail
            pass

    return resolved


def main() -> None:
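    """CLI entry point: parse arguments, extract IDs, and print them (plain text or JSON)."""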
    parser = argparse.ArgumentParser(description="Extract Steam Workshop 'Required items' IDs from a Workshop item page")
    parser.add_argument("url", help="Steam Workshop item URL (e.g., https://steamcommunity.com/sharedfiles/filedetails/?id=XXXXXXXX)")
    parser.add_argument("--json", action="store_true", help="Print JSON array instead of plain text")
    args = parser.parse_args()

    try:
        ids = extract_required_item_ids(args.url)
    except requests.HTTPError as http_err:
        print(f"HTTP error: {http_err}", file=sys.stderr)
        sys.exit(2)
    except Exception as exc:
        print(f"Failed to extract IDs: {exc}", file=sys.stderr)
        sys.exit(1)

    if args.json:
        print(json.dumps(ids))
    else:
        if not ids:
            print("No required item IDs found.")
        else:
            print("\n".join(ids))


if __name__ == "__main__":
    main()