Initial Release
Some checks failed
Build and Upload Release (Windows EXE) / Build Windows EXE (release) has been cancelled
steam_required_ids.py (new file, +234 lines)
@@ -0,0 +1,234 @@
import argparse
import json
import re
import sys
from typing import Iterable, List, Set, Dict
from urllib.parse import urlparse, parse_qs

import requests
from bs4 import BeautifulSoup


WORKSHOP_ITEM_ID_REGEX = re.compile(r"id=(\d+)")


def extract_id_from_href(href: str) -> str | None:
    if not href:
        return None

    # Accept absolute or relative Steam workshop/sharedfiles links
    if "filedetails" not in href or "id=" not in href:
        return None

    try:
        parsed = urlparse(href)
        # Handle relative URLs like "/sharedfiles/filedetails/?id=123".
        # Parenthesized so the manual split is only the fallback for when
        # urlparse found no query string.
        query = parsed.query or (href.split("?", 1)[1] if "?" in href else "")
        qs = parse_qs(query)
        if "id" in qs and qs["id"]:
            candidate = qs["id"][0]
            return candidate if candidate.isdigit() else None
    except Exception:
        match = WORKSHOP_ITEM_ID_REGEX.search(href)
        if match:
            return match.group(1)
    return None

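# Illustrative behavior of extract_id_from_href (made-up IDs, derived from the
# checks above): both absolute and relative filedetails links resolve, anything
# else is rejected.
#   extract_id_from_href("https://steamcommunity.com/sharedfiles/filedetails/?id=123456")  -> "123456"
#   extract_id_from_href("/sharedfiles/filedetails/?id=123456&searchtext=")                -> "123456"
#   extract_id_from_href("/workshop/browse/?appid=294100")                                 -> None
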
def parse_main_item_id(url: str) -> str | None:
    try:
        parsed = urlparse(url)
        qs = parse_qs(parsed.query)
        if "id" in qs and qs["id"]:
            candidate = qs["id"][0]
            return candidate if candidate.isdigit() else None
    except Exception:
        pass
    return None


def collect_ids_from_elements(elements: Iterable) -> Set[str]:
    ids: Set[str] = set()
    for el in elements:
        # Only elements exposing a .get() accessor (i.e., bs4 Tags) carry attributes;
        # strings and other nodes yield an empty link.
        getter = getattr(el, "get", None)
        link = el.get("href", "") if callable(getter) else ""
        item_id = extract_id_from_href(link)
        if item_id:
            ids.add(item_id)
    return ids


def extract_required_item_ids_from_html(html: str) -> Set[str]:
    soup = BeautifulSoup(html, "html.parser")

    # Strategy 1: Look for a section headed "Required items" and parse links within
    section_ids: Set[str] = set()
    heading_candidates = soup.find_all(string=re.compile(r"^\s*Required\s+items\s*$", re.IGNORECASE))
    for heading in heading_candidates:
        parent = heading.parent
        if parent is None:
            continue

        # Search within nearby container siblings/descendants for links
        container = parent
        for _ in range(3):  # climb up a few levels to catch the full block
            if container is None:
                break
            links = container.find_all("a", href=True)
            section_ids |= collect_ids_from_elements(links)
            container = container.parent

    if section_ids:
        return section_ids

    # Strategy 2: Look for any block that contains the sentence used by Steam
    hint_blocks = soup.find_all(string=re.compile(r"requires\s+all\s+of\s+the\s+following\s+other\s+items", re.IGNORECASE))
    for hint in hint_blocks:
        container = hint.parent
        for _ in range(3):
            if container is None:
                break
            links = container.find_all("a", href=True)
            section_ids |= collect_ids_from_elements(links)
            container = container.parent

    if section_ids:
        return section_ids

    # Strategy 3 (fallback): scan all anchors on the page
    all_links = soup.find_all("a", href=True)
    return collect_ids_from_elements(all_links)

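# Illustrative sketch of the markup Strategy 1 targets. This is an assumed
# structure for illustration only; Steam's real class names and nesting may
# differ, which is why the code climbs parents and falls back to Strategies 2-3.
#   <div class="requiredItemsContainer">
#     <div>Required items</div>
#     <a href="https://steamcommunity.com/workshop/filedetails/?id=111">Mod A</a>
#   </div>
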
def fetch_page(url: str, timeout: int = 20) -> str:
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/126.0.0.0 Safari/537.36"
        ),
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    }
    # Steam can occasionally require a cookie for age gates. Provide innocuous defaults.
    cookies = {
        "birthtime": "568022401",  # Unix epoch, ~1988-01-01 UTC
        "lastagecheckage": "1-January-1990",
        "mature_content": "1",
    }
    resp = requests.get(url, headers=headers, cookies=cookies, timeout=timeout)
    resp.raise_for_status()
    return resp.text


def extract_required_item_ids(url: str) -> List[str]:
    html = fetch_page(url)
    found_ids = extract_required_item_ids_from_html(html)

    # Remove the current page's ID if present
    current_id = parse_main_item_id(url)
    if current_id and current_id in found_ids:
        found_ids.remove(current_id)

    return sorted(found_ids, key=int)

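# Note the numeric sort: sorted(..., key=int) yields e.g. ["9", "123", "4567"]
# rather than the lexicographic ["123", "4567", "9"].
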
def resolve_workshop_names(ids: List[str], timeout: int = 20) -> Dict[str, str]:
    """Resolve Workshop IDs to human-readable titles via the Steam API, with an HTML fallback.

    Uses ISteamRemoteStorage.GetPublishedFileDetails, batching up to 100 IDs per call.
    Falls back to scraping each item's page if the API fails.
    """
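    # Illustrative request shape, as built below: each batch is POSTed as a
    # form-encoded body like
    #   itemcount=2&publishedfileids[0]=111&publishedfileids[1]=222
    # (bracket characters are URL-encoded on the wire).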
    # Deduplicate while preserving order, keeping only numeric IDs.
    id_list = list(dict.fromkeys(i for i in ids if i and i.isdigit()))
    if not id_list:
        return {}

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/126.0.0.0 Safari/537.36"
        ),
    }

    api_url = "https://api.steampowered.com/ISteamRemoteStorage/GetPublishedFileDetails/v1/"
    resolved: Dict[str, str] = {}

    try:
        session = requests.Session()
        session.headers.update(headers)
        batch_size = 100
        for start in range(0, len(id_list), batch_size):
            batch = id_list[start:start + batch_size]
            data = {"itemcount": len(batch)}
            for idx, pub_id in enumerate(batch):
                data[f"publishedfileids[{idx}]"] = pub_id
            resp = session.post(api_url, data=data, timeout=timeout)
            resp.raise_for_status()
            payload = resp.json()
            details = payload.get("response", {}).get("publishedfiledetails", [])
            for entry in details:
                if entry.get("result") == 1:
                    title = entry.get("title")
                    pub_id = str(entry.get("publishedfileid"))
                    if pub_id and title:
                        resolved[pub_id] = title
    except Exception:
        # API failure; fall back to HTML scraping below
        pass

    # Fallback for unresolved IDs: scrape the item page
    unresolved = [i for i in id_list if i not in resolved]
    for pub_id in unresolved:
        try:
            page_url = f"https://steamcommunity.com/sharedfiles/filedetails/?id={pub_id}"
            html = fetch_page(page_url, timeout=timeout)
            soup = BeautifulSoup(html, "html.parser")
            name = None
            og = soup.find("meta", attrs={"property": "og:title"})
            if og and og.get("content"):
                name = og.get("content").strip()
            if not name:
                title_div = soup.find("div", class_="workshopItemTitle")
                if title_div and title_div.text:
                    name = title_div.text.strip()
            if name:
                resolved[pub_id] = name
        except Exception:
            # Leave unresolved if both methods fail
            pass

    return resolved


def main() -> None:
    parser = argparse.ArgumentParser(description="Extract Steam Workshop 'Required items' IDs from a Workshop item page")
    parser.add_argument("url", help="Steam Workshop item URL (e.g., https://steamcommunity.com/sharedfiles/filedetails/?id=XXXXXXXX)")
    parser.add_argument("--json", action="store_true", help="Print JSON array instead of plain text")
    args = parser.parse_args()

    try:
        ids = extract_required_item_ids(args.url)
    except requests.HTTPError as http_err:
        print(f"HTTP error: {http_err}", file=sys.stderr)
        sys.exit(2)
    except Exception as exc:
        print(f"Failed to extract IDs: {exc}", file=sys.stderr)
        sys.exit(1)

    if args.json:
        print(json.dumps(ids))
    else:
        if not ids:
            print("No required item IDs found.")
        else:
            print("\n".join(ids))


if __name__ == "__main__":
    main()

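# Example invocations (shell), reusing the placeholder URL from the --help text:
#   python steam_required_ids.py "https://steamcommunity.com/sharedfiles/filedetails/?id=XXXXXXXX"
#   python steam_required_ids.py "https://steamcommunity.com/sharedfiles/filedetails/?id=XXXXXXXX" --json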