import argparse
import json
import re
import sys
from typing import Iterable, List, Set, Dict
from urllib.parse import urlparse, parse_qs

import requests
from bs4 import BeautifulSoup

WORKSHOP_ITEM_ID_REGEX = re.compile(r"id=(\d+)")


def extract_id_from_href(href: str) -> str | None:
    if not href:
        return None
    # Accept absolute or relative Steam workshop/sharedfiles links
    if "filedetails" not in href or "id=" not in href:
        return None
    try:
        # urlparse handles relative URLs like "/sharedfiles/filedetails/?id=123" too
        parsed = urlparse(href)
        qs = parse_qs(parsed.query)
        if "id" in qs and qs["id"]:
            candidate = qs["id"][0]
            return candidate if candidate.isdigit() else None
    except Exception:
        match = WORKSHOP_ITEM_ID_REGEX.search(href)
        if match:
            return match.group(1)
    return None

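# Behavior sketch for the helper above (illustrative inputs, not live data):
#   extract_id_from_href("/sharedfiles/filedetails/?id=123")    -> "123"
#   extract_id_from_href("https://steamcommunity.com/sharedfiles/filedetails/?id=456&searchtext=")
#                                                               -> "456"
#   extract_id_from_href("/workshop/browse/?appid=255710")      -> None (fails the "filedetails" guard)
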
def parse_main_item_id(url: str) -> str | None:
    try:
        parsed = urlparse(url)
        qs = parse_qs(parsed.query)
        if "id" in qs and qs["id"]:
            candidate = qs["id"][0]
            return candidate if candidate.isdigit() else None
    except Exception:
        pass
    return None


def collect_ids_from_elements(elements: Iterable) -> Set[str]:
    ids: Set[str] = set()
    for el in elements:
        # BeautifulSoup tags expose .get(); guard against plain strings in the iterable
        getter = getattr(el, "get", None)
        link = el.get("href", "") if callable(getter) else ""
        item_id = extract_id_from_href(link)
        if item_id:
            ids.add(item_id)
    return ids


def extract_required_item_ids_from_html(html: str) -> Set[str]:
    soup = BeautifulSoup(html, "html.parser")

    # Strategy 1: look for a section headed "Required items" and parse links within
    section_ids: Set[str] = set()
    heading_candidates = soup.find_all(
        string=re.compile(r"^\s*Required\s+items\s*$", re.IGNORECASE)
    )
    for heading in heading_candidates:
        parent = heading.parent
        if parent is None:
            continue
        # Search within nearby container siblings/descendants for links
        container = parent
        for _ in range(3):  # climb up a few levels to catch the full block
            if container is None:
                break
            links = container.find_all("a", href=True)
            section_ids |= collect_ids_from_elements(links)
            container = container.parent
    if section_ids:
        return section_ids

    # Strategy 2: look for any block that contains the sentence used by Steam
    hint_blocks = soup.find_all(
        string=re.compile(r"requires\s+all\s+of\s+the\s+following\s+other\s+items", re.IGNORECASE)
    )
    for hint in hint_blocks:
        container = hint.parent
        for _ in range(3):
            if container is None:
                break
            links = container.find_all("a", href=True)
            section_ids |= collect_ids_from_elements(links)
            container = container.parent
    if section_ids:
        return section_ids

    # Strategy 3 (fallback): scan all anchors on the page
    all_links = soup.find_all("a", href=True)
    return collect_ids_from_elements(all_links)


def fetch_page(url: str, timeout: int = 20) -> str:
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/126.0.0.0 Safari/537.37"[:-1] + "6"
        ),
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    }
    # Steam can occasionally require a cookie for age gates. Provide innocuous defaults.
    cookies = {
        "birthtime": "568022401",  # 1988-01-01
        "lastagecheckage": "1-January-1988",
        "mature_content": "1",
    }
    resp = requests.get(url, headers=headers, cookies=cookies, timeout=timeout)
    resp.raise_for_status()
    return resp.text


def extract_required_item_ids(url: str) -> List[str]:
    html = fetch_page(url)
    found_ids = extract_required_item_ids_from_html(html)
    # Remove the current page's ID if present
    current_id = parse_main_item_id(url)
    if current_id and current_id in found_ids:
        found_ids.remove(current_id)
    return sorted(found_ids, key=int)


def resolve_workshop_names(ids: List[str], timeout: int = 20) -> Dict[str, str]:
    """Resolve Workshop IDs to human-readable titles using the Steam API, with an HTML fallback.

    Uses ISteamRemoteStorage.GetPublishedFileDetails, batching up to 100 IDs per call.
    Falls back to scraping each item's page if the API fails.
    """
    # Deduplicate while preserving order; keep only purely numeric IDs
    id_list = list(dict.fromkeys(i for i in ids if i and i.isdigit()))
    if not id_list:
        return {}
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/126.0.0.0 Safari/537.36"
        ),
    }
    api_url = "https://api.steampowered.com/ISteamRemoteStorage/GetPublishedFileDetails/v1/"
    resolved: Dict[str, str] = {}
    try:
        session = requests.Session()
        session.headers.update(headers)
        batch_size = 100
        for start in range(0, len(id_list), batch_size):
            batch = id_list[start:start + batch_size]
            data = {"itemcount": len(batch)}
            for idx, pub_id in enumerate(batch):
                data[f"publishedfileids[{idx}]"] = pub_id
            resp = session.post(api_url, data=data, timeout=timeout)
            resp.raise_for_status()
            payload = resp.json()
            details = payload.get("response", {}).get("publishedfiledetails", [])
            for entry in details:
                if entry.get("result") == 1:
                    title = entry.get("title")
                    entry_id = entry.get("publishedfileid")
                    if entry_id and title:
                        resolved[str(entry_id)] = title
    except Exception:
        # API failure; fall back to HTML scraping below
        pass

    # Fallback for unresolved IDs: scrape the item page
    unresolved = [i for i in id_list if i not in resolved]
    for pub_id in unresolved:
        try:
            page_url = f"https://steamcommunity.com/sharedfiles/filedetails/?id={pub_id}"
            html = fetch_page(page_url, timeout=timeout)
            soup = BeautifulSoup(html, "html.parser")
            name = None
            og = soup.find("meta", attrs={"property": "og:title"})
            if og and og.get("content"):
                name = og.get("content").strip()
            if not name:
                title_div = soup.find("div", class_="workshopItemTitle")
                if title_div and title_div.text:
                    name = title_div.text.strip()
            if name:
                resolved[pub_id] = name
        except Exception:
            # Leave unresolved if both methods fail
            pass
    return resolved


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Extract Steam Workshop 'Required items' IDs from a Workshop item page"
    )
    parser.add_argument(
        "url",
        help="Steam Workshop item URL (e.g., https://steamcommunity.com/sharedfiles/filedetails/?id=XXXXXXXX)",
    )
    parser.add_argument("--json", action="store_true", help="Print JSON array instead of plain text")
    args = parser.parse_args()
    try:
        ids = extract_required_item_ids(args.url)
    except requests.HTTPError as http_err:
        print(f"HTTP error: {http_err}", file=sys.stderr)
        sys.exit(2)
    except Exception as exc:
        print(f"Failed to extract IDs: {exc}", file=sys.stderr)
        sys.exit(1)

    if args.json:
        print(json.dumps(ids))
    else:
        if not ids:
            print("No required item IDs found.")
        else:
            print("\n".join(ids))


if __name__ == "__main__":
    main()
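
# Usage sketch (the filename workshop_requirements.py is illustrative):
#
#   python workshop_requirements.py "https://steamcommunity.com/sharedfiles/filedetails/?id=XXXXXXXX"
#   python workshop_requirements.py --json "https://steamcommunity.com/sharedfiles/filedetails/?id=XXXXXXXX"
#
# resolve_workshop_names() is available to callers that also want titles, e.g.:
#
#   ids = extract_required_item_ids(url)
#   titles = resolve_workshop_names(ids)
#   for item_id in ids:
#       print(item_id, titles.get(item_id, "<unresolved>"), sep="\t")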