SeaLoader/steam_required_ids.py
HRiggs 5e68076bbb
Steam Based ID Finding, Version Control, Storage
2025-09-16 23:31:51 -04:00


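"""Extract Steam Workshop "Required items" IDs for a Workshop item.

The helpers below fetch an item's page, scrape the IDs linked from its
"Required items" section, optionally expand those dependencies recursively,
and resolve IDs to titles via the Steam Web API (with an HTML-scraping
fallback). Run as a script, it prints the IDs found for a single Workshop URL.
"""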
import argparse
import json
import re
import sys
from typing import Iterable, List, Set, Dict
from urllib.parse import urlparse, parse_qs
import requests
from bs4 import BeautifulSoup
WORKSHOP_ITEM_ID_REGEX = re.compile(r"id=(\d+)")
def extract_id_from_href(href: str) -> str | None:
if not href:
return None
# Accept absolute or relative Steam workshop/sharedfiles links
if "filedetails" not in href or "id=" not in href:
return None
try:
parsed = urlparse(href)
# Handle relative URLs like "/sharedfiles/filedetails/?id=123"
        query = parsed.query or (href.split("?", 1)[1] if "?" in href else "")
qs = parse_qs(query)
if "id" in qs and qs["id"]:
candidate = qs["id"][0]
return candidate if candidate.isdigit() else None
except Exception:
match = WORKSHOP_ITEM_ID_REGEX.search(href)
if match:
return match.group(1)
return None
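# Illustrative behavior of the helper above (the numeric ID is a placeholder):
#   extract_id_from_href("/sharedfiles/filedetails/?id=123456")  -> "123456"
#   extract_id_from_href("https://example.com/other?page=2")     -> None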
def parse_main_item_id(url: str) -> str | None:
try:
parsed = urlparse(url)
qs = parse_qs(parsed.query)
if "id" in qs and qs["id"]:
candidate = qs["id"][0]
return candidate if candidate.isdigit() else None
except Exception:
pass
return None
def collect_ids_from_elements(elements: Iterable) -> Set[str]:
ids: Set[str] = set()
for el in elements:
        # Anchor tags expose get(); anything else contributes no link
        getter = getattr(el, "get", None)
        link = el.get("href", "") if callable(getter) else ""
item_id = extract_id_from_href(link)
if item_id:
ids.add(item_id)
return ids
def extract_required_item_ids_from_html(html: str) -> Set[str]:
soup = BeautifulSoup(html, "html.parser")
# Strategy 1: Look for a section headed "Required items" and parse links within
section_ids: Set[str] = set()
heading_candidates = soup.find_all(string=re.compile(r"^\s*Required\s+items\s*$", re.IGNORECASE))
for heading in heading_candidates:
parent = heading.parent
if parent is None:
continue
# Search within nearby container siblings/descendants for links
container = parent
for _ in range(3): # climb up a few levels to catch the full block
if container is None:
break
links = container.find_all("a", href=True)
section_ids |= collect_ids_from_elements(links)
container = container.parent
if section_ids:
return section_ids
# Strategy 2: Look for any block that contains the sentence used by Steam
hint_blocks = soup.find_all(string=re.compile(r"requires\s+all\s+of\s+the\s+following\s+other\s+items", re.IGNORECASE))
for hint in hint_blocks:
container = hint.parent
for _ in range(3):
if container is None:
break
links = container.find_all("a", href=True)
section_ids |= collect_ids_from_elements(links)
container = container.parent
if section_ids:
return section_ids
# Strategy 3 (fallback): scan all anchors on the page
all_links = soup.find_all("a", href=True)
return collect_ids_from_elements(all_links)
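# Typical use (mirrors extract_required_item_ids below): parse a fetched page
# for its dependency IDs, e.g.
#   ids = extract_required_item_ids_from_html(fetch_page(url))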
def fetch_page(url: str, timeout: int = 20) -> str:
headers = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/126.0.0.0 Safari/537.36"
),
"Accept-Language": "en-US,en;q=0.9",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
}
# Steam can occasionally require a cookie for age gates. Provide innocuous defaults.
    cookies = {
        "birthtime": "568022401",  # 1988-01-01
        "lastagecheckage": "1-January-1990",
        "mature_content": "1",
    }
resp = requests.get(url, headers=headers, cookies=cookies, timeout=timeout)
resp.raise_for_status()
return resp.text
def extract_required_item_ids(url: str) -> List[str]:
html = fetch_page(url)
found_ids = extract_required_item_ids_from_html(html)
    # Include the current page's own ID so the main item is part of the result
current_id = parse_main_item_id(url)
if current_id:
found_ids.add(current_id)
return sorted(found_ids, key=int)
def extract_required_item_ids_for_id(pub_id: str) -> List[str]:
"""Fetch required items for a specific Workshop item ID, including itself."""
page_url = f"https://steamcommunity.com/sharedfiles/filedetails/?id={pub_id}"
html = fetch_page(page_url)
found_ids = extract_required_item_ids_from_html(html)
if pub_id:
found_ids.add(pub_id)
return sorted(found_ids, key=int)
def expand_required_ids_recursive(initial_ids: List[str], max_pages: int = 200) -> List[str]:
"""Expand a set of Workshop IDs by following 'Required items' recursively.
- Starts from initial_ids
- For each id, fetches its page and collects its required items
- Continues breadth-first until no new IDs are found or max_pages is reached
"""
queue: List[str] = [i for i in initial_ids if i and i.isdigit()]
visited: Set[str] = set()
all_ids: Set[str] = set(queue)
while queue and len(visited) < max_pages:
current = queue.pop(0)
if current in visited:
continue
visited.add(current)
try:
deps = extract_required_item_ids_for_id(current)
except Exception:
deps = [current]
for dep in deps:
if dep not in all_ids:
all_ids.add(dep)
if dep not in visited:
queue.append(dep)
return sorted(all_ids, key=int)
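# Illustrative usage (the ID is a placeholder, not a real Workshop item):
#   all_ids = expand_required_ids_recursive(["123456789"])
#   # -> the item itself plus every transitive "Required items" dependency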
def resolve_workshop_names(ids: List[str], timeout: int = 20) -> Dict[str, str]:
"""Resolve Workshop IDs to human-readable titles using Steam API, with HTML fallback.
Uses ISteamRemoteStorage.GetPublishedFileDetails, batching up to 100 IDs per call.
Falls back to scraping each item's page if the API fails.
"""
    id_list = list(dict.fromkeys(i for i in ids if i and i.isdigit()))
if not id_list:
return {}
headers = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/126.0.0.0 Safari/537.36"
),
}
api_url = "https://api.steampowered.com/ISteamRemoteStorage/GetPublishedFileDetails/v1/"
resolved: Dict[str, str] = {}
try:
session = requests.Session()
session.headers.update(headers)
batch_size = 100
for start in range(0, len(id_list), batch_size):
batch = id_list[start:start + batch_size]
data = {"itemcount": len(batch)}
for idx, pub_id in enumerate(batch):
data[f"publishedfileids[{idx}]"] = pub_id
resp = session.post(api_url, data=data, timeout=timeout)
resp.raise_for_status()
payload = resp.json()
details = payload.get("response", {}).get("publishedfiledetails", [])
for entry in details:
if entry.get("result") == 1:
title = entry.get("title")
pub_id = str(entry.get("publishedfileid"))
if pub_id and title:
resolved[pub_id] = title
except Exception:
# API failure; fall back to HTML scraping below
pass
# Fallback for unresolved IDs: scrape the item page
unresolved = [i for i in id_list if i not in resolved]
for pub_id in unresolved:
try:
page_url = f"https://steamcommunity.com/sharedfiles/filedetails/?id={pub_id}"
html = fetch_page(page_url, timeout=timeout)
soup = BeautifulSoup(html, "html.parser")
name = None
og = soup.find("meta", attrs={"property": "og:title"})
if og and og.get("content"):
name = og.get("content").strip()
if not name:
title_div = soup.find("div", class_="workshopItemTitle")
if title_div and title_div.text:
name = title_div.text.strip()
if name:
resolved[pub_id] = name
except Exception:
# Leave unresolved if both methods fail
pass
return resolved
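# Illustrative usage, continuing the placeholder sketch above:
#   names = resolve_workshop_names(all_ids)
#   # -> e.g. {"123456789": "Some Mod Title"} for every ID that could be resolved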
def main() -> None:
parser = argparse.ArgumentParser(description="Extract Steam Workshop 'Required items' IDs from a Workshop item page")
parser.add_argument("url", help="Steam Workshop item URL (e.g., https://steamcommunity.com/sharedfiles/filedetails/?id=XXXXXXXX)")
parser.add_argument("--json", action="store_true", help="Print JSON array instead of plain text")
args = parser.parse_args()
try:
ids = extract_required_item_ids(args.url)
except requests.HTTPError as http_err:
print(f"HTTP error: {http_err}", file=sys.stderr)
sys.exit(2)
except Exception as exc:
print(f"Failed to extract IDs: {exc}", file=sys.stderr)
sys.exit(1)
if args.json:
print(json.dumps(ids))
else:
if not ids:
print("No required item IDs found.")
else:
print("\n".join(ids))
if __name__ == "__main__":
main()
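# Example invocation (assuming the module is run directly as a script):
#   python steam_required_ids.py "https://steamcommunity.com/sharedfiles/filedetails/?id=XXXXXXXX" --json
# Without --json, the IDs are printed one per line; with it, as a JSON array.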