# Tools/scripts/find_workshop_ids.py

#!/usr/bin/env python3

"""
Find corresponding Steam Workshop IDs for Project Zomboid mods listed in a mods list file.

This script reads a semicolon-separated list of mod entries (like the contents of mods.txt),
indexes the local Workshop directory for Project Zomboid (app id 108600), and for each mod
attempts to find the Workshop item id.

Matching strategy (in order):
1) If an entry looks like "<digits>/<anything>" (six or more digits), use the digits as the workshop id directly.
2) Normalized match (case-insensitive, non-alphanumeric characters removed) on mod IDs parsed from mod.info files.
3) Normalized match on mod names (from mod.info, or from workshop.txt when present).
4) The same ID and name lookups again after stripping bracketed tags such as "[B42]" from the entry.

Outputs a CSV-like file with semicolon-separated fields per input entry:
  original_entry;workshop_id|NOT_FOUND;match_type;matched_value;source_path

Defaults:
- Mods file: mods.txt in current directory
- Workshop directory: G:\\SteamLibrary\\steamapps\\workshop\\content\\108600
- Output file: workshop_ids_out.txt in current directory

Usage examples:
  python scripts/find_workshop_ids.py
  python scripts/find_workshop_ids.py --mods-file d:\\7. Git\\tools\\mods.txt \
      --workshop-dir G:\\SteamLibrary\\steamapps\\workshop\\content\\108600 \
      --output workshop_ids_out.txt
"""

from __future__ import annotations

import argparse
import csv
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple


# Default location of the Project Zomboid (app id 108600) Workshop content folder.
# The path is machine-specific; override it with --workshop-dir.
DEFAULT_WORKSHOP_DIR = Path(r"G:\SteamLibrary\steamapps\workshop\content\108600")


@dataclass(frozen=True)
class ModRecord:
    """One mod discovered inside a Workshop item folder (immutable)."""

    # Name of the all-digit Workshop item directory the mod was found in.
    workshop_id: str
    # Mod id parsed from mod.info; None for records built from workshop.txt only
    # or from a mod.info that declares no id.
    mod_id: Optional[str]
    # Display name from mod.info, falling back to the workshop.txt name; may be None.
    mod_name: Optional[str]
    # File the record was parsed from (a mod.info, or workshop.txt for name-only records).
    source_path: Path


def normalize(text: str) -> str:
    """Return *text* lowercased with every non-alphanumeric character removed.

    The result is the key used for fuzzy comparisons between mods-list entries
    and the ids/names found on disk.

    Letters and digits from any script are kept. An ASCII-only filter would
    reduce names written in Cyrillic, CJK, etc. to the empty string, making all
    such names collide with each other. For pure-ASCII input the result is
    identical to stripping everything outside ``[a-z0-9]``.
    """
    # \W is Unicode-aware for str patterns; "_" counts as a word character,
    # so it has to be removed explicitly.
    return re.sub(r"[\W_]+", "", text.lower())


def parse_mods_list(mods_text: str) -> List[str]:
    """Return the non-blank entries of a mods list, in their original order.

    Entries are separated by semicolons and/or line breaks; surrounding
    whitespace is removed from each entry and empty entries are discarded.
    """
    stripped_chunks = (chunk.strip() for chunk in re.split(r"[;\n\r]+", mods_text))
    return [chunk for chunk in stripped_chunks if chunk]


def read_text_file(path: Path) -> str:
    """Return the text content of *path*, or ``""`` if it cannot be read.

    The file is decoded as UTF-8; undecodable bytes are dropped. A leading
    UTF-8 byte-order mark (written by some Windows editors) is removed so it
    cannot end up glued to the first key or entry of the file.

    This is deliberately best-effort: a missing or unreadable file yields an
    empty string instead of an exception.
    """
    try:
        # "utf-8-sig" behaves like "utf-8" but also strips a leading BOM.
        return path.read_text(encoding="utf-8-sig", errors="ignore")
    except (OSError, ValueError):
        # OSError: missing file, directory, permission problems, ...
        # ValueError: malformed path (e.g. embedded NUL byte).
        return ""


def parse_mod_info(mod_info_text: str) -> Tuple[List[str], Optional[str]]:
    """
    Pull the mod ids and the display name out of the text of a mod.info file.

    - Every ``id=...`` line contributes ids; one line may carry several ids
      separated by commas or semicolons.
    - ``name=...`` supplies the name; when several non-empty name lines exist,
      the last one wins.
    - Keys are matched case-insensitively; blank lines and lines starting with
      ``#`` are skipped.

    Returns ``(ids, name)`` where ``name`` is None if no non-empty name was found.
    """
    collected_ids: List[str] = []
    display_name: Optional[str] = None

    for raw_line in mod_info_text.splitlines():
        entry = raw_line.strip()
        if entry == "" or entry.startswith("#"):
            continue

        key, separator, raw_value = entry.partition("=")
        if not separator:
            continue

        field = key.lower()
        value = raw_value.strip()
        if field == "id":
            # A single id= line may list several ids.
            pieces = (piece.strip() for piece in re.split(r"[,;]", value))
            collected_ids.extend(piece for piece in pieces if piece)
        elif field == "name" and value:
            display_name = value

    return collected_ids, display_name


def parse_workshop_txt(workshop_txt: str) -> Optional[str]:
    """Extract a human-readable item name from the text of a workshop.txt file.

    The first non-empty ``name=`` or ``title=`` value is returned (keys are
    matched case-insensitively). ``title=`` is accepted because that is the key
    Project Zomboid's workshop.txt uses for the item name; ``name=`` is kept
    for backward compatibility. Blank lines and lines starting with ``#`` are
    skipped.

    Returns None when no such value is present.
    """
    for raw_line in workshop_txt.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        key, separator, raw_value = line.partition("=")
        if separator and key.lower() in ("name", "title"):
            value = raw_value.strip()
            if value:
                return value
    return None


def index_workshop(workshop_dir: Path) -> Tuple[Dict[str, List[ModRecord]], Dict[str, List[ModRecord]], Dict[str, List[ModRecord]]]:
    """
    Walk the workshop directory and build three lookup indices:
    - by_id: normalized mod id -> ModRecord list
    - by_name: normalized mod name -> ModRecord list
    - by_workshop: workshop id string -> ModRecord list

    Only immediate sub-directories whose name is all digits are treated as
    Workshop items. Directories and mod.info files are visited in sorted order
    so that the first record stored under a key (which is what lookups use) is
    the same on every run and every filesystem.

    Returns three empty dicts when *workshop_dir* is not an existing directory.
    """
    by_id: Dict[str, List[ModRecord]] = {}
    by_name: Dict[str, List[ModRecord]] = {}
    by_workshop: Dict[str, List[ModRecord]] = {}

    # is_dir() is already False for a path that does not exist.
    if not workshop_dir.is_dir():
        return by_id, by_name, by_workshop

    for child in sorted(workshop_dir.iterdir()):
        if not (child.is_dir() and child.name.isdigit()):
            continue
        workshop_id = child.name

        # Typical structure: <workshop_id>/mods/*/mod.info
        # (rglob searches mods/ recursively, at any depth).
        mods_root = child / "mods"
        mod_info_paths: List[Path] = []
        if mods_root.is_dir():
            mod_info_paths = sorted(p for p in mods_root.rglob("mod.info") if p.is_file())

        # Fall back to any mod.info anywhere inside the workshop item (less common)
        if not mod_info_paths:
            mod_info_paths = sorted(p for p in child.rglob("mod.info") if p.is_file())

        # Try to get workshop name from workshop.txt (optional)
        workshop_name = None
        workshop_txt_path = child / "workshop.txt"
        if workshop_txt_path.exists():
            workshop_name = parse_workshop_txt(read_text_file(workshop_txt_path))

        # If no mod.info was found, still index by the workshop name to help matching
        if not mod_info_paths and workshop_name:
            record = ModRecord(
                workshop_id=workshop_id,
                mod_id=None,
                mod_name=workshop_name,
                source_path=workshop_txt_path,
            )
            by_name.setdefault(normalize(workshop_name), []).append(record)
            by_workshop.setdefault(workshop_id, []).append(record)
            continue

        for mod_info_path in mod_info_paths:
            text = read_text_file(mod_info_path)
            if not text:
                continue
            mod_ids, mod_name = parse_mod_info(text)

            # Prefer mod.info name; fall back to workshop.txt name if absent
            effective_name = mod_name or workshop_name

            # One record per declared id, reachable by id and by workshop item.
            for mod_id in mod_ids:
                record = ModRecord(
                    workshop_id=workshop_id,
                    mod_id=mod_id,
                    mod_name=effective_name,
                    source_path=mod_info_path,
                )
                by_id.setdefault(normalize(mod_id), []).append(record)
                by_workshop.setdefault(workshop_id, []).append(record)

            # The name index carries a single record per mod.info, tagged with
            # the first declared id (if any). It is not added to by_workshop.
            if effective_name:
                record_for_name = ModRecord(
                    workshop_id=workshop_id,
                    mod_id=(mod_ids[0] if mod_ids else None),
                    mod_name=effective_name,
                    source_path=mod_info_path,
                )
                by_name.setdefault(normalize(effective_name), []).append(record_for_name)

    return by_id, by_name, by_workshop


def try_extract_numeric_workshop_id(token: str) -> Optional[str]:
    """Give back the leading run of six or more digits when *token* has the
    shape '<digits>/<anything>' (a backslash separator is accepted as well);
    otherwise give back None."""
    found = re.match(r"^(\d{6,})(?:/|\\).*$", token)
    return found.group(1) if found else None


def _lookup_normalized(
    key: str,
    by_id: Dict[str, List[ModRecord]],
    by_name: Dict[str, List[ModRecord]],
    id_match_type: str,
    name_match_type: str,
) -> Optional[Tuple[str, str, str, str]]:
    """Look *key* up in the id index, then the name index; None if neither hits."""
    # An empty key carries no information: matching on it would pair any
    # entry made only of stripped characters with an unrelated mod.
    if not key:
        return None

    id_hits = by_id.get(key)
    if id_hits:
        record = id_hits[0]
        return record.workshop_id, id_match_type, (record.mod_id or ""), str(record.source_path)

    name_hits = by_name.get(key)
    if name_hits:
        record = name_hits[0]
        return record.workshop_id, name_match_type, (record.mod_name or ""), str(record.source_path)

    return None


def match_token(
    token: str,
    by_id: Dict[str, List[ModRecord]],
    by_name: Dict[str, List[ModRecord]],
) -> Tuple[str, str, str, str]:
    """
    Attempt to find a workshop id for the given token.

    Steps, in order:
    1) a token of the form '<digits>/<anything>' yields its digits directly
       (match type "provided_numeric", no index lookup);
    2) the normalized token is looked up in the id index ("mod_id"), then the
       name index ("mod_name");
    3) the same two lookups are repeated with bracketed tags such as "[B42]"
       removed ("mod_id_normalized" / "mod_name_normalized").

    When several records share a key, the first one indexed is used. Tokens
    that normalize to an empty string are never matched against the indices.

    Returns tuple: (workshop_id_or_NOT_FOUND, match_type, matched_value, source_path)
    """
    # 1) Direct numeric extraction
    numeric = try_extract_numeric_workshop_id(token)
    if numeric:
        return numeric, "provided_numeric", numeric, ""

    # 2) Id match, then name match, on the normalized token
    hit = _lookup_normalized(normalize(token), by_id, by_name, "mod_id", "mod_name")
    if hit is not None:
        return hit

    # 3) Heuristic: strip bracketed tags like [B42] and retry
    token_wo_brackets = re.sub(r"\[[^\]]*\]", "", token).strip()
    if token_wo_brackets and token_wo_brackets != token:
        hit = _lookup_normalized(
            normalize(token_wo_brackets),
            by_id,
            by_name,
            "mod_id_normalized",
            "mod_name_normalized",
        )
        if hit is not None:
            return hit

    # Not found
    return "NOT_FOUND", "no_match", "", ""


def write_output(
    output_path: Path,
    rows: Iterable[Tuple[str, str, str, str, str]],
    mods_line: Optional[str] = None,
) -> None:
    """Dump the result table to *output_path* as semicolon-delimited CSV.

    A header row comes first, followed by one line per element of *rows*.
    If *mods_line* is non-empty, a blank line and then that line are appended
    after the table. The file is written as UTF-8 with "\\n" line endings and
    replaces any existing file.
    """
    header = ["input", "workshop_id", "match_type", "matched_value", "source_path"]
    with output_path.open("w", encoding="utf-8", newline="") as handle:
        table = csv.writer(handle, delimiter=";", lineterminator="\n", quoting=csv.QUOTE_MINIMAL)
        table.writerow(header)
        table.writerows(list(entry) for entry in rows)
        if not mods_line:
            return
        handle.write("\n" + mods_line + "\n")


def _split_numeric_token(token_text: str) -> Tuple[Optional[str], Optional[str]]:
    """Split '<digits>[/suffix]' into (digits, suffix); (None, None) if it does not fit."""
    m = re.match(r"^(\d{6,})(?:[\\/](.*))?$", token_text)
    if not m:
        return None, None
    return m.group(1), (m.group(2) or None)


def _candidate_mod_ids(
    token: str,
    workshop_id: str,
    match_type: str,
    matched_value: str,
    by_workshop: Dict[str, List[ModRecord]],
) -> List[str]:
    """Pick the mod id(s) to list in the Mods= line for one resolved entry."""
    # Matched by id: the matched value is the mod id itself.
    if match_type in ("mod_id", "mod_id_normalized") and matched_value:
        return [matched_value]

    records = by_workshop.get(workshop_id, [])

    # Matched by name: every id indexed for this workshop item under that name.
    if match_type in ("mod_name", "mod_name_normalized") and matched_value:
        wanted_name = normalize(matched_value)
        by_name_ids = [
            rec.mod_id
            for rec in records
            if rec.mod_name and normalize(rec.mod_name) == wanted_name and rec.mod_id
        ]
        if by_name_ids:
            return by_name_ids

    # Entry was given as '<workshop id>/<suffix>': try to pin the suffix to one
    # mod (by mod id or by the folder holding its mod.info), else take them all.
    num, suffix = _split_numeric_token(token)
    if num == workshop_id and records:
        if suffix:
            suffix_norm = normalize(suffix)
            for rec in records:
                folder_name = rec.source_path.parent.name if rec.source_path else ""
                if rec.mod_id and (normalize(rec.mod_id) == suffix_norm or normalize(folder_name) == suffix_norm):
                    return [rec.mod_id]
        all_ids = [rec.mod_id for rec in records if rec.mod_id]
        if all_ids:
            return all_ids

    # Fallback to first non-empty mod_id
    for rec in records:
        if rec.mod_id:
            return [rec.mod_id]
    return []


def _build_mods_line(
    rows: List[Tuple[str, str, str, str, str]],
    by_workshop: Dict[str, List[ModRecord]],
) -> Optional[str]:
    """Build the 'Mods=<workshop id>\\<mod id>;...' line, or None if there is nothing to list."""
    mods_pairs: List[Tuple[str, str]] = []  # (workshop_id, mod_id), first-seen order
    seen: set = set()
    for token, workshop_id, match_type, matched_value, _source_path in rows:
        if workshop_id == "NOT_FOUND":
            continue
        for mod_id in _candidate_mod_ids(token, workshop_id, match_type, matched_value, by_workshop):
            key = (workshop_id, mod_id)
            if key not in seen:
                seen.add(key)
                mods_pairs.append(key)

    if not mods_pairs:
        return None
    return "Mods=" + ";".join(f"{wid}\\{mid}" for wid, mid in mods_pairs) + ";"


def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point: resolve a mods list to Workshop ids and write the report.

    Args:
        argv: Argument list to parse instead of the process arguments. The
            default (None) makes argparse read ``sys.argv[1:]``, which is the
            behaviour of calling ``main()`` with no arguments.

    Returns:
        0 on success; 2 when the mods file is missing, unreadable, empty, or
        contains no entries.
    """
    parser = argparse.ArgumentParser(description="Resolve Project Zomboid mod entries to Workshop IDs")
    parser.add_argument(
        "--mods-file",
        type=Path,
        default=Path("mods.txt"),
        help="Path to the mods list file (semicolon-separated)",
    )
    parser.add_argument(
        "--workshop-dir",
        type=Path,
        default=DEFAULT_WORKSHOP_DIR,
        help="Path to the 108600 workshop content directory",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("workshop_ids_out.txt"),
        help="Path to write the output mapping (CSV with semicolons)",
    )
    args = parser.parse_args(argv)

    mods_file: Path = args.mods_file
    workshop_dir: Path = args.workshop_dir
    output_path: Path = args.output

    mods_text = read_text_file(mods_file)
    if not mods_text:
        print(f"Mods file not found or empty: {mods_file}")
        return 2

    tokens = parse_mods_list(mods_text)
    if not tokens:
        print(f"No entries found in mods file: {mods_file}")
        return 2

    by_id, by_name, by_workshop = index_workshop(workshop_dir)
    if not by_id and not by_name:
        print(f"No workshop items indexed under: {workshop_dir}")
        # Continue anyway so provided numeric ids can still pass through

    # One output row per input entry, in input order.
    rows: List[Tuple[str, str, str, str, str]] = []
    found = 0
    for token in tokens:
        workshop_id, match_type, matched_value, source_path = match_token(token, by_id, by_name)
        if workshop_id != "NOT_FOUND":
            found += 1
        rows.append((token, workshop_id, match_type, matched_value, source_path))

    mods_line = _build_mods_line(rows, by_workshop)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    write_output(output_path, rows, mods_line)

    print(f"Resolved {found}/{len(tokens)} entries. Wrote: {output_path}")
    return 0


# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())