versioning: 0.0.4, feat: create listing, source image

2026-05-08 00:02:59 -04:00
parent e2f87481d6
commit 97c751c585
7 changed files with 297 additions and 12 deletions
@@ -1,8 +1,10 @@
 from __future__ import annotations

 from html.parser import HTMLParser
+import base64
 import json
 from typing import Any
+from urllib.parse import urljoin

 import httpx

@@ -41,6 +43,23 @@ class CornerstoneClient:
            raise CornerstoneError(f"Cornerstone HTTP {response.status_code}: {response.text[:240]}")
        return {"url": str(response.url), "html": response.text}

+    async def get_image_data(self, url: str, max_bytes: int = 10_000_000) -> dict[str, Any]:
+        """Download a JPG or PNG image and return it base64-encoded.
+
+        The body is streamed so the size limit is enforced while reading,
+        instead of after an arbitrarily large payload has been buffered.
+
+        Args:
+            url: URL of the image to fetch; redirects are followed.
+            max_bytes: Largest decoded body, in bytes, that is accepted.
+
+        Returns:
+            A dict with the final ``url`` after redirects, the normalized
+            ``content_type``, ``size_bytes`` and the base64 ``image_data``.
+
+        Raises:
+            CornerstoneError: On an HTTP status >= 400, a content type other
+                than JPG/PNG, or a body larger than ``max_bytes``.
+        """
+        request_headers = {"Accept": "image/png,image/jpeg,image/*"}
+        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
+            async with client.stream("GET", url, headers=request_headers) as response:
+                if response.status_code >= 400:
+                    # A streamed body has to be read before ``.text`` can be used.
+                    await response.aread()
+                    raise CornerstoneError(f"Cornerstone image HTTP {response.status_code}: {response.text[:240]}")
+                content_type = response.headers.get("content-type", "").split(";")[0].strip().casefold()
+                if content_type not in {"image/jpeg", "image/jpg", "image/png"}:
+                    raise CornerstoneError(f"Cornerstone image was not JPG or PNG: {content_type or 'unknown content type'}")
+                # Stop reading as soon as the cap is exceeded so an oversized
+                # response is never held in memory in full.
+                body = bytearray()
+                async for chunk in response.aiter_bytes():
+                    body += chunk
+                    if len(body) > max_bytes:
+                        raise CornerstoneError(f"Cornerstone image is larger than {max_bytes} bytes.")
+                final_url = str(response.url)
+        return {
+            "url": final_url,
+            "content_type": content_type,
+            "size_bytes": len(body),
+            "image_data": base64.b64encode(bytes(body)).decode("ascii"),
+        }
+
    async def _get_json(self, path: str) -> Any:
        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
            response = await client.get(f"{self.base_url}/{path.lstrip('/')}", headers={"Accept": "application/json"})
@@ -58,6 +77,7 @@ class CornerstonePageParser(HTMLParser):
        super().__init__(convert_charrefs=True)
        self.title = ""
        self.tables: list[list[list[str]]] = []
+        self.images: list[dict[str, str]] = []
        self._skip_depth = 0
        self._in_title = False
        self._current_table: list[list[str]] | None = None
@@ -73,6 +93,29 @@ class CornerstonePageParser(HTMLParser):
            return
        if tag == "title":
            self._in_title = True
+        elif tag == "meta":
+            attr_map = self._attrs(attrs)
+            name = (attr_map.get("property") or attr_map.get("name") or "").casefold()
+            content = attr_map.get("content") or ""
+            if content and name in {"og:image", "twitter:image", "twitter:image:src"}:
+                self.images.append({"url": content, "source": name})
+        elif tag == "link":
+            attr_map = self._attrs(attrs)
+            rel = (attr_map.get("rel") or "").casefold()
+            href = attr_map.get("href") or ""
+            if href and "image_src" in rel:
+                self.images.append({"url": href, "source": "link:image_src"})
+        elif tag == "img":
+            attr_map = self._attrs(attrs)
+            url = attr_map.get("src") or attr_map.get("data-src") or attr_map.get("data-original") or ""
+            if url:
+                self.images.append(
+                    {
+                        "url": url,
+                        "alt": attr_map.get("alt") or "",
+                        "source": "img",
+                    }
+                )
        elif tag == "table":
            self._current_table = []
        elif tag == "tr" and self._current_table is not None:
@@ -110,8 +153,12 @@ class CornerstonePageParser(HTMLParser):
        if self._current_cell is not None:
            self._current_cell.append(data)

+    @staticmethod
+    def _attrs(attrs: list[tuple[str, str | None]]) -> dict[str, str]:
+        """Collapse parser attribute pairs into a dict keyed by case-folded name.
+
+        A missing (``None``) or empty value becomes ``""``. When a name is
+        repeated, the last occurrence wins.
+        """
+        attr_map: dict[str, str] = {}
+        for name, raw_value in attrs:
+            attr_map[name.casefold()] = raw_value if raw_value else ""
+        return attr_map

-def parse_cornerstone_item_page(html: str) -> dict[str, Any]:
+
+def parse_cornerstone_item_page(html: str, page_url: str | None = None) -> dict[str, Any]:
    parser = CornerstonePageParser()
    parser.feed(html)
    info: dict[str, Any] = {"page_title": " ".join(parser.title.split())}
@@ -142,6 +189,9 @@ def parse_cornerstone_item_page(html: str) -> dict[str, Any]:
                    general[key] = value

    info["name"] = general.get("name") or _name_from_title(info["page_title"])
+    media = _dedupe_media(parser.images, page_url)
+    if media:
+        info["media"] = media
    if general:
        info["general"] = general
    info["locations"] = locations
@@ -157,3 +207,20 @@ def _name_from_title(title: str) -> str | None:
    if " - " not in title:
        return title or None
    return title.rsplit(" - ", 1)[-1].strip() or None
+
+
+def _dedupe_media(images: list[dict[str, str]], page_url: str | None = None) -> list[dict[str, str]]:
+    """Resolve image URLs against the page URL and drop repeats, keeping first-seen order.
+
+    Entries whose URL is blank or an inline ``data:`` URI are skipped. Each
+    kept entry is a copy of the input dict with ``url`` replaced by its
+    resolved form; the input dicts are not modified.
+    """
+    base = page_url or ""
+    unique: dict[str, dict[str, str]] = {}
+    for entry in images:
+        candidate = (entry.get("url") or "").strip()
+        if candidate and not candidate.startswith("data:"):
+            resolved = urljoin(base, candidate)
+            # setdefault keeps the first entry seen for each resolved URL.
+            unique.setdefault(resolved, {**entry, "url": resolved})
+    return list(unique.values())