versioning: 0.0.4, feat: create listing, source image
Build Release EXE / build-windows-exe (release) Successful in 52s

This commit is contained in:
2026-05-08 00:02:59 -04:00
parent e2f87481d6
commit 97c751c585
7 changed files with 297 additions and 12 deletions
+68 -1
View File
@@ -1,8 +1,10 @@
from __future__ import annotations
from html.parser import HTMLParser
import base64
import json
from typing import Any
from urllib.parse import urljoin
import httpx
@@ -41,6 +43,23 @@ class CornerstoneClient:
raise CornerstoneError(f"Cornerstone HTTP {response.status_code}: {response.text[:240]}")
return {"url": str(response.url), "html": response.text}
async def get_image_data(self, url: str, max_bytes: int = 10_000_000) -> dict[str, Any]:
async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
response = await client.get(url, headers={"Accept": "image/png,image/jpeg,image/*"})
if response.status_code >= 400:
raise CornerstoneError(f"Cornerstone image HTTP {response.status_code}: {response.text[:240]}")
content_type = response.headers.get("content-type", "").split(";")[0].strip().casefold()
if content_type not in {"image/jpeg", "image/jpg", "image/png"}:
raise CornerstoneError(f"Cornerstone image was not JPG or PNG: {content_type or 'unknown content type'}")
if len(response.content) > max_bytes:
raise CornerstoneError(f"Cornerstone image is larger than {max_bytes} bytes.")
return {
"url": str(response.url),
"content_type": content_type,
"size_bytes": len(response.content),
"image_data": base64.b64encode(response.content).decode("ascii"),
}
async def _get_json(self, path: str) -> Any:
async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
response = await client.get(f"{self.base_url}/{path.lstrip('/')}", headers={"Accept": "application/json"})
@@ -58,6 +77,7 @@ class CornerstonePageParser(HTMLParser):
super().__init__(convert_charrefs=True)
self.title = ""
self.tables: list[list[list[str]]] = []
self.images: list[dict[str, str]] = []
self._skip_depth = 0
self._in_title = False
self._current_table: list[list[str]] | None = None
@@ -73,6 +93,29 @@ class CornerstonePageParser(HTMLParser):
return
if tag == "title":
self._in_title = True
elif tag == "meta":
attr_map = self._attrs(attrs)
name = (attr_map.get("property") or attr_map.get("name") or "").casefold()
content = attr_map.get("content") or ""
if content and name in {"og:image", "twitter:image", "twitter:image:src"}:
self.images.append({"url": content, "source": name})
elif tag == "link":
attr_map = self._attrs(attrs)
rel = (attr_map.get("rel") or "").casefold()
href = attr_map.get("href") or ""
if href and "image_src" in rel:
self.images.append({"url": href, "source": "link:image_src"})
elif tag == "img":
attr_map = self._attrs(attrs)
url = attr_map.get("src") or attr_map.get("data-src") or attr_map.get("data-original") or ""
if url:
self.images.append(
{
"url": url,
"alt": attr_map.get("alt") or "",
"source": "img",
}
)
elif tag == "table":
self._current_table = []
elif tag == "tr" and self._current_table is not None:
@@ -110,8 +153,12 @@ class CornerstonePageParser(HTMLParser):
if self._current_cell is not None:
self._current_cell.append(data)
@staticmethod
def _attrs(attrs: list[tuple[str, str | None]]) -> dict[str, str]:
return {key.casefold(): value or "" for key, value in attrs}
def parse_cornerstone_item_page(html: str) -> dict[str, Any]:
def parse_cornerstone_item_page(html: str, page_url: str | None = None) -> dict[str, Any]:
parser = CornerstonePageParser()
parser.feed(html)
info: dict[str, Any] = {"page_title": " ".join(parser.title.split())}
@@ -142,6 +189,9 @@ def parse_cornerstone_item_page(html: str) -> dict[str, Any]:
general[key] = value
info["name"] = general.get("name") or _name_from_title(info["page_title"])
media = _dedupe_media(parser.images, page_url)
if media:
info["media"] = media
if general:
info["general"] = general
info["locations"] = locations
@@ -157,3 +207,20 @@ def _name_from_title(title: str) -> str | None:
if " - " not in title:
return title or None
return title.rsplit(" - ", 1)[-1].strip() or None
def _dedupe_media(images: list[dict[str, str]], page_url: str | None = None) -> list[dict[str, str]]:
media = []
seen = set()
for image in images:
raw_url = (image.get("url") or "").strip()
if not raw_url or raw_url.startswith("data:"):
continue
url = urljoin(page_url or "", raw_url)
if url in seen:
continue
seen.add(url)
item = dict(image)
item["url"] = url
media.append(item)
return media