versioning: 0.0.4, feat: create listing, source image
Build Release EXE / build-windows-exe (release) Successful in 52s
Build Release EXE / build-windows-exe (release) Successful in 52s
This commit is contained in:
@@ -1,8 +1,10 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from html.parser import HTMLParser
|
||||
import base64
|
||||
import json
|
||||
from typing import Any
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import httpx
|
||||
|
||||
@@ -41,6 +43,23 @@ class CornerstoneClient:
|
||||
raise CornerstoneError(f"Cornerstone HTTP {response.status_code}: {response.text[:240]}")
|
||||
return {"url": str(response.url), "html": response.text}
|
||||
|
||||
async def get_image_data(self, url: str, max_bytes: int = 10_000_000) -> dict[str, Any]:
    """Download a JPG or PNG image and return it base64-encoded.

    The body is streamed and the download is abandoned as soon as more than
    ``max_bytes`` have arrived, so an oversized response is never buffered
    in full before being rejected.

    Args:
        url: Address of the image; redirects are followed.
        max_bytes: Largest accepted (decoded) body size in bytes.

    Returns:
        A dict with the final ``url`` after redirects, the normalised
        ``content_type``, ``size_bytes`` and the base64 text ``image_data``.

    Raises:
        CornerstoneError: On an HTTP status >= 400, a content type other
            than JPG/PNG, or a body larger than ``max_bytes``.
    """
    request_headers = {"Accept": "image/png,image/jpeg,image/*"}
    async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
        async with client.stream("GET", url, headers=request_headers) as response:
            if response.status_code >= 400:
                # A streamed body has to be read before ``.text`` can be used.
                await response.aread()
                raise CornerstoneError(f"Cornerstone image HTTP {response.status_code}: {response.text[:240]}")
            # Drop parameters such as "; charset=..." and normalise the case.
            content_type = response.headers.get("content-type", "").split(";")[0].strip().casefold()
            if content_type not in {"image/jpeg", "image/jpg", "image/png"}:
                raise CornerstoneError(f"Cornerstone image was not JPG or PNG: {content_type or 'unknown content type'}")
            payload = bytearray()
            async for chunk in response.aiter_bytes():
                payload += chunk
                # Stop as soon as the cap is crossed instead of reading on.
                if len(payload) > max_bytes:
                    raise CornerstoneError(f"Cornerstone image is larger than {max_bytes} bytes.")
            final_url = str(response.url)
    return {
        "url": final_url,
        "content_type": content_type,
        "size_bytes": len(payload),
        "image_data": base64.b64encode(payload).decode("ascii"),
    }
|
||||
|
||||
async def _get_json(self, path: str) -> Any:
|
||||
async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
|
||||
response = await client.get(f"{self.base_url}/{path.lstrip('/')}", headers={"Accept": "application/json"})
|
||||
@@ -58,6 +77,7 @@ class CornerstonePageParser(HTMLParser):
|
||||
super().__init__(convert_charrefs=True)
|
||||
self.title = ""
|
||||
self.tables: list[list[list[str]]] = []
|
||||
self.images: list[dict[str, str]] = []
|
||||
self._skip_depth = 0
|
||||
self._in_title = False
|
||||
self._current_table: list[list[str]] | None = None
|
||||
@@ -73,6 +93,29 @@ class CornerstonePageParser(HTMLParser):
|
||||
return
|
||||
if tag == "title":
|
||||
self._in_title = True
|
||||
elif tag == "meta":
|
||||
attr_map = self._attrs(attrs)
|
||||
name = (attr_map.get("property") or attr_map.get("name") or "").casefold()
|
||||
content = attr_map.get("content") or ""
|
||||
if content and name in {"og:image", "twitter:image", "twitter:image:src"}:
|
||||
self.images.append({"url": content, "source": name})
|
||||
elif tag == "link":
|
||||
attr_map = self._attrs(attrs)
|
||||
rel = (attr_map.get("rel") or "").casefold()
|
||||
href = attr_map.get("href") or ""
|
||||
if href and "image_src" in rel:
|
||||
self.images.append({"url": href, "source": "link:image_src"})
|
||||
elif tag == "img":
|
||||
attr_map = self._attrs(attrs)
|
||||
url = attr_map.get("src") or attr_map.get("data-src") or attr_map.get("data-original") or ""
|
||||
if url:
|
||||
self.images.append(
|
||||
{
|
||||
"url": url,
|
||||
"alt": attr_map.get("alt") or "",
|
||||
"source": "img",
|
||||
}
|
||||
)
|
||||
elif tag == "table":
|
||||
self._current_table = []
|
||||
elif tag == "tr" and self._current_table is not None:
|
||||
@@ -110,8 +153,12 @@ class CornerstonePageParser(HTMLParser):
|
||||
if self._current_cell is not None:
|
||||
self._current_cell.append(data)
|
||||
|
||||
@staticmethod
|
||||
def _attrs(attrs: list[tuple[str, str | None]]) -> dict[str, str]:
|
||||
return {key.casefold(): value or "" for key, value in attrs}
|
||||
|
||||
def parse_cornerstone_item_page(html: str) -> dict[str, Any]:
|
||||
|
||||
def parse_cornerstone_item_page(html: str, page_url: str | None = None) -> dict[str, Any]:
|
||||
parser = CornerstonePageParser()
|
||||
parser.feed(html)
|
||||
info: dict[str, Any] = {"page_title": " ".join(parser.title.split())}
|
||||
@@ -142,6 +189,9 @@ def parse_cornerstone_item_page(html: str) -> dict[str, Any]:
|
||||
general[key] = value
|
||||
|
||||
info["name"] = general.get("name") or _name_from_title(info["page_title"])
|
||||
media = _dedupe_media(parser.images, page_url)
|
||||
if media:
|
||||
info["media"] = media
|
||||
if general:
|
||||
info["general"] = general
|
||||
info["locations"] = locations
|
||||
@@ -157,3 +207,20 @@ def _name_from_title(title: str) -> str | None:
|
||||
if " - " not in title:
|
||||
return title or None
|
||||
return title.rsplit(" - ", 1)[-1].strip() or None
|
||||
|
||||
|
||||
def _dedupe_media(images: list[dict[str, str]], page_url: str | None = None) -> list[dict[str, str]]:
    """Return the usable images with resolved URLs and duplicates removed.

    Entries with a blank URL or an inline ``data:`` URI are dropped. Each
    remaining URL is resolved against ``page_url`` with ``urljoin``; the
    first entry seen for a resolved URL is kept and later repeats are
    skipped. The input dicts are copied, never modified.

    Args:
        images: Image records, each expected to carry a ``"url"`` key.
        page_url: Base used to resolve relative URLs. When it is ``None`` or
            empty, URLs are kept as written (after stripping whitespace).

    Returns:
        Copies of the kept records, in input order, with ``"url"`` replaced
        by the resolved URL.
    """
    base_url = page_url or ""
    media: list[dict[str, str]] = []
    seen: set[str] = set()
    for image in images:
        raw_url = (image.get("url") or "").strip()
        # URI schemes are case-insensitive, so "DATA:" is inline data as well.
        if not raw_url or raw_url[:5].casefold() == "data:":
            continue
        url = urljoin(base_url, raw_url)
        if url in seen:
            continue
        seen.add(url)
        # Copy so the caller's record keeps its original URL.
        media.append({**image, "url": url})
    return media
|
||||
|
||||
Reference in New Issue
Block a user