Working Ebay Scraping
All checks were successful
Deploy to Server / deploy (push) Successful in 47s

This commit is contained in:
2025-11-03 16:42:52 -05:00
parent a97e79a7d8
commit 338bdf838b
8 changed files with 1354 additions and 91 deletions

View File

@@ -38,9 +38,12 @@ jobs:
git clone https://git.hudsonriggs.systems/HRiggs/Train-ID.git "$APP_DIR"
fi
cd "$APP_DIR"
git pull origin main
# Install Node.js deps and build
"$NPM" ci || "$NPM" install
# Reset any local changes (e.g., package-lock.json, build artifacts) and sync to origin/main
git fetch --prune origin
git reset --hard origin/main
git clean -fdx
# Install Node.js deps and build without modifying lockfile
"$NPM" ci --no-audit --no-fund
"$NPM" run build
# Ensure systemd service exists and restart
if systemctl list-unit-files | grep -q "${SERVICE}.service"; then

View File

@@ -23,6 +23,13 @@ db_port=3306
db_user=trainid
db_pass=changeme
db_name=trainid
# eBay API (optional - falls back to scraping if not configured)
# Get credentials from https://developer.ebay.com/my/keys
# Create an app and use the Client ID and Client Secret
EBAY_CLIENT_ID=YourAppId...
EBAY_CLIENT_SECRET=YourClientSecret...
EBAY_SANDBOX=false # Set to 'true' to use eBay sandbox environment
```
## Local development
@@ -42,6 +49,16 @@ Visit `http://localhost:3000`.
- GET `/api/export/xlsx` → download XLSX export of inventory with embedded thumbnails
- DELETE `/api/items/:id` → delete one
- DELETE `/api/items` → wipe all
- GET `/api/debug/ebay-prices?sku=...` → debug eBay price lookup (shows API and scraping attempts)
- POST `/api/prices/update` → update cached prices for all SKUs from eBay
- GET `/api/price-report` → get price report with item values
### eBay Price Checking
The system uses eBay's Browse API to search for items by SKU. **Important limitations:**
- **Browse API only searches active listings**, not sold/completed ones
- For sold/completed listings, the system falls back to web scraping
- In sandbox mode, test data is limited - you may see 0 results even if the API is working correctly
- API endpoint used: `GET /buy/browse/v1/item_summary/search`, with the SKU passed as the `q` query parameter
## Debian 13 (Trixie) LXC install
These steps assume a fresh Debian 13 LXC and deployment directory `/opt/Train-ID` with a system user `deployuser` that has passwordless sudo for service management.

1024
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -18,8 +18,8 @@
"license": "ISC",
"type": "module",
"dependencies": {
"cors": "^2.8.5",
"cheerio": "^1.0.0",
"cors": "^2.8.5",
"dotenv": "^17.2.3",
"exceljs": "^4.4.0",
"express": "^5.1.0",
@@ -27,7 +27,8 @@
"morgan": "^1.10.1",
"mysql2": "^3.15.3",
"openai": "^6.7.0",
"pdfkit": "^0.17.2"
"pdfkit": "^0.17.2",
"puppeteer": "^24.28.0"
},
"devDependencies": {
"@types/cheerio": "^0.22.35",

View File

@@ -151,6 +151,23 @@ export const db = {
}
return Array.from(set);
},
/**
 * List the distinct (SKU, manufacturer) pairs present in the `items` table.
 *
 * Rows with a null/empty SKU or manufacturer are excluded by the query.
 * Each SKU is passed through `normalizeSku`; rows whose SKU normalizes to an
 * empty value are dropped.
 *
 * `select distinct` only de-duplicates the raw column values, so two rows
 * whose SKUs differ in the database but normalize to the same value would
 * otherwise produce the same pair twice. Pairs are therefore de-duplicated
 * again after normalization, keeping the first occurrence.
 *
 * @returns Unique `{ sku, manufacturer }` pairs, with `sku` normalized.
 */
async listDistinctSkuManufacturers() {
  await getReady();
  // Get distinct SKU + manufacturer pairs
  const [rows] = await pool.query(`
    select distinct sku, manufacturer
    from items
    where sku is not null and sku <> '' and manufacturer is not null and manufacturer <> ''
  `);
  const seen = new Set<string>();
  const pairs: Array<{ sku: string; manufacturer: string }> = [];
  for (const r of rows as any[]) {
    const s = normalizeSku(r.sku as string);
    if (!s || !r.manufacturer) continue;
    // NUL separator keeps ("ab", "c") and ("a", "bc") from colliding.
    const key = `${s}\u0000${r.manufacturer}`;
    if (seen.has(key)) continue;
    seen.add(key);
    pairs.push({ sku: s, manufacturer: r.manufacturer });
  }
  return pairs;
},
async upsertSkuPrice(sku: string, price: number | null, currency: string = 'USD', source: string = 'ebay') {
await getReady();
const s = normalizeSku(sku);

View File

@@ -86,6 +86,8 @@
<table id="skuPrices">
<thead>
<tr>
<th>ID</th>
<th>Name</th>
<th>SKU</th>
<th>Price (USD)</th>
<th class="hide-sm">Updated</th>
@@ -203,7 +205,9 @@
const tr = document.createElement('tr');
const title = s.description || '';
tr.innerHTML = `
<td title="${title.replaceAll('"', '\\"')}">${s.sku}</td>
<td>${s.id !== null ? s.id : ''}</td>
<td title="${title.replaceAll('"', '\\"')}">${title || ''}</td>
<td>${s.sku}</td>
<td>${s.price !== null ? `$${s.price.toFixed(2)}` : ''}</td>
<td class="hide-sm">${s.updatedAt ? new Date(s.updatedAt).toLocaleString() : ''}</td>
`;

View File

@@ -178,10 +178,14 @@ router.delete('/items', async (_req, res) => {
router.get('/debug/ebay-prices', async (req, res) => {
try {
const skuParam = typeof req.query.sku === 'string' ? req.query.sku : '';
const manufacturerParam = typeof req.query.manufacturer === 'string' ? req.query.manufacturer : null;
const norm = normalizeSku(skuParam);
if (!norm) return res.status(400).json({ error: 'sku required' });
const details = await debugFetchSoldPricesUSDForSku(norm);
res.json({ sku: norm, ...details });
const details = await debugFetchSoldPricesUSDForSku(norm, manufacturerParam);
res.json({
...details,
note: 'Scraping sold/completed listings using manufacturer + SKU for better search results.',
});
} catch (err: any) {
console.error(err);
res.status(500).json({ error: err.message || 'Failed to debug ebay prices' });
@@ -191,22 +195,42 @@ router.get('/debug/ebay-prices', async (req, res) => {
// Update cached prices for all distinct SKUs (on-demand only)
router.post('/prices/update', async (_req, res) => {
try {
const skus = await db.listDistinctSkus();
// Get SKU + manufacturer pairs for better search results
const skuManufacturers = await db.listDistinctSkuManufacturers();
const out: Record<string, number | null> = {};
for (const sku of skus) {
for (const { sku, manufacturer } of skuManufacturers) {
try {
const norm = normalizeSku(sku);
if (!norm) { out[sku] = null; continue; }
const price = await fetchMedianSoldPriceUSDForSku(norm);
await db.upsertSkuPrice(norm, price, 'USD', 'ebay');
out[norm] = price;
const norm = normalizeSku(sku);
if (!norm) { out[sku] = null; continue; }
const price = await fetchMedianSoldPriceUSDForSku(norm, manufacturer);
await db.upsertSkuPrice(norm, price, 'USD', 'ebay');
out[norm] = price;
// small delay to be gentle (reduce block risk)
await new Promise(r => setTimeout(r, 900));
await new Promise(r => setTimeout(r, 1200));
} catch (e) {
console.error('Failed to update price for', sku, e);
out[sku] = null;
console.error('Failed to update price for', sku, e);
out[sku] = null;
}
}
// Also handle SKUs without manufacturer (fallback)
const skusWithoutMfr = await db.listDistinctSkus();
for (const sku of skusWithoutMfr) {
if (out[sku] !== undefined) continue; // Already processed
try {
const norm = normalizeSku(sku);
if (!norm) { out[sku] = null; continue; }
const price = await fetchMedianSoldPriceUSDForSku(norm, null);
await db.upsertSkuPrice(norm, price, 'USD', 'ebay');
out[norm] = price;
await new Promise(r => setTimeout(r, 1200));
} catch (e) {
console.error('Failed to update price for', sku, e);
out[sku] = null;
}
}
res.json({ updated: out });
} catch (err: any) {
console.error(err);
@@ -236,8 +260,8 @@ router.get('/price-report', async (_req, res) => {
};
});
// Build sku list with one description (first encountered)
const skuListMap: Record<string, { sku: string, price: number | null, currency: string, updatedAt: string | null, description: string } > = {};
// Build sku list with one description and ID (first encountered)
const skuListMap: Record<string, { sku: string, price: number | null, currency: string, updatedAt: string | null, description: string, id: number | null } > = {};
for (const it of items) {
const key = normalizeSku(it.sku);
if (!key) continue;
@@ -248,7 +272,8 @@ router.get('/price-report', async (_req, res) => {
price: p ? (p.price !== null ? Number(p.price) : null) : null,
currency: p ? p.currency : 'USD',
updatedAt: p ? p.updatedAt : null,
description: it.description
description: it.description,
id: it.id
};
}
}

View File

@@ -1,5 +1,7 @@
import * as cheerio from 'cheerio';
import puppeteer from 'puppeteer';
// Scraping result type
type ScrapeResult = {
prices: number[];
blocked: boolean;
@@ -13,105 +15,285 @@ async function scrapeSoldPricesUSDPage(html: string, wantDiagnostics = false): P
const sampleTexts: string[] = [];
const samplePrices: number[] = [];
const blocked = html.includes('To continue, please verify') || html.toLowerCase().includes('robot check');
// Detect various eBay bot detection pages
const blocked = html.includes('To continue, please verify')
|| html.toLowerCase().includes('robot check')
|| html.includes('Pardon Our Interruption')
|| html.includes('access to this page has been denied')
|| (html.length < 50000 && html.includes('ebaystatic.com') && !html.includes('srp-results')); // Suspiciously small HTML with eBay assets but no results
// Try multiple listing item selectors (eBay uses different structures)
const listingSelectors = [
'li.s-item',
'li[class*="s-item"]',
'ul.srp-results li',
'div[class*="srp-item"]',
'li[data-view*="item"]',
];
const priceSelectors = [
'.s-item__price',
'.s-item__detail--primary .s-item__price',
'span[class*="s-item__price"]',
'span[class*="price"]',
'div[class*="price"]',
];
$('li.s-item').each((_i, el) => {
const $el = $(el);
let text: string | null = null;
for (const sel of priceSelectors) {
const t = $el.find(sel).first().text().trim();
if (t) {
text = t; selectorHits[sel] = (selectorHits[sel] || 0) + 1; break;
// Try each listing selector
let foundListings = false;
for (const listingSel of listingSelectors) {
const listings = $(listingSel);
if (listings.length > 0) {
foundListings = true;
if (wantDiagnostics) selectorHits[`listing_${listingSel}`] = listings.length;
listings.each((_i, el) => {
const $el = $(el);
let text: string | null = null;
// Try price selectors
for (const sel of priceSelectors) {
const t = $el.find(sel).first().text().trim();
if (t) {
text = t;
selectorHits[`price_${sel}`] = (selectorHits[`price_${sel}`] || 0) + 1;
break;
}
}
// Regex fallback: look for price patterns in the listing HTML
if (!text) {
const htmlFrag = $el.html() || '';
const priceMatches = htmlFrag.match(/\$\s*[0-9]{1,3}(?:,[0-9]{3})*(?:\.[0-9]{2})?/g);
if (priceMatches && priceMatches.length > 0) {
// Take the first price match, but prefer ones that look like sold prices
text = priceMatches[0];
selectorHits['price_regex'] = (selectorHits['price_regex'] || 0) + 1;
}
}
if (!text) return;
if (wantDiagnostics && sampleTexts.length < 10) sampleTexts.push(text);
if (!text.includes('$')) return;
// Extract price value
const single = text.split(' to ')[0].split(' ')[0]; // Handle ranges and extra text
const num = single.replace(/[^0-9.]/g, '');
if (!num) return;
const value = Number(num);
if (!Number.isFinite(value) || value <= 0 || value > 1000000) return; // Sanity check
prices.push(value);
if (wantDiagnostics && samplePrices.length < 10) samplePrices.push(value);
});
if (prices.length > 0) break; // Found prices, no need to try other listing selectors
}
}
// If no listings found with standard selectors, try broad regex search as last resort
if (!foundListings && prices.length === 0 && !blocked) {
const priceMatches = html.match(/\$\s*[0-9]{1,3}(?:,[0-9]{3})*(?:\.[0-9]{2})?/g);
if (priceMatches) {
for (const match of priceMatches.slice(0, 20)) { // Limit to first 20 matches
const num = match.replace(/[^0-9.]/g, '');
const value = Number(num);
if (Number.isFinite(value) && value > 0 && value < 1000000) {
prices.push(value);
if (wantDiagnostics && samplePrices.length < 10) samplePrices.push(value);
}
}
if (wantDiagnostics) selectorHits['fallback_regex'] = priceMatches.length;
}
if (!text) {
// regex fallback within this listing's HTML
const htmlFrag = $el.html() || '';
const m = htmlFrag.match(/\$\s*[0-9]{1,3}(?:,[0-9]{3})*(?:\.[0-9]{2})?/);
if (m) text = m[0];
}
if (!text) return;
if (wantDiagnostics && sampleTexts.length < 10) sampleTexts.push(text);
if (!text.includes('$')) return;
const single = text.split(' to ')[0];
const num = single.replace(/[^0-9.]/g, '');
if (!num) return;
const value = Number(num);
if (!Number.isFinite(value) || value <= 0) return;
prices.push(value);
if (wantDiagnostics && samplePrices.length < 10) samplePrices.push(value);
});
}
return { prices, blocked, diagnostics: wantDiagnostics ? { selectorHits, sampleTexts, samplePrices } : undefined };
}
async function fetchSoldSearchHtml(query: string, page = 1): Promise<{ ok: boolean; html: string }> {
const url = `https://www.ebay.com/sch/i.html?_nkw=${encodeURIComponent(query)}&LH_Sold=1&LH_Complete=1&rt=nc&_ipg=200&_pgn=${page}`;
const res = await fetch(url, {
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
'Accept-Language': 'en-US,en;q=0.9'
} as any
} as any);
const html = await res.text();
return { ok: res.ok, html };
/**
 * Build the ordered list of eBay search queries to try for a SKU.
 *
 * When a non-blank manufacturer is given, four manufacturer + SKU
 * combinations come first (most specific to least specific). The SKU-only
 * queries (quoted, then unquoted) are always appended as the final fallbacks.
 *
 * Both terms are cleaned before use: double quotes are removed (an embedded
 * quote would unbalance the quoted phrases), runs of whitespace are collapsed
 * and the ends are trimmed. Duplicate queries are dropped, keeping the first.
 *
 * @param manufacturer Manufacturer name, or null/blank to search by SKU only.
 * @param sku SKU to search for.
 * @returns Queries in the order they should be tried; empty when the SKU is
 *          blank after cleaning, since there is nothing meaningful to search.
 */
function buildSearchQuery(manufacturer: string | null, sku: string): string[] {
  const cleanTerm = (value: string): string =>
    value.replace(/"/g, ' ').replace(/\s+/g, ' ').trim();

  const cleanSku = cleanTerm(sku);
  if (!cleanSku) return [];

  const queries: string[] = [];
  const cleanMfr = manufacturer ? cleanTerm(manufacturer) : '';
  if (cleanMfr) {
    // Try various combinations for better matching
    queries.push(`"${cleanMfr}" "${cleanSku}"`); // Exact match both
    queries.push(`${cleanMfr} ${cleanSku}`); // Both without quotes
    queries.push(`"${cleanMfr} ${cleanSku}"`); // Combined exact match
    queries.push(`${cleanMfr} "${cleanSku}"`); // Mfr loose, SKU exact
  }

  // SKU-only queries are always tried last, with or without a manufacturer
  queries.push(`"${cleanSku}"`);
  queries.push(cleanSku);

  return Array.from(new Set(queries));
}
export async function fetchMedianSoldPriceUSDForSku(sku: string): Promise<number | null> {
// Try quoted exact search first
const tryQueries = [
`"${sku}"`,
sku // fallback without quotes
];
for (const q of tryQueries) {
let all: number[] = [];
for (let page = 1; page <= 2; page++) {
const { ok, html } = await fetchSoldSearchHtml(q, page);
if (!ok) continue;
const { prices, blocked } = await scrapeSoldPricesUSDPage(html);
if (blocked) return null;
all = all.concat(prices);
if (all.length === 0) {
// small delay before next page to be gentle
await new Promise(r => setTimeout(r, 600));
}
// Puppeteer browser instance (reused across requests). Null until the first
// successful launch, and reset to null when that browser disconnects.
let browserInstance: any = null;
// Launch currently in flight, if any, so concurrent callers share one launch.
let browserLaunchPromise: Promise<any> | null = null;
/**
 * Get or create the shared Puppeteer browser instance.
 *
 * - Callers that arrive while a launch is already in progress wait on that
 *   same launch instead of starting a second browser that would never be
 *   closed.
 * - When the browser disconnects (crash, kill, explicit close) the cached
 *   instance is dropped so the next call launches a fresh one rather than
 *   handing out a dead browser.
 * - A failed launch is not cached; the next call retries.
 *
 * @returns The shared browser instance.
 */
async function getBrowser() {
  if (browserInstance) return browserInstance;
  if (browserLaunchPromise) return browserLaunchPromise;

  const pending = (async () => {
    const browser = await puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
      ],
    });
    browser.on('disconnected', () => {
      // Only clear the cache if it still points at this browser.
      if (browserInstance === browser) browserInstance = null;
    });
    browserInstance = browser;
    return browser;
  })();

  browserLaunchPromise = pending;
  try {
    return await pending;
  } finally {
    // Success or failure, the launch is no longer in flight.
    browserLaunchPromise = null;
  }
}
/**
* Fetch sold/completed listings HTML from eBay search using Puppeteer
*
* Opens a new page on the shared browser (see getBrowser), loads the eBay
* search URL for `query` with the LH_Sold=1 / LH_Complete=1 flags, and returns
* the rendered page HTML.
*
* @param query Search text; URL-encoded into the `_nkw` parameter.
* @param pageNum Result page number, sent as `_pgn` (defaults to 1).
* @returns `{ ok: true, html }` with the rendered HTML on success, or
*          `{ ok: false, html: '' }` if anything in the try block throws
*          (the error message is logged, nothing is re-thrown).
*/
async function fetchSoldSearchHtml(query: string, pageNum = 1): Promise<{ ok: boolean; html: string }> {
// _ipg=200 presumably requests 200 items per page — TODO confirm against eBay's URL parameters
const url = `https://www.ebay.com/sch/i.html?_nkw=${encodeURIComponent(query)}&LH_Sold=1&LH_Complete=1&rt=nc&_ipg=200&_pgn=${pageNum}`;
try {
const browser = await getBrowser();
const page = await browser.newPage();
// Set realistic viewport and user agent
await page.setViewport({ width: 1920, height: 1080 });
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36');
// Navigate to the search page
await page.goto(url, {
waitUntil: 'networkidle2',
timeout: 30000
});
// Wait for listings to load (check for s-item elements or price elements)
// A timeout here is deliberately swallowed: the HTML is still captured below
// so the caller can inspect whatever was rendered.
try {
await page.waitForSelector('li.s-item, ul.srp-results li, [class*="s-item"]', { timeout: 10000 });
} catch (e) {
// Listings might not have loaded, but continue anyway
}
// NOTE(review): the next four lines use `all`, which is never declared in this
// function, and open an `if` that is not closed here. They look like leftovers
// from the earlier median calculation — confirm and remove.
if (all.length > 0) {
all.sort((a, b) => a - b);
const mid = Math.floor(all.length / 2);
return Number((all.length % 2 === 0 ? (all[mid - 1] + all[mid]) / 2 : all[mid]).toFixed(2));
// Get the rendered HTML
const html = await page.content();
// NOTE(review): page.close() is only reached on the success path; if goto() or
// content() throws, the page opened above is not closed by this function.
await page.close();
return { ok: true, html };
} catch (error: any) {
console.error(`Failed to fetch eBay search with Puppeteer: ${error.message}`);
return { ok: false, html: '' };
}
}
/**
 * Estimate a recent sold price (USD) for a SKU by scraping eBay
 * sold/completed listings.
 *
 * Despite "Median" in the name (kept so existing callers keep working), the
 * result is the arithmetic mean of the first three prices parsed from the
 * first results page (or of two or one, if fewer were found), rounded to two
 * decimals. The first prices are presumably the most recent sales; the search
 * URL does not set an explicit sort order, so this relies on eBay's default.
 *
 * Queries from `buildSearchQuery` are tried in order until one yields at least
 * one price. A failed fetch or a bot-detection page does not abort the lookup;
 * the next query is tried instead. A short pause is inserted before every
 * query after the first, including after a failed or blocked attempt, so a
 * block is never answered with an immediate follow-up request.
 *
 * @param sku SKU to look up.
 * @param manufacturer Optional manufacturer used to build more specific queries.
 * @returns The rounded mean price, or null when no query produced a price.
 */
export async function fetchMedianSoldPriceUSDForSku(sku: string, manufacturer: string | null = null): Promise<number | null> {
  const queries = buildSearchQuery(manufacturer, sku);

  for (const [attempt, q] of queries.entries()) {
    // Small delay before every retry, whatever the reason the previous one ended
    if (attempt > 0) {
      await new Promise(r => setTimeout(r, 600));
    }

    // Get prices from first page only
    const { ok, html } = await fetchSoldSearchHtml(q, 1);
    if (!ok) continue;

    const { prices, blocked } = await scrapeSoldPricesUSDPage(html);
    if (blocked) {
      console.warn(`eBay blocked request for query: ${q}`);
      // Try next query instead of giving up
      continue;
    }

    // First three prices on the page (fewer if fewer were parsed)
    const recentPrices = prices.slice(0, 3);
    if (recentPrices.length > 0) {
      const sum = recentPrices.reduce((a, b) => a + b, 0);
      const avg = sum / recentPrices.length;
      return Number(avg.toFixed(2));
    }
  }

  return null;
}
export async function debugFetchSoldPricesUSDForSku(sku: string) {
const sequences = [`"${sku}"`, sku];
/**
* Debug function to inspect eBay price scraping attempts
*/
export async function debugFetchSoldPricesUSDForSku(sku: string, manufacturer: string | null = null) {
const attempts: any[] = [];
for (const q of sequences) {
const queries = buildSearchQuery(manufacturer, sku);
for (const q of queries) {
let total = 0;
let blocked = false;
const diagnostics: any = { pages: [] };
for (let page = 1; page <= 2; page++) {
for (let page = 1; page <= 3; page++) {
const { ok, html } = await fetchSoldSearchHtml(q, page);
const diag = await scrapeSoldPricesUSDPage(html, true);
diagnostics.pages.push({ page, ok, count: diag.prices.length, selectorHits: diag.diagnostics?.selectorHits, sampleTexts: diag.diagnostics?.sampleTexts, samplePrices: diag.diagnostics?.samplePrices });
// Additional HTML diagnostics
const htmlDiagnostics: any = {
htmlLength: html.length,
containsSold: html.toLowerCase().includes('sold'),
containsResults: html.toLowerCase().includes('results') || html.toLowerCase().includes('result'),
containsLi: html.includes('<li'),
containsUl: html.includes('<ul'),
sampleHtml: html.substring(0, 500), // First 500 chars for inspection
};
diagnostics.pages.push({
page,
ok,
count: diag.prices.length,
selectorHits: diag.diagnostics?.selectorHits,
sampleTexts: diag.diagnostics?.sampleTexts,
samplePrices: diag.diagnostics?.samplePrices,
htmlDiagnostics
});
total += diag.prices.length;
blocked = blocked || diag.blocked;
if (diag.prices.length === 0 && page === 1) {
// No results on first page, try next query
break;
}
// Small delay between pages
if (page < 3) {
await new Promise(r => setTimeout(r, 600));
}
}
attempts.push({ query: q, totalCount: total, blocked, details: diagnostics });
if (total > 0) break;
attempts.push({
method: 'scraping',
query: q,
totalCount: total,
blocked,
details: diagnostics
});
if (total > 0) {
// Found results, but continue to show all attempts in debug
break;
}
}
return { attempts };
return { attempts, manufacturer, sku };
}