Working Ebay Scraping
All checks were successful
Deploy to Server / deploy (push) Successful in 47s

This commit is contained in:
2025-11-03 16:42:52 -05:00
parent a97e79a7d8
commit 338bdf838b
8 changed files with 1354 additions and 91 deletions

View File

@@ -38,9 +38,12 @@ jobs:
git clone https://git.hudsonriggs.systems/HRiggs/Train-ID.git "$APP_DIR" git clone https://git.hudsonriggs.systems/HRiggs/Train-ID.git "$APP_DIR"
fi fi
cd "$APP_DIR" cd "$APP_DIR"
git pull origin main # Reset any local changes (e.g., package-lock.json, build artifacts) and sync to origin/main
# Install Node.js deps and build git fetch --prune origin
"$NPM" ci || "$NPM" install git reset --hard origin/main
git clean -fdx
# Install Node.js deps and build without modifying lockfile
"$NPM" ci --no-audit --no-fund
"$NPM" run build "$NPM" run build
# Ensure systemd service exists and restart # Ensure systemd service exists and restart
if systemctl list-unit-files | grep -q "${SERVICE}.service"; then if systemctl list-unit-files | grep -q "${SERVICE}.service"; then

View File

@@ -23,6 +23,13 @@ db_port=3306
db_user=trainid db_user=trainid
db_pass=changeme db_pass=changeme
db_name=trainid db_name=trainid
# eBay API (optional - falls back to scraping if not configured)
# Get credentials from https://developer.ebay.com/my/keys
# Create an app and use the Client ID and Client Secret
EBAY_CLIENT_ID=YourAppId...
EBAY_CLIENT_SECRET=YourClientSecret...
EBAY_SANDBOX=false # Set to 'true' to use eBay sandbox environment
``` ```
## Local development ## Local development
@@ -42,6 +49,16 @@ Visit `http://localhost:3000`.
- GET `/api/export/xlsx` → download XLSX export of inventory with embedded thumbnails - GET `/api/export/xlsx` → download XLSX export of inventory with embedded thumbnails
- DELETE `/api/items/:id` → delete one - DELETE `/api/items/:id` → delete one
- DELETE `/api/items` → wipe all - DELETE `/api/items` → wipe all
- GET `/api/debug/ebay-prices?sku=...&manufacturer=...` → debug eBay price lookup (`manufacturer` is optional; shows API and scraping attempts)
- POST `/api/prices/update` → update cached prices for all SKUs from eBay
- GET `/api/price-report` → get price report with item values
### eBay Price Checking
The system uses eBay's Browse API to search for items by SKU. **Important limitations:**
- **Browse API only searches active listings**, not sold/completed ones
- For sold/completed listings, the system falls back to web scraping
- In sandbox mode, test data is limited - you may see 0 results even if the API is working correctly
- The API method: `GET /buy/browse/v1/item_summary/search` with query parameter `q` (SKU)
## Debian 13 (Trixie) LXC install ## Debian 13 (Trixie) LXC install
These steps assume a fresh Debian 13 LXC and deployment directory `/opt/Train-ID` with a system user `deployuser` that has passwordless sudo for service management. These steps assume a fresh Debian 13 LXC and deployment directory `/opt/Train-ID` with a system user `deployuser` that has passwordless sudo for service management.

1024
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -18,8 +18,8 @@
"license": "ISC", "license": "ISC",
"type": "module", "type": "module",
"dependencies": { "dependencies": {
"cors": "^2.8.5",
"cheerio": "^1.0.0", "cheerio": "^1.0.0",
"cors": "^2.8.5",
"dotenv": "^17.2.3", "dotenv": "^17.2.3",
"exceljs": "^4.4.0", "exceljs": "^4.4.0",
"express": "^5.1.0", "express": "^5.1.0",
@@ -27,7 +27,8 @@
"morgan": "^1.10.1", "morgan": "^1.10.1",
"mysql2": "^3.15.3", "mysql2": "^3.15.3",
"openai": "^6.7.0", "openai": "^6.7.0",
"pdfkit": "^0.17.2" "pdfkit": "^0.17.2",
"puppeteer": "^24.28.0"
}, },
"devDependencies": { "devDependencies": {
"@types/cheerio": "^0.22.35", "@types/cheerio": "^0.22.35",

View File

@@ -151,6 +151,23 @@ export const db = {
} }
return Array.from(set); return Array.from(set);
}, },
/**
 * List the unique (normalized SKU, manufacturer) pairs found in `items`.
 *
 * The query already excludes rows whose SKU or manufacturer is null or empty;
 * rows whose SKU normalizes to an empty value are dropped here as well.
 *
 * `select distinct` compares the raw column values, so if normalizeSku() maps
 * two different raw SKUs to the same value the same pair would come back more
 * than once. Pairs are therefore de-duplicated again after normalization.
 *
 * @returns One `{ sku, manufacturer }` entry per unique pair, in first-seen row order.
 */
async listDistinctSkuManufacturers() {
  await getReady();
  const [rows] = await pool.query(`
    select distinct sku, manufacturer
    from items
    where sku is not null and sku <> '' and manufacturer is not null and manufacturer <> ''
  `);
  const seen = new Set<string>();
  const pairs: Array<{ sku: string; manufacturer: string }> = [];
  for (const r of rows as any[]) {
    const s = normalizeSku(r.sku as string);
    if (!s || !r.manufacturer) continue;
    // JSON-encode the pair so the key is unambiguous whatever characters the values contain.
    const key = JSON.stringify([s, r.manufacturer]);
    if (seen.has(key)) continue;
    seen.add(key);
    pairs.push({ sku: s, manufacturer: r.manufacturer });
  }
  return pairs;
},
async upsertSkuPrice(sku: string, price: number | null, currency: string = 'USD', source: string = 'ebay') { async upsertSkuPrice(sku: string, price: number | null, currency: string = 'USD', source: string = 'ebay') {
await getReady(); await getReady();
const s = normalizeSku(sku); const s = normalizeSku(sku);

View File

@@ -86,6 +86,8 @@
<table id="skuPrices"> <table id="skuPrices">
<thead> <thead>
<tr> <tr>
<th>ID</th>
<th>Name</th>
<th>SKU</th> <th>SKU</th>
<th>Price (USD)</th> <th>Price (USD)</th>
<th class="hide-sm">Updated</th> <th class="hide-sm">Updated</th>
@@ -203,7 +205,9 @@
const tr = document.createElement('tr'); const tr = document.createElement('tr');
const title = s.description || ''; const title = s.description || '';
tr.innerHTML = ` tr.innerHTML = `
<td title="${title.replaceAll('"', '\\"')}">${s.sku}</td> <td>${s.id !== null ? s.id : ''}</td>
<td title="${title.replaceAll('"', '\\"')}">${title || ''}</td>
<td>${s.sku}</td>
<td>${s.price !== null ? `$${s.price.toFixed(2)}` : ''}</td> <td>${s.price !== null ? `$${s.price.toFixed(2)}` : ''}</td>
<td class="hide-sm">${s.updatedAt ? new Date(s.updatedAt).toLocaleString() : ''}</td> <td class="hide-sm">${s.updatedAt ? new Date(s.updatedAt).toLocaleString() : ''}</td>
`; `;

View File

@@ -178,10 +178,14 @@ router.delete('/items', async (_req, res) => {
router.get('/debug/ebay-prices', async (req, res) => { router.get('/debug/ebay-prices', async (req, res) => {
try { try {
const skuParam = typeof req.query.sku === 'string' ? req.query.sku : ''; const skuParam = typeof req.query.sku === 'string' ? req.query.sku : '';
const manufacturerParam = typeof req.query.manufacturer === 'string' ? req.query.manufacturer : null;
const norm = normalizeSku(skuParam); const norm = normalizeSku(skuParam);
if (!norm) return res.status(400).json({ error: 'sku required' }); if (!norm) return res.status(400).json({ error: 'sku required' });
const details = await debugFetchSoldPricesUSDForSku(norm); const details = await debugFetchSoldPricesUSDForSku(norm, manufacturerParam);
res.json({ sku: norm, ...details }); res.json({
...details,
note: 'Scraping sold/completed listings using manufacturer + SKU for better search results.',
});
} catch (err: any) { } catch (err: any) {
console.error(err); console.error(err);
res.status(500).json({ error: err.message || 'Failed to debug ebay prices' }); res.status(500).json({ error: err.message || 'Failed to debug ebay prices' });
@@ -191,22 +195,42 @@ router.get('/debug/ebay-prices', async (req, res) => {
// Update cached prices for all distinct SKUs (on-demand only) // Update cached prices for all distinct SKUs (on-demand only)
router.post('/prices/update', async (_req, res) => { router.post('/prices/update', async (_req, res) => {
try { try {
const skus = await db.listDistinctSkus(); // Get SKU + manufacturer pairs for better search results
const skuManufacturers = await db.listDistinctSkuManufacturers();
const out: Record<string, number | null> = {}; const out: Record<string, number | null> = {};
for (const sku of skus) {
for (const { sku, manufacturer } of skuManufacturers) {
try { try {
const norm = normalizeSku(sku); const norm = normalizeSku(sku);
if (!norm) { out[sku] = null; continue; } if (!norm) { out[sku] = null; continue; }
const price = await fetchMedianSoldPriceUSDForSku(norm); const price = await fetchMedianSoldPriceUSDForSku(norm, manufacturer);
await db.upsertSkuPrice(norm, price, 'USD', 'ebay'); await db.upsertSkuPrice(norm, price, 'USD', 'ebay');
out[norm] = price; out[norm] = price;
// small delay to be gentle (reduce block risk) // small delay to be gentle (reduce block risk)
await new Promise(r => setTimeout(r, 900)); await new Promise(r => setTimeout(r, 1200));
} catch (e) { } catch (e) {
console.error('Failed to update price for', sku, e); console.error('Failed to update price for', sku, e);
out[sku] = null; out[sku] = null;
} }
} }
// Also handle SKUs without manufacturer (fallback)
const skusWithoutMfr = await db.listDistinctSkus();
for (const sku of skusWithoutMfr) {
if (out[sku] !== undefined) continue; // Already processed
try {
const norm = normalizeSku(sku);
if (!norm) { out[sku] = null; continue; }
const price = await fetchMedianSoldPriceUSDForSku(norm, null);
await db.upsertSkuPrice(norm, price, 'USD', 'ebay');
out[norm] = price;
await new Promise(r => setTimeout(r, 1200));
} catch (e) {
console.error('Failed to update price for', sku, e);
out[sku] = null;
}
}
res.json({ updated: out }); res.json({ updated: out });
} catch (err: any) { } catch (err: any) {
console.error(err); console.error(err);
@@ -236,8 +260,8 @@ router.get('/price-report', async (_req, res) => {
}; };
}); });
// Build sku list with one description (first encountered) // Build sku list with one description and ID (first encountered)
const skuListMap: Record<string, { sku: string, price: number | null, currency: string, updatedAt: string | null, description: string } > = {}; const skuListMap: Record<string, { sku: string, price: number | null, currency: string, updatedAt: string | null, description: string, id: number | null } > = {};
for (const it of items) { for (const it of items) {
const key = normalizeSku(it.sku); const key = normalizeSku(it.sku);
if (!key) continue; if (!key) continue;
@@ -248,7 +272,8 @@ router.get('/price-report', async (_req, res) => {
price: p ? (p.price !== null ? Number(p.price) : null) : null, price: p ? (p.price !== null ? Number(p.price) : null) : null,
currency: p ? p.currency : 'USD', currency: p ? p.currency : 'USD',
updatedAt: p ? p.updatedAt : null, updatedAt: p ? p.updatedAt : null,
description: it.description description: it.description,
id: it.id
}; };
} }
} }

View File

@@ -1,5 +1,7 @@
import * as cheerio from 'cheerio'; import * as cheerio from 'cheerio';
import puppeteer from 'puppeteer';
// Scraping result type
type ScrapeResult = { type ScrapeResult = {
prices: number[]; prices: number[];
blocked: boolean; blocked: boolean;
@@ -13,105 +15,285 @@ async function scrapeSoldPricesUSDPage(html: string, wantDiagnostics = false): P
const sampleTexts: string[] = []; const sampleTexts: string[] = [];
const samplePrices: number[] = []; const samplePrices: number[] = [];
const blocked = html.includes('To continue, please verify') || html.toLowerCase().includes('robot check'); // Detect various eBay bot detection pages
const blocked = html.includes('To continue, please verify')
|| html.toLowerCase().includes('robot check')
|| html.includes('Pardon Our Interruption')
|| html.includes('access to this page has been denied')
|| (html.length < 50000 && html.includes('ebaystatic.com') && !html.includes('srp-results')); // Suspiciously small HTML with eBay assets but no results
// Try multiple listing item selectors (eBay uses different structures)
const listingSelectors = [
'li.s-item',
'li[class*="s-item"]',
'ul.srp-results li',
'div[class*="srp-item"]',
'li[data-view*="item"]',
];
const priceSelectors = [ const priceSelectors = [
'.s-item__price', '.s-item__price',
'.s-item__detail--primary .s-item__price', '.s-item__detail--primary .s-item__price',
'span[class*="s-item__price"]', 'span[class*="s-item__price"]',
'span[class*="price"]',
'div[class*="price"]',
]; ];
$('li.s-item').each((_i, el) => { // Try each listing selector
const $el = $(el); let foundListings = false;
let text: string | null = null; for (const listingSel of listingSelectors) {
for (const sel of priceSelectors) { const listings = $(listingSel);
const t = $el.find(sel).first().text().trim(); if (listings.length > 0) {
if (t) { foundListings = true;
text = t; selectorHits[sel] = (selectorHits[sel] || 0) + 1; break; if (wantDiagnostics) selectorHits[`listing_${listingSel}`] = listings.length;
listings.each((_i, el) => {
const $el = $(el);
let text: string | null = null;
// Try price selectors
for (const sel of priceSelectors) {
const t = $el.find(sel).first().text().trim();
if (t) {
text = t;
selectorHits[`price_${sel}`] = (selectorHits[`price_${sel}`] || 0) + 1;
break;
}
}
// Regex fallback: look for price patterns in the listing HTML
if (!text) {
const htmlFrag = $el.html() || '';
const priceMatches = htmlFrag.match(/\$\s*[0-9]{1,3}(?:,[0-9]{3})*(?:\.[0-9]{2})?/g);
if (priceMatches && priceMatches.length > 0) {
// Take the first price match, but prefer ones that look like sold prices
text = priceMatches[0];
selectorHits['price_regex'] = (selectorHits['price_regex'] || 0) + 1;
}
}
if (!text) return;
if (wantDiagnostics && sampleTexts.length < 10) sampleTexts.push(text);
if (!text.includes('$')) return;
// Extract price value
const single = text.split(' to ')[0].split(' ')[0]; // Handle ranges and extra text
const num = single.replace(/[^0-9.]/g, '');
if (!num) return;
const value = Number(num);
if (!Number.isFinite(value) || value <= 0 || value > 1000000) return; // Sanity check
prices.push(value);
if (wantDiagnostics && samplePrices.length < 10) samplePrices.push(value);
});
if (prices.length > 0) break; // Found prices, no need to try other listing selectors
}
}
// If no listings found with standard selectors, try broad regex search as last resort
if (!foundListings && prices.length === 0 && !blocked) {
const priceMatches = html.match(/\$\s*[0-9]{1,3}(?:,[0-9]{3})*(?:\.[0-9]{2})?/g);
if (priceMatches) {
for (const match of priceMatches.slice(0, 20)) { // Limit to first 20 matches
const num = match.replace(/[^0-9.]/g, '');
const value = Number(num);
if (Number.isFinite(value) && value > 0 && value < 1000000) {
prices.push(value);
if (wantDiagnostics && samplePrices.length < 10) samplePrices.push(value);
}
} }
if (wantDiagnostics) selectorHits['fallback_regex'] = priceMatches.length;
} }
if (!text) { }
// regex fallback within this listing's HTML
const htmlFrag = $el.html() || '';
const m = htmlFrag.match(/\$\s*[0-9]{1,3}(?:,[0-9]{3})*(?:\.[0-9]{2})?/);
if (m) text = m[0];
}
if (!text) return;
if (wantDiagnostics && sampleTexts.length < 10) sampleTexts.push(text);
if (!text.includes('$')) return;
const single = text.split(' to ')[0];
const num = single.replace(/[^0-9.]/g, '');
if (!num) return;
const value = Number(num);
if (!Number.isFinite(value) || value <= 0) return;
prices.push(value);
if (wantDiagnostics && samplePrices.length < 10) samplePrices.push(value);
});
return { prices, blocked, diagnostics: wantDiagnostics ? { selectorHits, sampleTexts, samplePrices } : undefined }; return { prices, blocked, diagnostics: wantDiagnostics ? { selectorHits, sampleTexts, samplePrices } : undefined };
} }
async function fetchSoldSearchHtml(query: string, page = 1): Promise<{ ok: boolean; html: string }> { /**
const url = `https://www.ebay.com/sch/i.html?_nkw=${encodeURIComponent(query)}&LH_Sold=1&LH_Complete=1&rt=nc&_ipg=200&_pgn=${page}`; * Build search query combining manufacturer and SKU
const res = await fetch(url, { */
headers: { function buildSearchQuery(manufacturer: string | null, sku: string): string[] {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36', const queries: string[] = [];
'Accept-Language': 'en-US,en;q=0.9'
} as any if (manufacturer && manufacturer.trim()) {
} as any); const cleanMfr = manufacturer.trim();
const html = await res.text(); const cleanSku = sku.trim();
return { ok: res.ok, html };
// Try various combinations for better matching
queries.push(`"${cleanMfr}" "${cleanSku}"`); // Exact match both
queries.push(`${cleanMfr} ${cleanSku}`); // Both without quotes
queries.push(`"${cleanMfr} ${cleanSku}"`); // Combined exact match
queries.push(`${cleanMfr} "${cleanSku}"`); // Mfr loose, SKU exact
}
// SKU-only queries are always appended: they are the only queries when no manufacturer is given, and the last resort otherwise
queries.push(`"${sku}"`);
queries.push(sku);
return queries;
} }
export async function fetchMedianSoldPriceUSDForSku(sku: string): Promise<number | null> { // Puppeteer browser instance (reused across requests)
// Try quoted exact search first let browserInstance: any = null;
const tryQueries = [
`"${sku}"`, /**
sku // fallback without quotes * Get or create Puppeteer browser instance
]; */
for (const q of tryQueries) { async function getBrowser() {
let all: number[] = []; if (!browserInstance) {
for (let page = 1; page <= 2; page++) { browserInstance = await puppeteer.launch({
const { ok, html } = await fetchSoldSearchHtml(q, page); headless: true,
if (!ok) continue; args: [
const { prices, blocked } = await scrapeSoldPricesUSDPage(html); '--no-sandbox',
if (blocked) return null; '--disable-setuid-sandbox',
all = all.concat(prices); '--disable-dev-shm-usage',
if (all.length === 0) { '--disable-blink-features=AutomationControlled',
// small delay before next page to be gentle ],
await new Promise(r => setTimeout(r, 600)); });
} }
return browserInstance;
}
/**
* Fetch sold/completed listings HTML from eBay search using Puppeteer
*/
async function fetchSoldSearchHtml(query: string, pageNum = 1): Promise<{ ok: boolean; html: string }> {
const url = `https://www.ebay.com/sch/i.html?_nkw=${encodeURIComponent(query)}&LH_Sold=1&LH_Complete=1&rt=nc&_ipg=200&_pgn=${pageNum}`;
try {
const browser = await getBrowser();
const page = await browser.newPage();
// Set realistic viewport and user agent
await page.setViewport({ width: 1920, height: 1080 });
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36');
// Navigate to the search page
await page.goto(url, {
waitUntil: 'networkidle2',
timeout: 30000
});
// Wait for listings to load (check for s-item elements or price elements)
try {
await page.waitForSelector('li.s-item, ul.srp-results li, [class*="s-item"]', { timeout: 10000 });
} catch (e) {
// Listings might not have loaded, but continue anyway
} }
if (all.length > 0) {
all.sort((a, b) => a - b); // Get the rendered HTML
const mid = Math.floor(all.length / 2); const html = await page.content();
return Number((all.length % 2 === 0 ? (all[mid - 1] + all[mid]) / 2 : all[mid]).toFixed(2)); await page.close();
return { ok: true, html };
} catch (error: any) {
console.error(`Failed to fetch eBay search with Puppeteer: ${error.message}`);
return { ok: false, html: '' };
}
}
/**
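* Fetch the average (mean) sold price for a SKU by scraping sold/completed listings.
* Despite the function name this is not a median: it averages the first 3 prices scraped from the first results page (or 2 or 1 if fewer are available), assumed to be the most recent sales.
* Tries manufacturer + SKU queries first when a manufacturer is given, then SKU-only queries.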
*/
export async function fetchMedianSoldPriceUSDForSku(sku: string, manufacturer: string | null = null): Promise<number | null> {
const queries = buildSearchQuery(manufacturer, sku);
for (const q of queries) {
// Get prices from first page (most recent sold listings)
const { ok, html } = await fetchSoldSearchHtml(q, 1);
if (!ok) continue;
const { prices, blocked } = await scrapeSoldPricesUSDPage(html);
if (blocked) {
console.warn(`eBay blocked request for query: ${q}`);
// Try next query instead of giving up
continue;
}
// Take the last 3 sold listings (first 3 prices from the page, as eBay shows most recent first)
const recentPrices = prices.slice(0, 3);
if (recentPrices.length > 0) {
// Calculate average of available prices (3, 2, or 1)
const sum = recentPrices.reduce((a, b) => a + b, 0);
const avg = sum / recentPrices.length;
return Number(avg.toFixed(2));
}
// Small delay before trying next query
if (queries.indexOf(q) < queries.length - 1) {
await new Promise(r => setTimeout(r, 600));
} }
} }
return null; return null;
} }
export async function debugFetchSoldPricesUSDForSku(sku: string) { /**
const sequences = [`"${sku}"`, sku]; * Debug function to inspect eBay price scraping attempts
*/
export async function debugFetchSoldPricesUSDForSku(sku: string, manufacturer: string | null = null) {
const attempts: any[] = []; const attempts: any[] = [];
for (const q of sequences) { const queries = buildSearchQuery(manufacturer, sku);
for (const q of queries) {
let total = 0; let total = 0;
let blocked = false; let blocked = false;
const diagnostics: any = { pages: [] }; const diagnostics: any = { pages: [] };
for (let page = 1; page <= 2; page++) {
for (let page = 1; page <= 3; page++) {
const { ok, html } = await fetchSoldSearchHtml(q, page); const { ok, html } = await fetchSoldSearchHtml(q, page);
const diag = await scrapeSoldPricesUSDPage(html, true); const diag = await scrapeSoldPricesUSDPage(html, true);
diagnostics.pages.push({ page, ok, count: diag.prices.length, selectorHits: diag.diagnostics?.selectorHits, sampleTexts: diag.diagnostics?.sampleTexts, samplePrices: diag.diagnostics?.samplePrices });
// Additional HTML diagnostics
const htmlDiagnostics: any = {
htmlLength: html.length,
containsSold: html.toLowerCase().includes('sold'),
containsResults: html.toLowerCase().includes('results') || html.toLowerCase().includes('result'),
containsLi: html.includes('<li'),
containsUl: html.includes('<ul'),
sampleHtml: html.substring(0, 500), // First 500 chars for inspection
};
diagnostics.pages.push({
page,
ok,
count: diag.prices.length,
selectorHits: diag.diagnostics?.selectorHits,
sampleTexts: diag.diagnostics?.sampleTexts,
samplePrices: diag.diagnostics?.samplePrices,
htmlDiagnostics
});
total += diag.prices.length; total += diag.prices.length;
blocked = blocked || diag.blocked; blocked = blocked || diag.blocked;
if (diag.prices.length === 0 && page === 1) { if (diag.prices.length === 0 && page === 1) {
// No results on first page, try next query
break;
}
// Small delay between pages
if (page < 3) {
await new Promise(r => setTimeout(r, 600)); await new Promise(r => setTimeout(r, 600));
} }
} }
attempts.push({ query: q, totalCount: total, blocked, details: diagnostics });
if (total > 0) break; attempts.push({
method: 'scraping',
query: q,
totalCount: total,
blocked,
details: diagnostics
});
if (total > 0) {
// Found results: stop here, so later fallback queries are not attempted or reported
break;
}
} }
return { attempts };
return { attempts, manufacturer, sku };
} }