perf: enhance scrap navigation review page
This commit is contained in:
parent
44698b7c10
commit
7a9ddf2679
|
|
@ -2,134 +2,6 @@ import puppeteer from "puppeteer";
|
|||
import { ScrapeResult } from "../types";
|
||||
import { getFallbackData } from "../utils/datas";
|
||||
|
||||
export async function scrapeTokopediaProduct(
|
||||
url: string,
|
||||
): Promise<ScrapeResult> {
|
||||
const targetUrl = normalizeToReviewUrl(url);
|
||||
|
||||
let browser;
|
||||
|
||||
try {
|
||||
browser = await puppeteer.launch({
|
||||
headless: true,
|
||||
args: [
|
||||
"--no-sandbox",
|
||||
"--disable-setuid-sandbox",
|
||||
"--window-size=1366,768",
|
||||
],
|
||||
});
|
||||
|
||||
const page = await browser.newPage();
|
||||
|
||||
await page.setUserAgent(
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
|
||||
);
|
||||
await page.setViewport({ width: 1366, height: 768 });
|
||||
|
||||
console.log(`🚀 Scraping URL: ${url}`);
|
||||
|
||||
await page.goto(targetUrl, {
|
||||
waitUntil: "domcontentloaded",
|
||||
timeout: 60000,
|
||||
});
|
||||
|
||||
await page.evaluate(async () => {
|
||||
await new Promise<void>((resolve) => {
|
||||
let totalHeight = 0;
|
||||
const distance = 150;
|
||||
const timer = setInterval(() => {
|
||||
window.scrollBy(0, distance);
|
||||
totalHeight += distance;
|
||||
if (totalHeight >= 2500) {
|
||||
clearInterval(timer);
|
||||
resolve();
|
||||
}
|
||||
}, 100);
|
||||
});
|
||||
});
|
||||
|
||||
await new Promise((r) => setTimeout(r, 3000));
|
||||
|
||||
const scrapedData = await page.evaluate(() => {
|
||||
const titleEl = document.querySelector(
|
||||
'[data-testid="lblPDPDetailProductName"]',
|
||||
);
|
||||
let rawName = titleEl ? titleEl.textContent || "" : document.title;
|
||||
|
||||
const cleanProductName = (name: string) => {
|
||||
let cleaned = name
|
||||
.replace(
|
||||
/^(Review|Jual|Promo|Flash Sale|Hot Item|Baru|PROMO)\s+/i,
|
||||
"",
|
||||
)
|
||||
.replace(/^Laptop\s+/i, "")
|
||||
.replace(/\(.*?\)/g, "")
|
||||
.replace(/\[.*?\]/g, "")
|
||||
.trim();
|
||||
|
||||
const specWallRegex =
|
||||
/\b(i[3579]\b|ryzen|celeron|athlon|atom|intel|amd|n\d{4}|z\d{4}|4gb|8gb|12gb|16gb|24gb|32gb|64gb|128gb|256gb|512gb|1tb|2tb|ssd|hdd|emmc|w10|w11|win10|win11|windows|dos|no os|linux|ubuntu|11"|13"|14"|14\.0|15"|15\.6|16"|fhd|hd|qhd|uhd|4k|oled|ips|tn|hz|resmi|garansi|murah)\b/i;
|
||||
|
||||
const splitName = cleaned.split(specWallRegex);
|
||||
|
||||
let finalName = splitName[0].trim();
|
||||
|
||||
finalName = finalName.replace(/[-|,|\/]+\s*$/, "").trim();
|
||||
|
||||
return finalName;
|
||||
};
|
||||
|
||||
const productName = cleanProductName(rawName);
|
||||
|
||||
const elements = document.querySelectorAll("div, span, p");
|
||||
const uniqueReviews = new Set<string>();
|
||||
|
||||
elements.forEach((el) => {
|
||||
const text = el.textContent || "";
|
||||
|
||||
if (text.length > 40 && text.length < 1000) {
|
||||
if (el.children.length === 0) {
|
||||
const cleanText = text.trim().replace(/\n/g, " ");
|
||||
|
||||
const isJunk =
|
||||
cleanText.includes("Lihat Balasan") ||
|
||||
cleanText.includes("Membantu") ||
|
||||
cleanText.includes("Laporkan") ||
|
||||
cleanText.includes("Tokopedia") ||
|
||||
cleanText.includes("Promo") ||
|
||||
cleanText.includes("Diskusi");
|
||||
|
||||
if (!isJunk) {
|
||||
uniqueReviews.add(cleanText);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return {
|
||||
productName,
|
||||
reviews: Array.from(uniqueReviews).slice(0, 15),
|
||||
};
|
||||
});
|
||||
|
||||
await browser.close();
|
||||
|
||||
if (scrapedData.reviews.length === 0) {
|
||||
console.warn("⚠️ Tidak ada ulasan terdeteksi. Menggunakan Fallback.");
|
||||
return getFallbackData(url);
|
||||
}
|
||||
|
||||
return {
|
||||
name: scrapedData.productName,
|
||||
url: url,
|
||||
reviews: scrapedData.reviews,
|
||||
};
|
||||
} catch (error: any) {
|
||||
if (browser) await browser.close();
|
||||
return getFallbackData(url);
|
||||
}
|
||||
}
|
||||
|
||||
function normalizeToReviewUrl(rawUrl: string): string {
|
||||
try {
|
||||
const urlObj = new URL(rawUrl);
|
||||
|
|
@ -149,3 +21,115 @@ function normalizeToReviewUrl(rawUrl: string): string {
|
|||
return rawUrl;
|
||||
}
|
||||
}
|
||||
|
||||
export async function scrapeTokopediaProduct(
|
||||
url: string,
|
||||
): Promise<ScrapeResult> {
|
||||
const targetUrl = normalizeToReviewUrl(url);
|
||||
let browser;
|
||||
|
||||
try {
|
||||
browser = await puppeteer.launch({
|
||||
headless: false,
|
||||
args: ["--no-sandbox", "--disable-setuid-sandbox"],
|
||||
});
|
||||
|
||||
const page = await browser.newPage();
|
||||
await page.setUserAgent(
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
|
||||
);
|
||||
|
||||
await page.setRequestInterception(true);
|
||||
page.on("request", (req) => {
|
||||
if (["image", "font", "media"].includes(req.resourceType())) {
|
||||
req.abort();
|
||||
} else {
|
||||
req.continue();
|
||||
}
|
||||
});
|
||||
|
||||
await page.goto(targetUrl, {
|
||||
waitUntil: "domcontentloaded",
|
||||
timeout: 60000,
|
||||
});
|
||||
|
||||
let allReviews: string[] = [];
|
||||
let rawProductName = "";
|
||||
const maxPages = 5;
|
||||
|
||||
for (let i = 1; i <= maxPages; i++) {
|
||||
await page
|
||||
.waitForSelector('[data-testid="lblItemUlasan"]', { timeout: 8000 })
|
||||
.catch(() => null);
|
||||
|
||||
const pageData = await page.evaluate(() => {
|
||||
const titleEl = document.querySelector(
|
||||
'[data-testid="lblPDPDetailProductName"]',
|
||||
);
|
||||
const name = titleEl?.textContent || document.title;
|
||||
|
||||
const reviewElements = document.querySelectorAll(
|
||||
'[data-testid="lblItemUlasan"]',
|
||||
);
|
||||
const reviews: string[] = [];
|
||||
|
||||
reviewElements.forEach((el) => {
|
||||
const text = el.textContent?.trim();
|
||||
if (text && text.length > 15) {
|
||||
reviews.push(text.replace(/\n/g, " "));
|
||||
}
|
||||
});
|
||||
|
||||
return { name, reviews };
|
||||
});
|
||||
|
||||
if (i === 1) rawProductName = pageData.name;
|
||||
|
||||
allReviews.push(...pageData.reviews);
|
||||
|
||||
const nextBtn = await page.$('button[aria-label^="Laman berikutnya"]');
|
||||
if (nextBtn && i < maxPages) {
|
||||
await nextBtn.click();
|
||||
await new Promise((r) => setTimeout(r, 2000));
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
await browser.close();
|
||||
|
||||
if (allReviews.length === 0) return getFallbackData(url);
|
||||
|
||||
const finalName = cleanProductNameLogic(rawProductName);
|
||||
|
||||
return {
|
||||
name: finalName,
|
||||
url: url,
|
||||
reviews: Array.from(new Set(allReviews)).slice(0, 50),
|
||||
};
|
||||
} catch (error) {
|
||||
if (browser) await browser.close();
|
||||
console.error("Scraping Error:", error);
|
||||
return getFallbackData(url);
|
||||
}
|
||||
}
|
||||
|
||||
function cleanProductNameLogic(name: string): string {
|
||||
if (!name) return "Unknown Product";
|
||||
let cleaned = name
|
||||
.replace(
|
||||
/^(Review|Jual|Promo|Flash Sale|Hot Item|Baru|PROMO|READY|TERLARIS)\s+/i,
|
||||
"",
|
||||
)
|
||||
.replace(/^Laptop\s+/i, "")
|
||||
.replace(/\(.*?\)/g, "")
|
||||
.replace(/\[.*?\]/g, "")
|
||||
.trim();
|
||||
|
||||
const specWallRegex =
|
||||
/\b(i[3579]\b|ryzen|celeron|athlon|intel|amd|4gb|8gb|16gb|32gb|512gb|1tb|ssd|rtx|gtx|oled|fhd|w10|w11)\b/i;
|
||||
const splitName = cleaned.split(specWallRegex);
|
||||
let finalName = splitName[0].trim().replace(/[-|,|\/]+\s*$/, "");
|
||||
|
||||
return finalName.length < 5 ? cleaned.substring(0, 35) : finalName;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue