perf: enhance scrap navigation review page

This commit is contained in:
Mahen 2026-03-04 07:31:19 +07:00
parent 44698b7c10
commit 7a9ddf2679
1 changed files with 112 additions and 128 deletions

View File

@ -2,134 +2,6 @@ import puppeteer from "puppeteer";
import { ScrapeResult } from "../types";
import { getFallbackData } from "../utils/datas";
export async function scrapeTokopediaProduct(
url: string,
): Promise<ScrapeResult> {
const targetUrl = normalizeToReviewUrl(url);
let browser;
try {
browser = await puppeteer.launch({
headless: true,
args: [
"--no-sandbox",
"--disable-setuid-sandbox",
"--window-size=1366,768",
],
});
const page = await browser.newPage();
await page.setUserAgent(
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
);
await page.setViewport({ width: 1366, height: 768 });
console.log(`🚀 Scraping URL: ${url}`);
await page.goto(targetUrl, {
waitUntil: "domcontentloaded",
timeout: 60000,
});
await page.evaluate(async () => {
await new Promise<void>((resolve) => {
let totalHeight = 0;
const distance = 150;
const timer = setInterval(() => {
window.scrollBy(0, distance);
totalHeight += distance;
if (totalHeight >= 2500) {
clearInterval(timer);
resolve();
}
}, 100);
});
});
await new Promise((r) => setTimeout(r, 3000));
const scrapedData = await page.evaluate(() => {
const titleEl = document.querySelector(
'[data-testid="lblPDPDetailProductName"]',
);
let rawName = titleEl ? titleEl.textContent || "" : document.title;
const cleanProductName = (name: string) => {
let cleaned = name
.replace(
/^(Review|Jual|Promo|Flash Sale|Hot Item|Baru|PROMO)\s+/i,
"",
)
.replace(/^Laptop\s+/i, "")
.replace(/\(.*?\)/g, "")
.replace(/\[.*?\]/g, "")
.trim();
const specWallRegex =
/\b(i[3579]\b|ryzen|celeron|athlon|atom|intel|amd|n\d{4}|z\d{4}|4gb|8gb|12gb|16gb|24gb|32gb|64gb|128gb|256gb|512gb|1tb|2tb|ssd|hdd|emmc|w10|w11|win10|win11|windows|dos|no os|linux|ubuntu|11"|13"|14"|14\.0|15"|15\.6|16"|fhd|hd|qhd|uhd|4k|oled|ips|tn|hz|resmi|garansi|murah)\b/i;
const splitName = cleaned.split(specWallRegex);
let finalName = splitName[0].trim();
finalName = finalName.replace(/[-|,|\/]+\s*$/, "").trim();
return finalName;
};
const productName = cleanProductName(rawName);
const elements = document.querySelectorAll("div, span, p");
const uniqueReviews = new Set<string>();
elements.forEach((el) => {
const text = el.textContent || "";
if (text.length > 40 && text.length < 1000) {
if (el.children.length === 0) {
const cleanText = text.trim().replace(/\n/g, " ");
const isJunk =
cleanText.includes("Lihat Balasan") ||
cleanText.includes("Membantu") ||
cleanText.includes("Laporkan") ||
cleanText.includes("Tokopedia") ||
cleanText.includes("Promo") ||
cleanText.includes("Diskusi");
if (!isJunk) {
uniqueReviews.add(cleanText);
}
}
}
});
return {
productName,
reviews: Array.from(uniqueReviews).slice(0, 15),
};
});
await browser.close();
if (scrapedData.reviews.length === 0) {
console.warn("⚠️ Tidak ada ulasan terdeteksi. Menggunakan Fallback.");
return getFallbackData(url);
}
return {
name: scrapedData.productName,
url: url,
reviews: scrapedData.reviews,
};
} catch (error: any) {
if (browser) await browser.close();
return getFallbackData(url);
}
}
function normalizeToReviewUrl(rawUrl: string): string {
try {
const urlObj = new URL(rawUrl);
@ -149,3 +21,115 @@ function normalizeToReviewUrl(rawUrl: string): string {
return rawUrl;
}
}
export async function scrapeTokopediaProduct(
url: string,
): Promise<ScrapeResult> {
const targetUrl = normalizeToReviewUrl(url);
let browser;
try {
browser = await puppeteer.launch({
headless: false,
args: ["--no-sandbox", "--disable-setuid-sandbox"],
});
const page = await browser.newPage();
await page.setUserAgent(
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
);
await page.setRequestInterception(true);
page.on("request", (req) => {
if (["image", "font", "media"].includes(req.resourceType())) {
req.abort();
} else {
req.continue();
}
});
await page.goto(targetUrl, {
waitUntil: "domcontentloaded",
timeout: 60000,
});
let allReviews: string[] = [];
let rawProductName = "";
const maxPages = 5;
for (let i = 1; i <= maxPages; i++) {
await page
.waitForSelector('[data-testid="lblItemUlasan"]', { timeout: 8000 })
.catch(() => null);
const pageData = await page.evaluate(() => {
const titleEl = document.querySelector(
'[data-testid="lblPDPDetailProductName"]',
);
const name = titleEl?.textContent || document.title;
const reviewElements = document.querySelectorAll(
'[data-testid="lblItemUlasan"]',
);
const reviews: string[] = [];
reviewElements.forEach((el) => {
const text = el.textContent?.trim();
if (text && text.length > 15) {
reviews.push(text.replace(/\n/g, " "));
}
});
return { name, reviews };
});
if (i === 1) rawProductName = pageData.name;
allReviews.push(...pageData.reviews);
const nextBtn = await page.$('button[aria-label^="Laman berikutnya"]');
if (nextBtn && i < maxPages) {
await nextBtn.click();
await new Promise((r) => setTimeout(r, 2000));
} else {
break;
}
}
await browser.close();
if (allReviews.length === 0) return getFallbackData(url);
const finalName = cleanProductNameLogic(rawProductName);
return {
name: finalName,
url: url,
reviews: Array.from(new Set(allReviews)).slice(0, 50),
};
} catch (error) {
if (browser) await browser.close();
console.error("Scraping Error:", error);
return getFallbackData(url);
}
}
function cleanProductNameLogic(name: string): string {
if (!name) return "Unknown Product";
let cleaned = name
.replace(
/^(Review|Jual|Promo|Flash Sale|Hot Item|Baru|PROMO|READY|TERLARIS)\s+/i,
"",
)
.replace(/^Laptop\s+/i, "")
.replace(/\(.*?\)/g, "")
.replace(/\[.*?\]/g, "")
.trim();
const specWallRegex =
/\b(i[3579]\b|ryzen|celeron|athlon|intel|amd|4gb|8gb|16gb|32gb|512gb|1tb|ssd|rtx|gtx|oled|fhd|w10|w11)\b/i;
const splitName = cleaned.split(specWallRegex);
let finalName = splitName[0].trim().replace(/[-|,|\/]+\s*$/, "");
return finalName.length < 5 ? cleaned.substring(0, 35) : finalName;
}