chore: clean up the code (remove redundant comments)

This commit is contained in:
Mahen 2026-04-10 11:26:03 +07:00
parent 817924bd8c
commit 3613b1a120
11 changed files with 12 additions and 187 deletions

2
run.py
View File

@ -3,10 +3,8 @@ import sys
import uvicorn
if __name__ == "__main__":
# Paksa penggunaan SelectorEventLoop di level paling dasar OS Windows
if sys.platform == 'win32':
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
print("✅ Mesin Selector Loop Aktif (Anti-NotImplementedError)")
# Jalankan uvicorn dari sini, bukan dari terminal langsung
uvicorn.run("main:app", host="127.0.0.1", port=8000, reload=True)

View File

@ -11,7 +11,6 @@ def clean_product_name(name: str) -> str:
return name.strip()
async def process_product_reviews(candidate: ProductCandidate, user_email: str, metric_id: int, brand_id: int, request: Request):
# 1. SETUP ASPEK (Initialize score 0 untuk setiap kategori)
aspect_stats = {
aspect: {"positive": 0, "total": 0}
for aspect in config.ASPECT_KEYWORDS.keys()
@ -19,7 +18,6 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str,
print(f"🔍 Memulai Analisis ABSA: {candidate.name[:30]}...")
# 2. DATABASE PRE-CHECK (Model & User)
model_db = await prisma.model.find_first(where={"modelName": "Model XGBoost (Baseline)"})
if not model_db:
print("❌ ERROR: Model XGBoost tidak ditemukan di database!")
@ -30,7 +28,6 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str,
print(f"⚠️ User {user_email} tidak ditemukan!")
return None
# 3. PRODUCT PERSISTENCE
brand_name = clean_product_name(candidate.name.split()[0]) if candidate.name.strip() else "Unknown"
product_name = clean_product_name(candidate.name)
@ -49,7 +46,6 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str,
}
)
# 4. NLP PREDICTION & ASPECT TAGGING LOOP
total_reviews = len(candidate.reviews)
if total_reviews == 0: return None
@ -69,7 +65,6 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str,
pred_idx = ml_core.model_optimized.predict(vec)[0]
label = ml_core.label_encoder.inverse_transform([pred_idx])[0].lower()
# Confidence Score dari XGBoost
try:
prob = ml_core.model_optimized.predict_proba(vec)[0]
confidence_score = float(max(prob))
@ -107,13 +102,11 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str,
"userId": user_db.id
})
# 5. DATABASE SYNC (Batch Operations)
if reviews_data_to_save:
async with prisma.tx() as transaction:
await transaction.review.delete_many(where={"productId": product_db.productId})
await transaction.review.create_many(data=reviews_data_to_save)
# 6. CALCULATION & VERDICT GENERATION
final_aspect_scores = {}
for aspect, stat in aspect_stats.items():
score = (stat["positive"] / stat["total"] * 100) if stat["total"] > 0 else 0
@ -140,14 +133,12 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str,
else:
verdict_label = "Kurang Disarankan"
# 1. Buat Analysis terlebih dahulu untuk mendapatkan ID-nya
new_analysis = await prisma.analysis.create(
data={
"userId": user_db.id,
}
)
# 2. Buat Metric dan hubungkan ke Analysis yang baru saja dibuat
await prisma.metric.create(
data={
"generalSentiment": general_sentiment_pct,

View File

@ -5,9 +5,6 @@ from pathlib import Path
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
# ==========================================
# 1. KONFIGURASI PATH (PATHLIB)
# ==========================================
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parents[1]
DATA_DIR = PROJECT_ROOT / "robust_data"
@ -20,9 +17,6 @@ PATHS = {
"le": DATA_DIR / "tokenize" / "label_encoder.pkl",
}
# ==========================================
# 2. LOAD DATA
# ==========================================
print("\n--- MEMUAT DATA BASELINE ---")
data = {}
@ -47,11 +41,6 @@ le = data['le']
print(f"\nDimensi Training (Imbalanced): {X_train.shape}")
# ==========================================
# 3. SETUP MODEL BASELINE
# ==========================================
# Tanpa Grid Search, menggunakan settingan default XGBoost
# Default XGBoost biasanya: learning_rate=0.3, max_depth=6, n_estimators=100
model_baseline = XGBClassifier(
objective='multi:softprob',
num_class=3,
@ -60,9 +49,6 @@ model_baseline = XGBClassifier(
use_label_encoder=False
)
# ==========================================
# 4. EKSEKUSI TRAINING
# ==========================================
print("\n🔥 MULAI TRAINING BASELINE (SCENARIO 1)...")
start_time = time.time()
@ -71,31 +57,21 @@ model_baseline.fit(X_train, y_train)
duration = time.time() - start_time
print(f"\n✅ SELESAI! Waktu proses: {duration:.2f} detik")
# ==========================================
# 5. MENAMPILKAN PARAMETER DEFAULT (BARU)
# ==========================================
# Karena tidak pakai GridSearch, kita ambil parameter langsung dari modelnya
print("\n" + "="*40)
print("PARAMETER YANG DIGUNAKAN (DEFAULT)")
print("="*40)
# Mengambil seluruh parameter model
all_params = model_baseline.get_params()
# Kita filter hanya parameter penting untuk dibandingkan dengan Skenario 2 & 3
key_params = ['learning_rate', 'max_depth', 'n_estimators', 'subsample', 'colsample_bytree']
shown_params = {k: all_params.get(k) for k in key_params}
# Jika n_estimators atau learning_rate None (karena default library), kita set nilai standarnya manual untuk info
if shown_params['n_estimators'] is None: shown_params['n_estimators'] = "100 (Default)"
if shown_params['learning_rate'] is None: shown_params['learning_rate'] = "Default"
print(shown_params)
print("(Gunakan nilai ini untuk perbandingan di Bab 4)")
# ==========================================
# 6. EVALUASI & SIMPAN
# ==========================================
print("\n" + "="*40)
print("HASIL SKENARIO 1 (BASELINE)")
print("="*40)
@ -111,7 +87,6 @@ print(classification_report(y_test_label, y_pred_label))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_label, y_pred_label))
# Simpan Model Baseline
model_path = SCRIPT_DIR / 'new_xgboost_scenario1.pkl'
joblib.dump(model_baseline, model_path)
print(f"\n💾 Model baseline disimpan ke: {model_path}")

View File

@ -6,9 +6,6 @@ from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
# ==========================================
# 1. KONFIGURASI PATH (PATHLIB)
# ==========================================
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parents[1]
DATA_DIR = PROJECT_ROOT / "robust_data"
@ -21,9 +18,6 @@ PATHS = {
"le": DATA_DIR / "tokenize" / "label_encoder.pkl",
}
# ==========================================
# 2. LOAD DATA
# ==========================================
print("\n--- MEMUAT DATA SCENARIO 2 ---")
data = {}
@ -46,11 +40,6 @@ le = data['le']
print(f"\nDimensi Training (Imbalanced): {X_train.shape}")
# ==========================================
# 3. SETUP GRID SEARCH (SAMA DENGAN SKENARIO 3)
# ==========================================
# Kita gunakan range parameter yang SAMA PERSIS dengan Skenario 3
# agar perbandingannya adil (apple-to-apple).
param_grid = {
'learning_rate': [0.01, 0.1, 0.2],
'max_depth': [3, 5, 7],
@ -67,7 +56,6 @@ xgb = XGBClassifier(
use_label_encoder=False
)
# Gunakan F1-Macro agar Grid Search mencoba adil ke kelas minoritas
grid_search = GridSearchCV(
estimator=xgb,
param_grid=param_grid,
@ -77,9 +65,6 @@ grid_search = GridSearchCV(
verbose=1
)
# ==========================================
# 4. EKSEKUSI TRAINING
# ==========================================
print("\n🔥 MULAI TRAINING & GRID SEARCH (SCENARIO 2)...")
print("Sedang mencari parameter terbaik untuk data Imbalanced...")
start_time = time.time()
@ -89,9 +74,6 @@ grid_search.fit(X_train, y_train)
duration = time.time() - start_time
print(f"\n✅ SELESAI! Waktu proses: {duration/60:.2f} menit")
# ==========================================
# 5. EVALUASI & SIMPAN
# ==========================================
best_model = grid_search.best_estimator_
print("\n" + "="*40)
@ -114,7 +96,6 @@ print(classification_report(y_test_label, y_pred_label))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_label, y_pred_label))
# Simpan Model Skenario 2
model_path = SCRIPT_DIR / 'new_model_xgboost_scenario2.pkl'
joblib.dump(best_model, model_path)
print(f"\n💾 Model Skenario 2 disimpan ke: {model_path}")

View File

@ -11,9 +11,6 @@ from sklearn.metrics import classification_report, confusion_matrix
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
# ==========================================
# KONFIGURASI PATH
# ==========================================
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parents[1]
DATA_DIR = PROJECT_ROOT / "robust_data"
@ -28,7 +25,6 @@ PATHS = {
print("--- MENYIAPKAN TRAINING SCENARIO 3 (PIPELINE: SMOTE + CHI2 + XGBOOST) ---")
# Load Data
data = {}
for name, path in PATHS.items():
if not path.exists():
@ -41,9 +37,6 @@ X_train, y_train = data["X_train"], data["y_train"]
X_test, y_test = data["X_test"], data["y_test"]
le = data["le"]
# ==========================================
# REPORT PROPORSI DATA (SEBELUM & SESUDAH SMOTE)
# ==========================================
print("\n" + "="*40)
print("REPORT PROPORSI DATA")
print("="*40)
@ -57,16 +50,12 @@ def print_proportion(y, title):
print_proportion(y_train, "PROPORSI DATA AWAL (TRAIN)")
# Simulasi SMOTE untuk melihat hasil akhir yang akan diproses Pipeline
sm_sim = SMOTE(random_state=42)
_, y_resampled_sim = sm_sim.fit_resample(X_train, y_train)
print_proportion(y_resampled_sim, "ESTIMASI PROPORSI SETELAH SMOTE (DALAM PIPELINE)")
print("\n" + "="*40)
# ==========================================
# DEFINISI PIPELINE
# ==========================================
pipeline = ImbPipeline([
('smote', SMOTE(random_state=42)),
('selector', SelectKBest(score_func=chi2, k=2000)),
@ -79,9 +68,6 @@ pipeline = ImbPipeline([
))
])
# ==========================================
# SETTING GRID SEARCH
# ==========================================
# param_grid = {
# 'clf__learning_rate': [0.1, 0.2],
# 'clf__max_depth': [5, 7],
@ -106,9 +92,6 @@ grid_search = GridSearchCV(
verbose=2
)
# ==========================================
# EKSEKUSI TRAINING
# ==========================================
print(f"\n🔥 MULAI TRAINING... (Dimensi Awal: {X_train.shape})")
start_time = time.time()
@ -117,9 +100,6 @@ grid_search.fit(X_train, y_train)
duration = time.time() - start_time
print(f"\n✅ SELESAI! Waktu proses: {duration/60:.2f} menit")
# ==========================================
# EVALUASI
# ==========================================
best_model = grid_search.best_estimator_
print("\n" + "="*40)
@ -129,7 +109,6 @@ print(grid_search.best_params_)
y_pred = best_model.predict(X_test)
# Inverse Transform Label
y_test_label = le.inverse_transform(y_test)
y_pred_label = le.inverse_transform(y_pred)
@ -139,9 +118,6 @@ print(classification_report(y_test_label, y_pred_label))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_label, y_pred_label))
# ==========================================
# SIMPAN MODEL
# ==========================================
MODEL_DIR = PROJECT_ROOT / "models"
MODEL_DIR.mkdir(exist_ok=True)
model_path = MODEL_DIR / "xgboost_scenario3.pkl"

View File

@ -2,82 +2,58 @@ import joblib
import os
from sklearn.feature_selection import SelectKBest, chi2
# ==========================================
# KONFIGURASI
# ==========================================
base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
# Input (Kita butuh Data Train hasil SMOTE dan Data Test asli)
input_X_train = 'new_X_train_smote.pkl'
input_y_train = 'new_y_train_smote.pkl'
input_X_test = 'X_test_tfidf.pkl' # Test set asli (belum diapa-apakan selain TFIDF)
input_X_test = 'X_test_tfidf.pkl'
# Output
# output_X_train = 'data/chi2/X_train_chi2.pkl'
# output_X_test = 'data/chi2/X_test_chi2.pkl'
# output_selector = 'data/chi2/chisquare_selector.pkl' # Simpan logikanya
output_X_train = 'X_train_chi2.pkl'
output_X_test = 'X_test_chi2.pkl'
output_selector = 'chisquare_selector.pkl' # Simpan logikanya
output_selector = 'chisquare_selector.pkl'
# JUMLAH FITUR YANG INGIN DIAMBIL (Parameter K)
# Silakan ubah angka ini. 1000 adalah angka start yang bagus untuk Skripsi S1.
# Jika fitur awal Anda < 1000, ubah jadi 'all' atau angka lebih kecil (misal 500).
K_FEATURES = 1000
print("--- MEMULAI FEATURE SELECTION (CHI-SQUARE) ---")
try:
# 1. Load Data
print("1. Memuat data...")
# Load Train (SMOTE)
X_train = joblib.load(os.path.join(base_dir, input_X_train))
y_train = joblib.load(os.path.join(base_dir, input_y_train))
# Load Test (TF-IDF Asli)
# Kita butuh ini agar dimensi Test sama dengan Train nanti
X_test = joblib.load(os.path.join(base_dir, input_X_test))
print(f" - Dimensi Awal Train: {X_train.shape}")
print(f" - Dimensi Awal Test: {X_test.shape}")
# Cek jumlah fitur total
total_features = X_train.shape[1]
print(f" - Total kata/fitur saat ini: {total_features}")
# Validasi K
if isinstance(K_FEATURES, int) and K_FEATURES > total_features:
print(f" ⚠️ WARNING: Target k={K_FEATURES} lebih besar dari total fitur ({total_features}). Mengambil semua fitur.")
k_final = 'all'
else:
k_final = K_FEATURES
# 2. Proses Chi-Square
print(f"\n2. Menjalankan Chi-Square (Mengambil Top {k_final} Fitur)...")
# Inisialisasi SelectKBest dengan skor func chi2
selector = SelectKBest(score_func=chi2, k=k_final)
# FIT hanya pada Data Train! (Pelajari mana kata penting dari data latih)
selector.fit(X_train, y_train)
# TRANSFORM pada Train DAN Test
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)
# 3. Validasi Hasil
print("\n3. Hasil Seleksi:")
print(f" - Dimensi Train Baru: {X_train_selected.shape}")
print(f" - Dimensi Test Baru: {X_test_selected.shape}")
# Menampilkan beberapa skor fitur (opsional, untuk info saja)
print(" - Proses seleksi selesai. Dimensi kolom (fitur) telah berkurang.")
# 4. Simpan Data
print("\n4. Menyimpan hasil...")
joblib.dump(X_train_selected, output_X_train)
joblib.dump(X_test_selected, output_X_test)
joblib.dump(selector, output_selector) # Penting untuk prediksi data baru nanti
joblib.dump(selector, output_selector)
print("="*40)
print(f"SUKSES! Data siap untuk Training XGBoost.")

View File

@ -4,58 +4,44 @@ from imblearn.over_sampling import SMOTE
from collections import Counter
import os
# ==========================================
# KONFIGURASI
# ==========================================
# Gunakan relative path agar aman (sama seperti sebelumnya)
base_dir = os.path.dirname(os.path.abspath(__file__))
# Input files (hasil dari TF-IDF sebelumnya)
input_X = 'X_train_tfidf.pkl'
input_y = 'y_train.pkl'
# Output files (hasil SMOTE)
output_X = 'new_X_train_smote.pkl'
output_y = 'new_y_train_smote.pkl'
print("--- MEMULAI PROSES SMOTE (Skenario 3) ---")
try:
# 1. Load Data TF-IDF (Data Latih Saja)
print("1. Memuat data latih TF-IDF...")
# Cek apakah file ada di folder yang sama atau perlu path khusus
if os.path.exists(os.path.join(base_dir, input_X)):
X_train = joblib.load(os.path.join(base_dir, input_X))
y_train = joblib.load(os.path.join(base_dir, input_y))
else:
# Fallback jika file ada di current directory
X_train = joblib.load(input_X)
y_train = joblib.load(input_y)
print(f" - Dimensi Awal: {X_train.shape}")
print(f" - Distribusi Kelas Awal: {Counter(y_train)}")
# Contoh output: {0: 1964, 1: 485, 2: 303} (tergantung mapping label encoder)
# 2. Eksekusi SMOTE
print("\n2. Menjalankan SMOTE (Synthetic Minority Over-sampling)...")
print(" (Sedang membuat data sintetis untuk kelas minoritas...)")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# 3. Validasi Hasil
print("\n3. Validasi Hasil SMOTE:")
print(f" - Dimensi Setelah SMOTE: {X_train_resampled.shape}")
print(f" - Distribusi Kelas Baru: {Counter(y_train_resampled)}")
# Pastikan semua kelas jumlahnya sama
counts = list(Counter(y_train_resampled).values())
if len(set(counts)) == 1:
print(" ✅ SUCCESS: Dataset sekarang SEIMBANG!")
else:
print(" ⚠️ WARNING: Dataset belum seimbang sempurna.")
# 4. Simpan Data SMOTE
print("\n4. Menyimpan data hasil SMOTE...")
joblib.dump(X_train_resampled, output_X)
joblib.dump(y_train_resampled, output_y)

View File

@ -14,16 +14,11 @@ from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import os
# Download NLTK resources (Cukup sekali run)
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
class ReviewScraper:
def __init__(self):
options = Options()
# options.add_argument("--headless")
options.add_argument("--start-maximized")
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_experimental_option(
@ -43,7 +38,6 @@ class ReviewScraper:
def get_review_data(self, container, source_url) -> dict:
try:
# 1. Username
username = "Anonymous"
user_elem = container.find(
'span', attrs={'data-testid': 'proName'})
@ -53,7 +47,6 @@ class ReviewScraper:
if user_elem:
username = user_elem.text
# 2. Rating (Ambil dari aria-label bintang)
rating = "5"
rating_elem = container.find(
'div', attrs={'data-testid': 'icnStarRating'})
@ -63,7 +56,6 @@ class ReviewScraper:
except:
pass
# 3. Ulasan Text
ulasan = ""
ulasan_elem = container.find(
'span', attrs={'data-testid': 'lblItemUlasan'})
@ -72,7 +64,6 @@ class ReviewScraper:
if ulasan_elem:
ulasan = ulasan_elem.text
# Fallback jika ulasan kosong
if not ulasan:
paragraphs = container.find_all('p')
for p in paragraphs:
@ -80,7 +71,6 @@ class ReviewScraper:
ulasan = p.text
break
# 4. Tanggal
waktu_komentar = "Unknown"
date_elem = container.find(
'p', class_=re.compile(r'timestamp|date', re.I))
@ -92,7 +82,6 @@ class ReviewScraper:
waktu_komentar = span.text
break
# VALIDASI: Jangan simpan jika kosong
if not ulasan:
return None
@ -118,13 +107,10 @@ class ReviewScraper:
"""
print(f" ...Mencoba {action} filter Bintang {rating}...")
# Scroll agar elemen masuk viewport
self.driver.execute_script("window.scrollBy(0, 400);")
time.sleep(1)
# STRATEGI XPATH
strategies = [
# Spesifik Tokped baru
f"//label[contains(@for, 'rating') and .//text()='{rating}']",
f"//label[.//text()='{rating}' and .//*[name()='img' or name()='svg']]",
f"//*[text()='Rating']/ancestor::div[2]//label[contains(., '{rating}')]",
@ -134,38 +120,31 @@ class ReviewScraper:
for attempt in range(max_retries):
found_element = None
# Coba cari elemen dengan salah satu strategi
for xpath in strategies:
try:
# Timeout dipendekkan ke 2 detik agar cepat skip jika tidak ada
found_element = WebDriverWait(self.driver, 2).until(
EC.presence_of_element_located((By.XPATH, xpath))
)
# Cek apakah visible & clickable
if found_element.is_displayed():
# Cek apakah disabled (kelas CSS atau atribut)
if "disabled" in found_element.get_attribute("class") or found_element.get_attribute("disabled"):
print(
f" [SKIP] Filter Bintang {rating} ada tapi DISABLED (Non-aktif).")
return False
# Jika elemen ketemu, siap diklik
break
else:
found_element = None # Ketemu di DOM tapi hidden
found_element = None
except TimeoutException:
continue # Coba strategi xpath berikutnya
continue
# HASIL PENCARIAN
if found_element:
try:
# KLIK!
self.driver.execute_script(
"arguments[0].click();", found_element)
print(
f" [SUKSES] Filter Bintang {rating} berhasil di-{action}!")
time.sleep(3) # Tunggu loading data
time.sleep(3)
return True
except Exception as click_error:
if attempt < max_retries - 1:
@ -178,8 +157,6 @@ class ReviewScraper:
f" [ERROR] Gagal klik filter setelah retry: {click_error}")
return False
else:
# PENTING: Jika di attempt pertama tidak ketemu di semua strategi,
# asumsikan filter TIDAK ADA. Jangan retry.
print(
f" [SKIP] Filter Bintang {rating} TIDAK DITEMUKAN (Mungkin 0 ulasan). Lanjut.")
return False
@ -199,7 +176,6 @@ class ReviewScraper:
containers = soup.find_all("article")
if not containers:
# Double check: kadang loading lambat
time.sleep(2)
soup = BeautifulSoup(self.driver.page_source, "html.parser")
containers = soup.find_all(
@ -214,7 +190,6 @@ class ReviewScraper:
for container in containers:
review_data = self.get_review_data(container, url)
if review_data:
# Validasi Rating sesuai Filter
if current_rating_context != "ALL" and review_data['Rating'] != current_rating_context:
continue
@ -229,12 +204,11 @@ class ReviewScraper:
else:
print(f" . Halaman {page_number} tidak ada data baru.")
empty_page_count += 1
if empty_page_count >= 2: # Stop jika 2 halaman berturut-turut zonk
if empty_page_count >= 2:
print(
" [STOP] 2 halaman tanpa data baru. Pindah filter.")
break
# Navigasi Next Button
try:
next_button = self.driver.find_element(
By.CSS_SELECTOR, "button[aria-label^='Laman berikutnya']")
@ -256,35 +230,27 @@ class ReviewScraper:
self.driver.execute_script("window.scrollBy(0, 800);")
time.sleep(2)
# TARGET: Negatif (1,2) & Netral (3)
target_filters = ['1', '2', '3']
for rating in target_filters:
# 1. KLIK FILTER
success = self.toggle_filter(rating, action="CHECK")
if success:
# 2. SCRAPE
self.scrape_pages_current_view(
url, current_rating_context=rating)
# 3. UNCHECK (PENTING: Gunakan logic toggle yang sama)
# Scroll dikit ke atas biar tombol filter kelihatan lagi
self.driver.execute_script("window.scrollBy(0, -300);")
time.sleep(1)
uncheck_success = self.toggle_filter(
rating, action="UNCHECK")
if not uncheck_success:
# Jika gagal uncheck, refresh page adalah jalan ninja
print(
" [REFRESH] Gagal uncheck, refresh halaman untuk reset filter...")
self.driver.refresh()
time.sleep(4)
self.driver.execute_script("window.scrollBy(0, 800);")
else:
# Jika toggle CHECK gagal/tidak ketemu -> LANJUT ke rating berikutnya
# Tidak perlu scrape, tidak perlu uncheck
continue
time.sleep(1)
@ -306,23 +272,18 @@ class ReviewScraper:
self.driver.quit()
if self.data:
# 1. Siapkan Data Baru
df_new = pd.DataFrame(self.data)
df_new = self.label_data(df_new)
filename = 'new_dataset_fix_balanced.csv'
# 2. Cek apakah file sudah ada (Smart Merge Logic)
if os.path.exists(filename):
try:
print(f"\n[INFO] File '{filename}' ditemukan. Membaca data lama...")
df_old = pd.read_csv(filename)
# Gabungkan data lama dan baru
df_combined = pd.concat([df_old, df_new], ignore_index=True)
# 3. Hapus Duplikat
# Kita anggap duplikat jika Username, Review (yang sudah dibersihkan), dan Tanggal sama persis
total_before = len(df_combined)
df_combined.drop_duplicates(subset=['Username', 'Cleaned_Review', 'Date'], keep='first', inplace=True)
total_after = len(df_combined)
@ -339,7 +300,6 @@ class ReviewScraper:
print(f"\n[INFO] File '{filename}' belum ada. Membuat file baru.")
df_final = df_new
# 4. Simpan Hasil Akhir
print("\n=== TOTAL DATASET SETELAH UPDATE ===")
print(df_final['Sentiment'].value_counts())

View File

@ -1,24 +1,17 @@
import pandas as pd
# 1. Load data
df = pd.read_csv('new_final_dataset.csv')
# 2. Pisahkan tiap kelas
df_pos = df[df['Sentiment'] == 'positif']
df_neg = df[df['Sentiment'] == 'negatif']
df_net = df[df['Sentiment'] == 'netral']
# 3. Hitung target (Jumlah Negatif + Netral)
target_count = len(df_neg) + len(df_net) # Hasilnya 1622
target_count = len(df_neg) + len(df_net)
# 4. Ambil sampel acak dari kelas positif sebanyak target_count
df_pos_trimmed = df_pos.sample(n=target_count, random_state=42)
# 5. Gabungkan kembali semua data
df_final = pd.concat([df_pos_trimmed, df_neg, df_net])
# 6. Acak urutan data agar tidak mengumpul
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)
# Simpan hasil
df_final.to_csv('trimmed_sentiment_dataset.csv', index=False)

View File

@ -3,29 +3,22 @@ import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix
# Data Confusion Matrix dari Skenario 3 (Pipeline + SMOTE)
# Baris: Aktual, Kolom: Prediksi
data_cm = np.array([
[146, 34, 19], # Aktual Negatif
[60, 36, 28], # Aktual Netral
[29, 16, 280] # Aktual Positif
[146, 34, 19],
[60, 36, 28],
[29, 16, 280]
])
# Label kategori
labels = ['Negatif', 'Netral', 'Positif']
# Membuat plot
plt.figure(figsize=(8, 6))
sns.set(font_scale=1.2) # Mengatur ukuran font
sns.set(font_scale=1.2)
# Membuat heatmap
ax = sns.heatmap(data_cm, annot=True, fmt='d', cmap='Blues',
xticklabels=labels, yticklabels=labels)
# Menambahkan label dan judul
plt.xlabel('Prediksi', fontsize=14, labelpad=15)
plt.ylabel('Aktual', fontsize=14, labelpad=15)
plt.title('Confusion Matrix Skenario 1 (Baseline)', fontsize=16, pad=20)
# Menampilkan plot
plt.show()

View File

@ -1,7 +1,6 @@
import matplotlib.pyplot as plt
import numpy as np
# Data dari hasil eksperimen Anda
scenarios = ['Skenario 1\n(Baseline)', 'Skenario 2\n(Tuned)', 'Skenario 3\n(Full Optimized)']
accuracy = [0.78, 0.79, 0.77]
macro_f1 = [0.65, 0.66, 0.66]
@ -12,12 +11,10 @@ width = 0.25
fig, ax = plt.subplots(figsize=(10, 6))
# Membuat bar chart
rects1 = ax.bar(x - width, accuracy, width, label='Accuracy', color='#3498db')
rects2 = ax.bar(x, macro_f1, width, label='Macro Avg F1-Score', color='#2ecc71')
rects3 = ax.bar(x + width, recall_netral, width, label='Recall Netral', color='#e74c3c')
# Menambahkan teks dan label
ax.set_ylabel('Scores')
ax.set_title('Perbandingan Performa Model XGBoost antar Skenario')
ax.set_xticks(x)
@ -25,7 +22,6 @@ ax.set_xticklabels(scenarios)
ax.legend(loc='lower right')
ax.set_ylim(0, 1.0)
# Menambahkan label angka di atas bar
def autolabel(rects):
for rect in rects:
height = rect.get_height()