chore: cleaning the code

This commit is contained in:
Mahen 2026-04-10 11:26:03 +07:00
parent 817924bd8c
commit 3613b1a120
11 changed files with 12 additions and 187 deletions

2
run.py
View File

@ -3,10 +3,8 @@ import sys
import uvicorn import uvicorn
if __name__ == "__main__": if __name__ == "__main__":
# Paksa penggunaan SelectorEventLoop di level paling dasar OS Windows
if sys.platform == 'win32': if sys.platform == 'win32':
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
print("✅ Mesin Selector Loop Aktif (Anti-NotImplementedError)") print("✅ Mesin Selector Loop Aktif (Anti-NotImplementedError)")
# Jalankan uvicorn dari sini, bukan dari terminal langsung
uvicorn.run("main:app", host="127.0.0.1", port=8000, reload=True) uvicorn.run("main:app", host="127.0.0.1", port=8000, reload=True)

View File

@ -11,7 +11,6 @@ def clean_product_name(name: str) -> str:
return name.strip() return name.strip()
async def process_product_reviews(candidate: ProductCandidate, user_email: str, metric_id: int, brand_id: int, request: Request): async def process_product_reviews(candidate: ProductCandidate, user_email: str, metric_id: int, brand_id: int, request: Request):
# 1. SETUP ASPEK (Initialize score 0 untuk setiap kategori)
aspect_stats = { aspect_stats = {
aspect: {"positive": 0, "total": 0} aspect: {"positive": 0, "total": 0}
for aspect in config.ASPECT_KEYWORDS.keys() for aspect in config.ASPECT_KEYWORDS.keys()
@ -19,7 +18,6 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str,
print(f"🔍 Memulai Analisis ABSA: {candidate.name[:30]}...") print(f"🔍 Memulai Analisis ABSA: {candidate.name[:30]}...")
# 2. DATABASE PRE-CHECK (Model & User)
model_db = await prisma.model.find_first(where={"modelName": "Model XGBoost (Baseline)"}) model_db = await prisma.model.find_first(where={"modelName": "Model XGBoost (Baseline)"})
if not model_db: if not model_db:
print("❌ ERROR: Model XGBoost tidak ditemukan di database!") print("❌ ERROR: Model XGBoost tidak ditemukan di database!")
@ -30,7 +28,6 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str,
print(f"⚠️ User {user_email} tidak ditemukan!") print(f"⚠️ User {user_email} tidak ditemukan!")
return None return None
# 3. PRODUCT PERSISTENCE
brand_name = clean_product_name(candidate.name.split()[0]) if candidate.name.strip() else "Unknown" brand_name = clean_product_name(candidate.name.split()[0]) if candidate.name.strip() else "Unknown"
product_name = clean_product_name(candidate.name) product_name = clean_product_name(candidate.name)
@ -49,7 +46,6 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str,
} }
) )
# 4. NLP PREDICTION & ASPECT TAGGING LOOP
total_reviews = len(candidate.reviews) total_reviews = len(candidate.reviews)
if total_reviews == 0: return None if total_reviews == 0: return None
@ -69,7 +65,6 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str,
pred_idx = ml_core.model_optimized.predict(vec)[0] pred_idx = ml_core.model_optimized.predict(vec)[0]
label = ml_core.label_encoder.inverse_transform([pred_idx])[0].lower() label = ml_core.label_encoder.inverse_transform([pred_idx])[0].lower()
# Confidence Score dari XGBoost
try: try:
prob = ml_core.model_optimized.predict_proba(vec)[0] prob = ml_core.model_optimized.predict_proba(vec)[0]
confidence_score = float(max(prob)) confidence_score = float(max(prob))
@ -107,13 +102,11 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str,
"userId": user_db.id "userId": user_db.id
}) })
# 5. DATABASE SYNC (Batch Operations)
if reviews_data_to_save: if reviews_data_to_save:
async with prisma.tx() as transaction: async with prisma.tx() as transaction:
await transaction.review.delete_many(where={"productId": product_db.productId}) await transaction.review.delete_many(where={"productId": product_db.productId})
await transaction.review.create_many(data=reviews_data_to_save) await transaction.review.create_many(data=reviews_data_to_save)
# 6. CALCULATION & VERDICT GENERATION
final_aspect_scores = {} final_aspect_scores = {}
for aspect, stat in aspect_stats.items(): for aspect, stat in aspect_stats.items():
score = (stat["positive"] / stat["total"] * 100) if stat["total"] > 0 else 0 score = (stat["positive"] / stat["total"] * 100) if stat["total"] > 0 else 0
@ -140,14 +133,12 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str,
else: else:
verdict_label = "Kurang Disarankan" verdict_label = "Kurang Disarankan"
# 1. Buat Analysis terlebih dahulu untuk mendapatkan ID-nya
new_analysis = await prisma.analysis.create( new_analysis = await prisma.analysis.create(
data={ data={
"userId": user_db.id, "userId": user_db.id,
} }
) )
# 2. Buat Metric dan hubungkan ke Analysis yang baru saja dibuat
await prisma.metric.create( await prisma.metric.create(
data={ data={
"generalSentiment": general_sentiment_pct, "generalSentiment": general_sentiment_pct,

View File

@ -5,9 +5,6 @@ from pathlib import Path
from xgboost import XGBClassifier from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix from sklearn.metrics import classification_report, confusion_matrix
# ==========================================
# 1. KONFIGURASI PATH (PATHLIB)
# ==========================================
SCRIPT_DIR = Path(__file__).resolve().parent SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parents[1] PROJECT_ROOT = SCRIPT_DIR.parents[1]
DATA_DIR = PROJECT_ROOT / "robust_data" DATA_DIR = PROJECT_ROOT / "robust_data"
@ -20,9 +17,6 @@ PATHS = {
"le": DATA_DIR / "tokenize" / "label_encoder.pkl", "le": DATA_DIR / "tokenize" / "label_encoder.pkl",
} }
# ==========================================
# 2. LOAD DATA
# ==========================================
print("\n--- MEMUAT DATA BASELINE ---") print("\n--- MEMUAT DATA BASELINE ---")
data = {} data = {}
@ -47,11 +41,6 @@ le = data['le']
print(f"\nDimensi Training (Imbalanced): {X_train.shape}") print(f"\nDimensi Training (Imbalanced): {X_train.shape}")
# ==========================================
# 3. SETUP MODEL BASELINE
# ==========================================
# Tanpa Grid Search, menggunakan settingan default XGBoost
# Default XGBoost biasanya: learning_rate=0.3, max_depth=6, n_estimators=100
model_baseline = XGBClassifier( model_baseline = XGBClassifier(
objective='multi:softprob', objective='multi:softprob',
num_class=3, num_class=3,
@ -60,9 +49,6 @@ model_baseline = XGBClassifier(
use_label_encoder=False use_label_encoder=False
) )
# ==========================================
# 4. EKSEKUSI TRAINING
# ==========================================
print("\n🔥 MULAI TRAINING BASELINE (SCENARIO 1)...") print("\n🔥 MULAI TRAINING BASELINE (SCENARIO 1)...")
start_time = time.time() start_time = time.time()
@ -71,31 +57,21 @@ model_baseline.fit(X_train, y_train)
duration = time.time() - start_time duration = time.time() - start_time
print(f"\n✅ SELESAI! Waktu proses: {duration:.2f} detik") print(f"\n✅ SELESAI! Waktu proses: {duration:.2f} detik")
# ==========================================
# 5. MENAMPILKAN PARAMETER DEFAULT (BARU)
# ==========================================
# Karena tidak pakai GridSearch, kita ambil parameter langsung dari modelnya
print("\n" + "="*40) print("\n" + "="*40)
print("PARAMETER YANG DIGUNAKAN (DEFAULT)") print("PARAMETER YANG DIGUNAKAN (DEFAULT)")
print("="*40) print("="*40)
# Mengambil seluruh parameter model
all_params = model_baseline.get_params() all_params = model_baseline.get_params()
# Kita filter hanya parameter penting untuk dibandingkan dengan Skenario 2 & 3
key_params = ['learning_rate', 'max_depth', 'n_estimators', 'subsample', 'colsample_bytree'] key_params = ['learning_rate', 'max_depth', 'n_estimators', 'subsample', 'colsample_bytree']
shown_params = {k: all_params.get(k) for k in key_params} shown_params = {k: all_params.get(k) for k in key_params}
# Jika n_estimators atau learning_rate None (karena default library), kita set nilai standarnya manual untuk info
if shown_params['n_estimators'] is None: shown_params['n_estimators'] = "100 (Default)" if shown_params['n_estimators'] is None: shown_params['n_estimators'] = "100 (Default)"
if shown_params['learning_rate'] is None: shown_params['learning_rate'] = "Default" if shown_params['learning_rate'] is None: shown_params['learning_rate'] = "Default"
print(shown_params) print(shown_params)
print("(Gunakan nilai ini untuk perbandingan di Bab 4)") print("(Gunakan nilai ini untuk perbandingan di Bab 4)")
# ==========================================
# 6. EVALUASI & SIMPAN
# ==========================================
print("\n" + "="*40) print("\n" + "="*40)
print("HASIL SKENARIO 1 (BASELINE)") print("HASIL SKENARIO 1 (BASELINE)")
print("="*40) print("="*40)
@ -111,7 +87,6 @@ print(classification_report(y_test_label, y_pred_label))
print("\nConfusion Matrix:") print("\nConfusion Matrix:")
print(confusion_matrix(y_test_label, y_pred_label)) print(confusion_matrix(y_test_label, y_pred_label))
# Simpan Model Baseline
model_path = SCRIPT_DIR / 'new_xgboost_scenario1.pkl' model_path = SCRIPT_DIR / 'new_xgboost_scenario1.pkl'
joblib.dump(model_baseline, model_path) joblib.dump(model_baseline, model_path)
print(f"\n💾 Model baseline disimpan ke: {model_path}") print(f"\n💾 Model baseline disimpan ke: {model_path}")

View File

@ -6,9 +6,6 @@ from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix from sklearn.metrics import classification_report, confusion_matrix
# ==========================================
# 1. KONFIGURASI PATH (PATHLIB)
# ==========================================
SCRIPT_DIR = Path(__file__).resolve().parent SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parents[1] PROJECT_ROOT = SCRIPT_DIR.parents[1]
DATA_DIR = PROJECT_ROOT / "robust_data" DATA_DIR = PROJECT_ROOT / "robust_data"
@ -21,9 +18,6 @@ PATHS = {
"le": DATA_DIR / "tokenize" / "label_encoder.pkl", "le": DATA_DIR / "tokenize" / "label_encoder.pkl",
} }
# ==========================================
# 2. LOAD DATA
# ==========================================
print("\n--- MEMUAT DATA SCENARIO 2 ---") print("\n--- MEMUAT DATA SCENARIO 2 ---")
data = {} data = {}
@ -46,11 +40,6 @@ le = data['le']
print(f"\nDimensi Training (Imbalanced): {X_train.shape}") print(f"\nDimensi Training (Imbalanced): {X_train.shape}")
# ==========================================
# 3. SETUP GRID SEARCH (SAMA DENGAN SKENARIO 3)
# ==========================================
# Kita gunakan range parameter yang SAMA PERSIS dengan Skenario 3
# agar perbandingannya adil (apple-to-apple).
param_grid = { param_grid = {
'learning_rate': [0.01, 0.1, 0.2], 'learning_rate': [0.01, 0.1, 0.2],
'max_depth': [3, 5, 7], 'max_depth': [3, 5, 7],
@ -67,7 +56,6 @@ xgb = XGBClassifier(
use_label_encoder=False use_label_encoder=False
) )
# Gunakan F1-Macro agar Grid Search mencoba adil ke kelas minoritas
grid_search = GridSearchCV( grid_search = GridSearchCV(
estimator=xgb, estimator=xgb,
param_grid=param_grid, param_grid=param_grid,
@ -77,9 +65,6 @@ grid_search = GridSearchCV(
verbose=1 verbose=1
) )
# ==========================================
# 4. EKSEKUSI TRAINING
# ==========================================
print("\n🔥 MULAI TRAINING & GRID SEARCH (SCENARIO 2)...") print("\n🔥 MULAI TRAINING & GRID SEARCH (SCENARIO 2)...")
print("Sedang mencari parameter terbaik untuk data Imbalanced...") print("Sedang mencari parameter terbaik untuk data Imbalanced...")
start_time = time.time() start_time = time.time()
@ -89,9 +74,6 @@ grid_search.fit(X_train, y_train)
duration = time.time() - start_time duration = time.time() - start_time
print(f"\n✅ SELESAI! Waktu proses: {duration/60:.2f} menit") print(f"\n✅ SELESAI! Waktu proses: {duration/60:.2f} menit")
# ==========================================
# 5. EVALUASI & SIMPAN
# ==========================================
best_model = grid_search.best_estimator_ best_model = grid_search.best_estimator_
print("\n" + "="*40) print("\n" + "="*40)
@ -114,7 +96,6 @@ print(classification_report(y_test_label, y_pred_label))
print("\nConfusion Matrix:") print("\nConfusion Matrix:")
print(confusion_matrix(y_test_label, y_pred_label)) print(confusion_matrix(y_test_label, y_pred_label))
# Simpan Model Skenario 2
model_path = SCRIPT_DIR / 'new_model_xgboost_scenario2.pkl' model_path = SCRIPT_DIR / 'new_model_xgboost_scenario2.pkl'
joblib.dump(best_model, model_path) joblib.dump(best_model, model_path)
print(f"\n💾 Model Skenario 2 disimpan ke: {model_path}") print(f"\n💾 Model Skenario 2 disimpan ke: {model_path}")

View File

@ -11,9 +11,6 @@ from sklearn.metrics import classification_report, confusion_matrix
from imblearn.pipeline import Pipeline as ImbPipeline from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE from imblearn.over_sampling import SMOTE
# ==========================================
# KONFIGURASI PATH
# ==========================================
SCRIPT_DIR = Path(__file__).resolve().parent SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parents[1] PROJECT_ROOT = SCRIPT_DIR.parents[1]
DATA_DIR = PROJECT_ROOT / "robust_data" DATA_DIR = PROJECT_ROOT / "robust_data"
@ -28,7 +25,6 @@ PATHS = {
print("--- MENYIAPKAN TRAINING SCENARIO 3 (PIPELINE: SMOTE + CHI2 + XGBOOST) ---") print("--- MENYIAPKAN TRAINING SCENARIO 3 (PIPELINE: SMOTE + CHI2 + XGBOOST) ---")
# Load Data
data = {} data = {}
for name, path in PATHS.items(): for name, path in PATHS.items():
if not path.exists(): if not path.exists():
@ -41,9 +37,6 @@ X_train, y_train = data["X_train"], data["y_train"]
X_test, y_test = data["X_test"], data["y_test"] X_test, y_test = data["X_test"], data["y_test"]
le = data["le"] le = data["le"]
# ==========================================
# REPORT PROPORSI DATA (SEBELUM & SESUDAH SMOTE)
# ==========================================
print("\n" + "="*40) print("\n" + "="*40)
print("REPORT PROPORSI DATA") print("REPORT PROPORSI DATA")
print("="*40) print("="*40)
@ -57,16 +50,12 @@ def print_proportion(y, title):
print_proportion(y_train, "PROPORSI DATA AWAL (TRAIN)") print_proportion(y_train, "PROPORSI DATA AWAL (TRAIN)")
# Simulasi SMOTE untuk melihat hasil akhir yang akan diproses Pipeline
sm_sim = SMOTE(random_state=42) sm_sim = SMOTE(random_state=42)
_, y_resampled_sim = sm_sim.fit_resample(X_train, y_train) _, y_resampled_sim = sm_sim.fit_resample(X_train, y_train)
print_proportion(y_resampled_sim, "ESTIMASI PROPORSI SETELAH SMOTE (DALAM PIPELINE)") print_proportion(y_resampled_sim, "ESTIMASI PROPORSI SETELAH SMOTE (DALAM PIPELINE)")
print("\n" + "="*40) print("\n" + "="*40)
# ==========================================
# DEFINISI PIPELINE
# ==========================================
pipeline = ImbPipeline([ pipeline = ImbPipeline([
('smote', SMOTE(random_state=42)), ('smote', SMOTE(random_state=42)),
('selector', SelectKBest(score_func=chi2, k=2000)), ('selector', SelectKBest(score_func=chi2, k=2000)),
@ -79,9 +68,6 @@ pipeline = ImbPipeline([
)) ))
]) ])
# ==========================================
# SETTING GRID SEARCH
# ==========================================
# param_grid = { # param_grid = {
# 'clf__learning_rate': [0.1, 0.2], # 'clf__learning_rate': [0.1, 0.2],
# 'clf__max_depth': [5, 7], # 'clf__max_depth': [5, 7],
@ -106,9 +92,6 @@ grid_search = GridSearchCV(
verbose=2 verbose=2
) )
# ==========================================
# EKSEKUSI TRAINING
# ==========================================
print(f"\n🔥 MULAI TRAINING... (Dimensi Awal: {X_train.shape})") print(f"\n🔥 MULAI TRAINING... (Dimensi Awal: {X_train.shape})")
start_time = time.time() start_time = time.time()
@ -117,9 +100,6 @@ grid_search.fit(X_train, y_train)
duration = time.time() - start_time duration = time.time() - start_time
print(f"\n✅ SELESAI! Waktu proses: {duration/60:.2f} menit") print(f"\n✅ SELESAI! Waktu proses: {duration/60:.2f} menit")
# ==========================================
# EVALUASI
# ==========================================
best_model = grid_search.best_estimator_ best_model = grid_search.best_estimator_
print("\n" + "="*40) print("\n" + "="*40)
@ -129,7 +109,6 @@ print(grid_search.best_params_)
y_pred = best_model.predict(X_test) y_pred = best_model.predict(X_test)
# Inverse Transform Label
y_test_label = le.inverse_transform(y_test) y_test_label = le.inverse_transform(y_test)
y_pred_label = le.inverse_transform(y_pred) y_pred_label = le.inverse_transform(y_pred)
@ -139,9 +118,6 @@ print(classification_report(y_test_label, y_pred_label))
print("\nConfusion Matrix:") print("\nConfusion Matrix:")
print(confusion_matrix(y_test_label, y_pred_label)) print(confusion_matrix(y_test_label, y_pred_label))
# ==========================================
# SIMPAN MODEL
# ==========================================
MODEL_DIR = PROJECT_ROOT / "models" MODEL_DIR = PROJECT_ROOT / "models"
MODEL_DIR.mkdir(exist_ok=True) MODEL_DIR.mkdir(exist_ok=True)
model_path = MODEL_DIR / "xgboost_scenario3.pkl" model_path = MODEL_DIR / "xgboost_scenario3.pkl"

View File

@ -2,82 +2,58 @@ import joblib
import os import os
from sklearn.feature_selection import SelectKBest, chi2 from sklearn.feature_selection import SelectKBest, chi2
# ==========================================
# KONFIGURASI
# ==========================================
base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
# Input (Kita butuh Data Train hasil SMOTE dan Data Test asli)
input_X_train = 'new_X_train_smote.pkl' input_X_train = 'new_X_train_smote.pkl'
input_y_train = 'new_y_train_smote.pkl' input_y_train = 'new_y_train_smote.pkl'
input_X_test = 'X_test_tfidf.pkl' # Test set asli (belum diapa-apakan selain TFIDF) input_X_test = 'X_test_tfidf.pkl'
# Output
# output_X_train = 'data/chi2/X_train_chi2.pkl'
# output_X_test = 'data/chi2/X_test_chi2.pkl'
# output_selector = 'data/chi2/chisquare_selector.pkl' # Simpan logikanya
output_X_train = 'X_train_chi2.pkl' output_X_train = 'X_train_chi2.pkl'
output_X_test = 'X_test_chi2.pkl' output_X_test = 'X_test_chi2.pkl'
output_selector = 'chisquare_selector.pkl' # Simpan logikanya output_selector = 'chisquare_selector.pkl'
# JUMLAH FITUR YANG INGIN DIAMBIL (Parameter K)
# Silakan ubah angka ini. 1000 adalah angka start yang bagus untuk Skripsi S1.
# Jika fitur awal Anda < 1000, ubah jadi 'all' atau angka lebih kecil (misal 500).
K_FEATURES = 1000 K_FEATURES = 1000
print("--- MEMULAI FEATURE SELECTION (CHI-SQUARE) ---") print("--- MEMULAI FEATURE SELECTION (CHI-SQUARE) ---")
try: try:
# 1. Load Data
print("1. Memuat data...") print("1. Memuat data...")
# Load Train (SMOTE)
X_train = joblib.load(os.path.join(base_dir, input_X_train)) X_train = joblib.load(os.path.join(base_dir, input_X_train))
y_train = joblib.load(os.path.join(base_dir, input_y_train)) y_train = joblib.load(os.path.join(base_dir, input_y_train))
# Load Test (TF-IDF Asli)
# Kita butuh ini agar dimensi Test sama dengan Train nanti
X_test = joblib.load(os.path.join(base_dir, input_X_test)) X_test = joblib.load(os.path.join(base_dir, input_X_test))
print(f" - Dimensi Awal Train: {X_train.shape}") print(f" - Dimensi Awal Train: {X_train.shape}")
print(f" - Dimensi Awal Test: {X_test.shape}") print(f" - Dimensi Awal Test: {X_test.shape}")
# Cek jumlah fitur total
total_features = X_train.shape[1] total_features = X_train.shape[1]
print(f" - Total kata/fitur saat ini: {total_features}") print(f" - Total kata/fitur saat ini: {total_features}")
# Validasi K
if isinstance(K_FEATURES, int) and K_FEATURES > total_features: if isinstance(K_FEATURES, int) and K_FEATURES > total_features:
print(f" ⚠️ WARNING: Target k={K_FEATURES} lebih besar dari total fitur ({total_features}). Mengambil semua fitur.") print(f" ⚠️ WARNING: Target k={K_FEATURES} lebih besar dari total fitur ({total_features}). Mengambil semua fitur.")
k_final = 'all' k_final = 'all'
else: else:
k_final = K_FEATURES k_final = K_FEATURES
# 2. Proses Chi-Square
print(f"\n2. Menjalankan Chi-Square (Mengambil Top {k_final} Fitur)...") print(f"\n2. Menjalankan Chi-Square (Mengambil Top {k_final} Fitur)...")
# Inisialisasi SelectKBest dengan skor func chi2
selector = SelectKBest(score_func=chi2, k=k_final) selector = SelectKBest(score_func=chi2, k=k_final)
# FIT hanya pada Data Train! (Pelajari mana kata penting dari data latih)
selector.fit(X_train, y_train) selector.fit(X_train, y_train)
# TRANSFORM pada Train DAN Test
X_train_selected = selector.transform(X_train) X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test) X_test_selected = selector.transform(X_test)
# 3. Validasi Hasil
print("\n3. Hasil Seleksi:") print("\n3. Hasil Seleksi:")
print(f" - Dimensi Train Baru: {X_train_selected.shape}") print(f" - Dimensi Train Baru: {X_train_selected.shape}")
print(f" - Dimensi Test Baru: {X_test_selected.shape}") print(f" - Dimensi Test Baru: {X_test_selected.shape}")
# Menampilkan beberapa skor fitur (opsional, untuk info saja)
print(" - Proses seleksi selesai. Dimensi kolom (fitur) telah berkurang.") print(" - Proses seleksi selesai. Dimensi kolom (fitur) telah berkurang.")
# 4. Simpan Data
print("\n4. Menyimpan hasil...") print("\n4. Menyimpan hasil...")
joblib.dump(X_train_selected, output_X_train) joblib.dump(X_train_selected, output_X_train)
joblib.dump(X_test_selected, output_X_test) joblib.dump(X_test_selected, output_X_test)
joblib.dump(selector, output_selector) # Penting untuk prediksi data baru nanti joblib.dump(selector, output_selector)
print("="*40) print("="*40)
print(f"SUKSES! Data siap untuk Training XGBoost.") print(f"SUKSES! Data siap untuk Training XGBoost.")

View File

@ -4,58 +4,44 @@ from imblearn.over_sampling import SMOTE
from collections import Counter from collections import Counter
import os import os
# ==========================================
# KONFIGURASI
# ==========================================
# Gunakan relative path agar aman (sama seperti sebelumnya)
base_dir = os.path.dirname(os.path.abspath(__file__)) base_dir = os.path.dirname(os.path.abspath(__file__))
# Input files (hasil dari TF-IDF sebelumnya)
input_X = 'X_train_tfidf.pkl' input_X = 'X_train_tfidf.pkl'
input_y = 'y_train.pkl' input_y = 'y_train.pkl'
# Output files (hasil SMOTE)
output_X = 'new_X_train_smote.pkl' output_X = 'new_X_train_smote.pkl'
output_y = 'new_y_train_smote.pkl' output_y = 'new_y_train_smote.pkl'
print("--- MEMULAI PROSES SMOTE (Skenario 3) ---") print("--- MEMULAI PROSES SMOTE (Skenario 3) ---")
try: try:
# 1. Load Data TF-IDF (Data Latih Saja)
print("1. Memuat data latih TF-IDF...") print("1. Memuat data latih TF-IDF...")
# Cek apakah file ada di folder yang sama atau perlu path khusus
if os.path.exists(os.path.join(base_dir, input_X)): if os.path.exists(os.path.join(base_dir, input_X)):
X_train = joblib.load(os.path.join(base_dir, input_X)) X_train = joblib.load(os.path.join(base_dir, input_X))
y_train = joblib.load(os.path.join(base_dir, input_y)) y_train = joblib.load(os.path.join(base_dir, input_y))
else: else:
# Fallback jika file ada di current directory
X_train = joblib.load(input_X) X_train = joblib.load(input_X)
y_train = joblib.load(input_y) y_train = joblib.load(input_y)
print(f" - Dimensi Awal: {X_train.shape}") print(f" - Dimensi Awal: {X_train.shape}")
print(f" - Distribusi Kelas Awal: {Counter(y_train)}") print(f" - Distribusi Kelas Awal: {Counter(y_train)}")
# Contoh output: {0: 1964, 1: 485, 2: 303} (tergantung mapping label encoder)
# 2. Eksekusi SMOTE
print("\n2. Menjalankan SMOTE (Synthetic Minority Over-sampling)...") print("\n2. Menjalankan SMOTE (Synthetic Minority Over-sampling)...")
print(" (Sedang membuat data sintetis untuk kelas minoritas...)") print(" (Sedang membuat data sintetis untuk kelas minoritas...)")
smote = SMOTE(random_state=42) smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train) X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# 3. Validasi Hasil
print("\n3. Validasi Hasil SMOTE:") print("\n3. Validasi Hasil SMOTE:")
print(f" - Dimensi Setelah SMOTE: {X_train_resampled.shape}") print(f" - Dimensi Setelah SMOTE: {X_train_resampled.shape}")
print(f" - Distribusi Kelas Baru: {Counter(y_train_resampled)}") print(f" - Distribusi Kelas Baru: {Counter(y_train_resampled)}")
# Pastikan semua kelas jumlahnya sama
counts = list(Counter(y_train_resampled).values()) counts = list(Counter(y_train_resampled).values())
if len(set(counts)) == 1: if len(set(counts)) == 1:
print(" ✅ SUCCESS: Dataset sekarang SEIMBANG!") print(" ✅ SUCCESS: Dataset sekarang SEIMBANG!")
else: else:
print(" ⚠️ WARNING: Dataset belum seimbang sempurna.") print(" ⚠️ WARNING: Dataset belum seimbang sempurna.")
# 4. Simpan Data SMOTE
print("\n4. Menyimpan data hasil SMOTE...") print("\n4. Menyimpan data hasil SMOTE...")
joblib.dump(X_train_resampled, output_X) joblib.dump(X_train_resampled, output_X)
joblib.dump(y_train_resampled, output_y) joblib.dump(y_train_resampled, output_y)

View File

@ -14,16 +14,11 @@ from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer from nltk.stem import WordNetLemmatizer
import os import os
# Download NLTK resources (Cukup sekali run)
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
class ReviewScraper: class ReviewScraper:
def __init__(self): def __init__(self):
options = Options() options = Options()
# options.add_argument("--headless")
options.add_argument("--start-maximized") options.add_argument("--start-maximized")
options.add_argument("--disable-blink-features=AutomationControlled") options.add_argument("--disable-blink-features=AutomationControlled")
options.add_experimental_option( options.add_experimental_option(
@ -43,7 +38,6 @@ class ReviewScraper:
def get_review_data(self, container, source_url) -> dict: def get_review_data(self, container, source_url) -> dict:
try: try:
# 1. Username
username = "Anonymous" username = "Anonymous"
user_elem = container.find( user_elem = container.find(
'span', attrs={'data-testid': 'proName'}) 'span', attrs={'data-testid': 'proName'})
@ -53,7 +47,6 @@ class ReviewScraper:
if user_elem: if user_elem:
username = user_elem.text username = user_elem.text
# 2. Rating (Ambil dari aria-label bintang)
rating = "5" rating = "5"
rating_elem = container.find( rating_elem = container.find(
'div', attrs={'data-testid': 'icnStarRating'}) 'div', attrs={'data-testid': 'icnStarRating'})
@ -63,7 +56,6 @@ class ReviewScraper:
except: except:
pass pass
# 3. Ulasan Text
ulasan = "" ulasan = ""
ulasan_elem = container.find( ulasan_elem = container.find(
'span', attrs={'data-testid': 'lblItemUlasan'}) 'span', attrs={'data-testid': 'lblItemUlasan'})
@ -72,7 +64,6 @@ class ReviewScraper:
if ulasan_elem: if ulasan_elem:
ulasan = ulasan_elem.text ulasan = ulasan_elem.text
# Fallback jika ulasan kosong
if not ulasan: if not ulasan:
paragraphs = container.find_all('p') paragraphs = container.find_all('p')
for p in paragraphs: for p in paragraphs:
@ -80,7 +71,6 @@ class ReviewScraper:
ulasan = p.text ulasan = p.text
break break
# 4. Tanggal
waktu_komentar = "Unknown" waktu_komentar = "Unknown"
date_elem = container.find( date_elem = container.find(
'p', class_=re.compile(r'timestamp|date', re.I)) 'p', class_=re.compile(r'timestamp|date', re.I))
@ -92,7 +82,6 @@ class ReviewScraper:
waktu_komentar = span.text waktu_komentar = span.text
break break
# VALIDASI: Jangan simpan jika kosong
if not ulasan: if not ulasan:
return None return None
@ -118,13 +107,10 @@ class ReviewScraper:
""" """
print(f" ...Mencoba {action} filter Bintang {rating}...") print(f" ...Mencoba {action} filter Bintang {rating}...")
# Scroll agar elemen masuk viewport
self.driver.execute_script("window.scrollBy(0, 400);") self.driver.execute_script("window.scrollBy(0, 400);")
time.sleep(1) time.sleep(1)
# STRATEGI XPATH
strategies = [ strategies = [
# Spesifik Tokped baru
f"//label[contains(@for, 'rating') and .//text()='{rating}']", f"//label[contains(@for, 'rating') and .//text()='{rating}']",
f"//label[.//text()='{rating}' and .//*[name()='img' or name()='svg']]", f"//label[.//text()='{rating}' and .//*[name()='img' or name()='svg']]",
f"//*[text()='Rating']/ancestor::div[2]//label[contains(., '{rating}')]", f"//*[text()='Rating']/ancestor::div[2]//label[contains(., '{rating}')]",
@ -134,38 +120,31 @@ class ReviewScraper:
for attempt in range(max_retries): for attempt in range(max_retries):
found_element = None found_element = None
# Coba cari elemen dengan salah satu strategi
for xpath in strategies: for xpath in strategies:
try: try:
# Timeout dipendekkan ke 2 detik agar cepat skip jika tidak ada
found_element = WebDriverWait(self.driver, 2).until( found_element = WebDriverWait(self.driver, 2).until(
EC.presence_of_element_located((By.XPATH, xpath)) EC.presence_of_element_located((By.XPATH, xpath))
) )
# Cek apakah visible & clickable
if found_element.is_displayed(): if found_element.is_displayed():
# Cek apakah disabled (kelas CSS atau atribut)
if "disabled" in found_element.get_attribute("class") or found_element.get_attribute("disabled"): if "disabled" in found_element.get_attribute("class") or found_element.get_attribute("disabled"):
print( print(
f" [SKIP] Filter Bintang {rating} ada tapi DISABLED (Non-aktif).") f" [SKIP] Filter Bintang {rating} ada tapi DISABLED (Non-aktif).")
return False return False
# Jika elemen ketemu, siap diklik
break break
else: else:
found_element = None # Ketemu di DOM tapi hidden found_element = None
except TimeoutException: except TimeoutException:
continue # Coba strategi xpath berikutnya continue
# HASIL PENCARIAN
if found_element: if found_element:
try: try:
# KLIK!
self.driver.execute_script( self.driver.execute_script(
"arguments[0].click();", found_element) "arguments[0].click();", found_element)
print( print(
f" [SUKSES] Filter Bintang {rating} berhasil di-{action}!") f" [SUKSES] Filter Bintang {rating} berhasil di-{action}!")
time.sleep(3) # Tunggu loading data time.sleep(3)
return True return True
except Exception as click_error: except Exception as click_error:
if attempt < max_retries - 1: if attempt < max_retries - 1:
@ -178,8 +157,6 @@ class ReviewScraper:
f" [ERROR] Gagal klik filter setelah retry: {click_error}") f" [ERROR] Gagal klik filter setelah retry: {click_error}")
return False return False
else: else:
# PENTING: Jika di attempt pertama tidak ketemu di semua strategi,
# asumsikan filter TIDAK ADA. Jangan retry.
print( print(
f" [SKIP] Filter Bintang {rating} TIDAK DITEMUKAN (Mungkin 0 ulasan). Lanjut.") f" [SKIP] Filter Bintang {rating} TIDAK DITEMUKAN (Mungkin 0 ulasan). Lanjut.")
return False return False
@ -199,7 +176,6 @@ class ReviewScraper:
containers = soup.find_all("article") containers = soup.find_all("article")
if not containers: if not containers:
# Double check: kadang loading lambat
time.sleep(2) time.sleep(2)
soup = BeautifulSoup(self.driver.page_source, "html.parser") soup = BeautifulSoup(self.driver.page_source, "html.parser")
containers = soup.find_all( containers = soup.find_all(
@ -214,7 +190,6 @@ class ReviewScraper:
for container in containers: for container in containers:
review_data = self.get_review_data(container, url) review_data = self.get_review_data(container, url)
if review_data: if review_data:
# Validasi Rating sesuai Filter
if current_rating_context != "ALL" and review_data['Rating'] != current_rating_context: if current_rating_context != "ALL" and review_data['Rating'] != current_rating_context:
continue continue
@ -229,12 +204,11 @@ class ReviewScraper:
else: else:
print(f" . Halaman {page_number} tidak ada data baru.") print(f" . Halaman {page_number} tidak ada data baru.")
empty_page_count += 1 empty_page_count += 1
if empty_page_count >= 2: # Stop jika 2 halaman berturut-turut zonk if empty_page_count >= 2:
print( print(
" [STOP] 2 halaman tanpa data baru. Pindah filter.") " [STOP] 2 halaman tanpa data baru. Pindah filter.")
break break
# Navigasi Next Button
try: try:
next_button = self.driver.find_element( next_button = self.driver.find_element(
By.CSS_SELECTOR, "button[aria-label^='Laman berikutnya']") By.CSS_SELECTOR, "button[aria-label^='Laman berikutnya']")
@ -256,35 +230,27 @@ class ReviewScraper:
self.driver.execute_script("window.scrollBy(0, 800);") self.driver.execute_script("window.scrollBy(0, 800);")
time.sleep(2) time.sleep(2)
# TARGET: Negatif (1,2) & Netral (3)
target_filters = ['1', '2', '3'] target_filters = ['1', '2', '3']
for rating in target_filters: for rating in target_filters:
# 1. KLIK FILTER
success = self.toggle_filter(rating, action="CHECK") success = self.toggle_filter(rating, action="CHECK")
if success: if success:
# 2. SCRAPE
self.scrape_pages_current_view( self.scrape_pages_current_view(
url, current_rating_context=rating) url, current_rating_context=rating)
# 3. UNCHECK (PENTING: Gunakan logic toggle yang sama)
# Scroll dikit ke atas biar tombol filter kelihatan lagi
self.driver.execute_script("window.scrollBy(0, -300);") self.driver.execute_script("window.scrollBy(0, -300);")
time.sleep(1) time.sleep(1)
uncheck_success = self.toggle_filter( uncheck_success = self.toggle_filter(
rating, action="UNCHECK") rating, action="UNCHECK")
if not uncheck_success: if not uncheck_success:
# Jika gagal uncheck, refresh page adalah jalan ninja
print( print(
" [REFRESH] Gagal uncheck, refresh halaman untuk reset filter...") " [REFRESH] Gagal uncheck, refresh halaman untuk reset filter...")
self.driver.refresh() self.driver.refresh()
time.sleep(4) time.sleep(4)
self.driver.execute_script("window.scrollBy(0, 800);") self.driver.execute_script("window.scrollBy(0, 800);")
else: else:
# Jika toggle CHECK gagal/tidak ketemu -> LANJUT ke rating berikutnya
# Tidak perlu scrape, tidak perlu uncheck
continue continue
time.sleep(1) time.sleep(1)
@ -306,23 +272,18 @@ class ReviewScraper:
self.driver.quit() self.driver.quit()
if self.data: if self.data:
# 1. Siapkan Data Baru
df_new = pd.DataFrame(self.data) df_new = pd.DataFrame(self.data)
df_new = self.label_data(df_new) df_new = self.label_data(df_new)
filename = 'new_dataset_fix_balanced.csv' filename = 'new_dataset_fix_balanced.csv'
# 2. Cek apakah file sudah ada (Smart Merge Logic)
if os.path.exists(filename): if os.path.exists(filename):
try: try:
print(f"\n[INFO] File '{filename}' ditemukan. Membaca data lama...") print(f"\n[INFO] File '{filename}' ditemukan. Membaca data lama...")
df_old = pd.read_csv(filename) df_old = pd.read_csv(filename)
# Gabungkan data lama dan baru
df_combined = pd.concat([df_old, df_new], ignore_index=True) df_combined = pd.concat([df_old, df_new], ignore_index=True)
# 3. Hapus Duplikat
# Kita anggap duplikat jika Username, Review (yang sudah dibersihkan), dan Tanggal sama persis
total_before = len(df_combined) total_before = len(df_combined)
df_combined.drop_duplicates(subset=['Username', 'Cleaned_Review', 'Date'], keep='first', inplace=True) df_combined.drop_duplicates(subset=['Username', 'Cleaned_Review', 'Date'], keep='first', inplace=True)
total_after = len(df_combined) total_after = len(df_combined)
@ -339,7 +300,6 @@ class ReviewScraper:
print(f"\n[INFO] File '{filename}' belum ada. Membuat file baru.") print(f"\n[INFO] File '{filename}' belum ada. Membuat file baru.")
df_final = df_new df_final = df_new
# 4. Simpan Hasil Akhir
print("\n=== TOTAL DATASET SETELAH UPDATE ===") print("\n=== TOTAL DATASET SETELAH UPDATE ===")
print(df_final['Sentiment'].value_counts()) print(df_final['Sentiment'].value_counts())

View File

@ -1,24 +1,17 @@
import pandas as pd import pandas as pd
# 1. Load data
df = pd.read_csv('new_final_dataset.csv') df = pd.read_csv('new_final_dataset.csv')
# 2. Pisahkan tiap kelas
df_pos = df[df['Sentiment'] == 'positif'] df_pos = df[df['Sentiment'] == 'positif']
df_neg = df[df['Sentiment'] == 'negatif'] df_neg = df[df['Sentiment'] == 'negatif']
df_net = df[df['Sentiment'] == 'netral'] df_net = df[df['Sentiment'] == 'netral']
# 3. Hitung target (Jumlah Negatif + Netral) target_count = len(df_neg) + len(df_net)
target_count = len(df_neg) + len(df_net) # Hasilnya 1622
# 4. Ambil sampel acak dari kelas positif sebanyak target_count
df_pos_trimmed = df_pos.sample(n=target_count, random_state=42) df_pos_trimmed = df_pos.sample(n=target_count, random_state=42)
# 5. Gabungkan kembali semua data
df_final = pd.concat([df_pos_trimmed, df_neg, df_net]) df_final = pd.concat([df_pos_trimmed, df_neg, df_net])
# 6. Acak urutan data agar tidak mengumpul
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True) df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)
# Simpan hasil
df_final.to_csv('trimmed_sentiment_dataset.csv', index=False) df_final.to_csv('trimmed_sentiment_dataset.csv', index=False)

View File

@ -3,29 +3,22 @@ import seaborn as sns
import numpy as np import numpy as np
from sklearn.metrics import confusion_matrix from sklearn.metrics import confusion_matrix
# Data Confusion Matrix dari Skenario 3 (Pipeline + SMOTE)
# Baris: Aktual, Kolom: Prediksi
data_cm = np.array([ data_cm = np.array([
[146, 34, 19], # Aktual Negatif [146, 34, 19],
[60, 36, 28], # Aktual Netral [60, 36, 28],
[29, 16, 280] # Aktual Positif [29, 16, 280]
]) ])
# Label kategori
labels = ['Negatif', 'Netral', 'Positif'] labels = ['Negatif', 'Netral', 'Positif']
# Membuat plot
plt.figure(figsize=(8, 6)) plt.figure(figsize=(8, 6))
sns.set(font_scale=1.2) # Mengatur ukuran font sns.set(font_scale=1.2)
# Membuat heatmap
ax = sns.heatmap(data_cm, annot=True, fmt='d', cmap='Blues', ax = sns.heatmap(data_cm, annot=True, fmt='d', cmap='Blues',
xticklabels=labels, yticklabels=labels) xticklabels=labels, yticklabels=labels)
# Menambahkan label dan judul
plt.xlabel('Prediksi', fontsize=14, labelpad=15) plt.xlabel('Prediksi', fontsize=14, labelpad=15)
plt.ylabel('Aktual', fontsize=14, labelpad=15) plt.ylabel('Aktual', fontsize=14, labelpad=15)
plt.title('Confusion Matrix Skenario 1 (Baseline)', fontsize=16, pad=20) plt.title('Confusion Matrix Skenario 1 (Baseline)', fontsize=16, pad=20)
# Menampilkan plot
plt.show() plt.show()

View File

@ -1,7 +1,6 @@
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np import numpy as np
# Data dari hasil eksperimen Anda
scenarios = ['Skenario 1\n(Baseline)', 'Skenario 2\n(Tuned)', 'Skenario 3\n(Full Optimized)'] scenarios = ['Skenario 1\n(Baseline)', 'Skenario 2\n(Tuned)', 'Skenario 3\n(Full Optimized)']
accuracy = [0.78, 0.79, 0.77] accuracy = [0.78, 0.79, 0.77]
macro_f1 = [0.65, 0.66, 0.66] macro_f1 = [0.65, 0.66, 0.66]
@ -12,12 +11,10 @@ width = 0.25
fig, ax = plt.subplots(figsize=(10, 6)) fig, ax = plt.subplots(figsize=(10, 6))
# Membuat bar chart
rects1 = ax.bar(x - width, accuracy, width, label='Accuracy', color='#3498db') rects1 = ax.bar(x - width, accuracy, width, label='Accuracy', color='#3498db')
rects2 = ax.bar(x, macro_f1, width, label='Macro Avg F1-Score', color='#2ecc71') rects2 = ax.bar(x, macro_f1, width, label='Macro Avg F1-Score', color='#2ecc71')
rects3 = ax.bar(x + width, recall_netral, width, label='Recall Netral', color='#e74c3c') rects3 = ax.bar(x + width, recall_netral, width, label='Recall Netral', color='#e74c3c')
# Menambahkan teks dan label
ax.set_ylabel('Scores') ax.set_ylabel('Scores')
ax.set_title('Perbandingan Performa Model XGBoost antar Skenario') ax.set_title('Perbandingan Performa Model XGBoost antar Skenario')
ax.set_xticks(x) ax.set_xticks(x)
@ -25,7 +22,6 @@ ax.set_xticklabels(scenarios)
ax.legend(loc='lower right') ax.legend(loc='lower right')
ax.set_ylim(0, 1.0) ax.set_ylim(0, 1.0)
# Menambahkan label angka di atas bar
def autolabel(rects): def autolabel(rects):
for rect in rects: for rect in rects:
height = rect.get_height() height = rect.get_height()