chore: cleaning the code
This commit is contained in:
parent
817924bd8c
commit
3613b1a120
2
run.py
2
run.py
|
|
@ -3,10 +3,8 @@ import sys
|
|||
import uvicorn
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Paksa penggunaan SelectorEventLoop di level paling dasar OS Windows
|
||||
if sys.platform == 'win32':
|
||||
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
|
||||
print("✅ Mesin Selector Loop Aktif (Anti-NotImplementedError)")
|
||||
|
||||
# Jalankan uvicorn dari sini, bukan dari terminal langsung
|
||||
uvicorn.run("main:app", host="127.0.0.1", port=8000, reload=True)
|
||||
|
|
@ -11,7 +11,6 @@ def clean_product_name(name: str) -> str:
|
|||
return name.strip()
|
||||
|
||||
async def process_product_reviews(candidate: ProductCandidate, user_email: str, metric_id: int, brand_id: int, request: Request):
|
||||
# 1. SETUP ASPEK (Initialize score 0 untuk setiap kategori)
|
||||
aspect_stats = {
|
||||
aspect: {"positive": 0, "total": 0}
|
||||
for aspect in config.ASPECT_KEYWORDS.keys()
|
||||
|
|
@ -19,7 +18,6 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str,
|
|||
|
||||
print(f"🔍 Memulai Analisis ABSA: {candidate.name[:30]}...")
|
||||
|
||||
# 2. DATABASE PRE-CHECK (Model & User)
|
||||
model_db = await prisma.model.find_first(where={"modelName": "Model XGBoost (Baseline)"})
|
||||
if not model_db:
|
||||
print("❌ ERROR: Model XGBoost tidak ditemukan di database!")
|
||||
|
|
@ -30,7 +28,6 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str,
|
|||
print(f"⚠️ User {user_email} tidak ditemukan!")
|
||||
return None
|
||||
|
||||
# 3. PRODUCT PERSISTENCE
|
||||
brand_name = clean_product_name(candidate.name.split()[0]) if candidate.name.strip() else "Unknown"
|
||||
product_name = clean_product_name(candidate.name)
|
||||
|
||||
|
|
@ -49,7 +46,6 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str,
|
|||
}
|
||||
)
|
||||
|
||||
# 4. NLP PREDICTION & ASPECT TAGGING LOOP
|
||||
total_reviews = len(candidate.reviews)
|
||||
if total_reviews == 0: return None
|
||||
|
||||
|
|
@ -69,7 +65,6 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str,
|
|||
pred_idx = ml_core.model_optimized.predict(vec)[0]
|
||||
label = ml_core.label_encoder.inverse_transform([pred_idx])[0].lower()
|
||||
|
||||
# Confidence Score dari XGBoost
|
||||
try:
|
||||
prob = ml_core.model_optimized.predict_proba(vec)[0]
|
||||
confidence_score = float(max(prob))
|
||||
|
|
@ -107,13 +102,11 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str,
|
|||
"userId": user_db.id
|
||||
})
|
||||
|
||||
# 5. DATABASE SYNC (Batch Operations)
|
||||
if reviews_data_to_save:
|
||||
async with prisma.tx() as transaction:
|
||||
await transaction.review.delete_many(where={"productId": product_db.productId})
|
||||
await transaction.review.create_many(data=reviews_data_to_save)
|
||||
|
||||
# 6. CALCULATION & VERDICT GENERATION
|
||||
final_aspect_scores = {}
|
||||
for aspect, stat in aspect_stats.items():
|
||||
score = (stat["positive"] / stat["total"] * 100) if stat["total"] > 0 else 0
|
||||
|
|
@ -140,14 +133,12 @@ async def process_product_reviews(candidate: ProductCandidate, user_email: str,
|
|||
else:
|
||||
verdict_label = "Kurang Disarankan"
|
||||
|
||||
# 1. Buat Analysis terlebih dahulu untuk mendapatkan ID-nya
|
||||
new_analysis = await prisma.analysis.create(
|
||||
data={
|
||||
"userId": user_db.id,
|
||||
}
|
||||
)
|
||||
|
||||
# 2. Buat Metric dan hubungkan ke Analysis yang baru saja dibuat
|
||||
await prisma.metric.create(
|
||||
data={
|
||||
"generalSentiment": general_sentiment_pct,
|
||||
|
|
|
|||
|
|
@ -5,9 +5,6 @@ from pathlib import Path
|
|||
from xgboost import XGBClassifier
|
||||
from sklearn.metrics import classification_report, confusion_matrix
|
||||
|
||||
# ==========================================
|
||||
# 1. KONFIGURASI PATH (PATHLIB)
|
||||
# ==========================================
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
PROJECT_ROOT = SCRIPT_DIR.parents[1]
|
||||
DATA_DIR = PROJECT_ROOT / "robust_data"
|
||||
|
|
@ -20,9 +17,6 @@ PATHS = {
|
|||
"le": DATA_DIR / "tokenize" / "label_encoder.pkl",
|
||||
}
|
||||
|
||||
# ==========================================
|
||||
# 2. LOAD DATA
|
||||
# ==========================================
|
||||
print("\n--- MEMUAT DATA BASELINE ---")
|
||||
data = {}
|
||||
|
||||
|
|
@ -47,11 +41,6 @@ le = data['le']
|
|||
|
||||
print(f"\nDimensi Training (Imbalanced): {X_train.shape}")
|
||||
|
||||
# ==========================================
|
||||
# 3. SETUP MODEL BASELINE
|
||||
# ==========================================
|
||||
# Tanpa Grid Search, menggunakan settingan default XGBoost
|
||||
# Default XGBoost biasanya: learning_rate=0.3, max_depth=6, n_estimators=100
|
||||
model_baseline = XGBClassifier(
|
||||
objective='multi:softprob',
|
||||
num_class=3,
|
||||
|
|
@ -60,9 +49,6 @@ model_baseline = XGBClassifier(
|
|||
use_label_encoder=False
|
||||
)
|
||||
|
||||
# ==========================================
|
||||
# 4. EKSEKUSI TRAINING
|
||||
# ==========================================
|
||||
print("\n🔥 MULAI TRAINING BASELINE (SCENARIO 1)...")
|
||||
start_time = time.time()
|
||||
|
||||
|
|
@ -71,31 +57,21 @@ model_baseline.fit(X_train, y_train)
|
|||
duration = time.time() - start_time
|
||||
print(f"\n✅ SELESAI! Waktu proses: {duration:.2f} detik")
|
||||
|
||||
# ==========================================
|
||||
# 5. MENAMPILKAN PARAMETER DEFAULT (BARU)
|
||||
# ==========================================
|
||||
# Karena tidak pakai GridSearch, kita ambil parameter langsung dari modelnya
|
||||
print("\n" + "="*40)
|
||||
print("PARAMETER YANG DIGUNAKAN (DEFAULT)")
|
||||
print("="*40)
|
||||
|
||||
# Mengambil seluruh parameter model
|
||||
all_params = model_baseline.get_params()
|
||||
|
||||
# Kita filter hanya parameter penting untuk dibandingkan dengan Skenario 2 & 3
|
||||
key_params = ['learning_rate', 'max_depth', 'n_estimators', 'subsample', 'colsample_bytree']
|
||||
shown_params = {k: all_params.get(k) for k in key_params}
|
||||
|
||||
# Jika n_estimators atau learning_rate None (karena default library), kita set nilai standarnya manual untuk info
|
||||
if shown_params['n_estimators'] is None: shown_params['n_estimators'] = "100 (Default)"
|
||||
if shown_params['learning_rate'] is None: shown_params['learning_rate'] = "Default"
|
||||
|
||||
print(shown_params)
|
||||
print("(Gunakan nilai ini untuk perbandingan di Bab 4)")
|
||||
|
||||
# ==========================================
|
||||
# 6. EVALUASI & SIMPAN
|
||||
# ==========================================
|
||||
print("\n" + "="*40)
|
||||
print("HASIL SKENARIO 1 (BASELINE)")
|
||||
print("="*40)
|
||||
|
|
@ -111,7 +87,6 @@ print(classification_report(y_test_label, y_pred_label))
|
|||
print("\nConfusion Matrix:")
|
||||
print(confusion_matrix(y_test_label, y_pred_label))
|
||||
|
||||
# Simpan Model Baseline
|
||||
model_path = SCRIPT_DIR / 'new_xgboost_scenario1.pkl'
|
||||
joblib.dump(model_baseline, model_path)
|
||||
print(f"\n💾 Model baseline disimpan ke: {model_path}")
|
||||
|
|
@ -6,9 +6,6 @@ from xgboost import XGBClassifier
|
|||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.metrics import classification_report, confusion_matrix
|
||||
|
||||
# ==========================================
|
||||
# 1. KONFIGURASI PATH (PATHLIB)
|
||||
# ==========================================
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
PROJECT_ROOT = SCRIPT_DIR.parents[1]
|
||||
DATA_DIR = PROJECT_ROOT / "robust_data"
|
||||
|
|
@ -21,9 +18,6 @@ PATHS = {
|
|||
"le": DATA_DIR / "tokenize" / "label_encoder.pkl",
|
||||
}
|
||||
|
||||
# ==========================================
|
||||
# 2. LOAD DATA
|
||||
# ==========================================
|
||||
print("\n--- MEMUAT DATA SCENARIO 2 ---")
|
||||
data = {}
|
||||
|
||||
|
|
@ -46,11 +40,6 @@ le = data['le']
|
|||
|
||||
print(f"\nDimensi Training (Imbalanced): {X_train.shape}")
|
||||
|
||||
# ==========================================
|
||||
# 3. SETUP GRID SEARCH (SAMA DENGAN SKENARIO 3)
|
||||
# ==========================================
|
||||
# Kita gunakan range parameter yang SAMA PERSIS dengan Skenario 3
|
||||
# agar perbandingannya adil (apple-to-apple).
|
||||
param_grid = {
|
||||
'learning_rate': [0.01, 0.1, 0.2],
|
||||
'max_depth': [3, 5, 7],
|
||||
|
|
@ -67,7 +56,6 @@ xgb = XGBClassifier(
|
|||
use_label_encoder=False
|
||||
)
|
||||
|
||||
# Gunakan F1-Macro agar Grid Search mencoba adil ke kelas minoritas
|
||||
grid_search = GridSearchCV(
|
||||
estimator=xgb,
|
||||
param_grid=param_grid,
|
||||
|
|
@ -77,9 +65,6 @@ grid_search = GridSearchCV(
|
|||
verbose=1
|
||||
)
|
||||
|
||||
# ==========================================
|
||||
# 4. EKSEKUSI TRAINING
|
||||
# ==========================================
|
||||
print("\n🔥 MULAI TRAINING & GRID SEARCH (SCENARIO 2)...")
|
||||
print("Sedang mencari parameter terbaik untuk data Imbalanced...")
|
||||
start_time = time.time()
|
||||
|
|
@ -89,9 +74,6 @@ grid_search.fit(X_train, y_train)
|
|||
duration = time.time() - start_time
|
||||
print(f"\n✅ SELESAI! Waktu proses: {duration/60:.2f} menit")
|
||||
|
||||
# ==========================================
|
||||
# 5. EVALUASI & SIMPAN
|
||||
# ==========================================
|
||||
best_model = grid_search.best_estimator_
|
||||
|
||||
print("\n" + "="*40)
|
||||
|
|
@ -114,7 +96,6 @@ print(classification_report(y_test_label, y_pred_label))
|
|||
print("\nConfusion Matrix:")
|
||||
print(confusion_matrix(y_test_label, y_pred_label))
|
||||
|
||||
# Simpan Model Skenario 2
|
||||
model_path = SCRIPT_DIR / 'new_model_xgboost_scenario2.pkl'
|
||||
joblib.dump(best_model, model_path)
|
||||
print(f"\n💾 Model Skenario 2 disimpan ke: {model_path}")
|
||||
|
|
@ -11,9 +11,6 @@ from sklearn.metrics import classification_report, confusion_matrix
|
|||
from imblearn.pipeline import Pipeline as ImbPipeline
|
||||
from imblearn.over_sampling import SMOTE
|
||||
|
||||
# ==========================================
|
||||
# KONFIGURASI PATH
|
||||
# ==========================================
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
PROJECT_ROOT = SCRIPT_DIR.parents[1]
|
||||
DATA_DIR = PROJECT_ROOT / "robust_data"
|
||||
|
|
@ -28,7 +25,6 @@ PATHS = {
|
|||
|
||||
print("--- MENYIAPKAN TRAINING SCENARIO 3 (PIPELINE: SMOTE + CHI2 + XGBOOST) ---")
|
||||
|
||||
# Load Data
|
||||
data = {}
|
||||
for name, path in PATHS.items():
|
||||
if not path.exists():
|
||||
|
|
@ -41,9 +37,6 @@ X_train, y_train = data["X_train"], data["y_train"]
|
|||
X_test, y_test = data["X_test"], data["y_test"]
|
||||
le = data["le"]
|
||||
|
||||
# ==========================================
|
||||
# REPORT PROPORSI DATA (SEBELUM & SESUDAH SMOTE)
|
||||
# ==========================================
|
||||
print("\n" + "="*40)
|
||||
print("REPORT PROPORSI DATA")
|
||||
print("="*40)
|
||||
|
|
@ -57,16 +50,12 @@ def print_proportion(y, title):
|
|||
|
||||
print_proportion(y_train, "PROPORSI DATA AWAL (TRAIN)")
|
||||
|
||||
# Simulasi SMOTE untuk melihat hasil akhir yang akan diproses Pipeline
|
||||
sm_sim = SMOTE(random_state=42)
|
||||
_, y_resampled_sim = sm_sim.fit_resample(X_train, y_train)
|
||||
print_proportion(y_resampled_sim, "ESTIMASI PROPORSI SETELAH SMOTE (DALAM PIPELINE)")
|
||||
|
||||
print("\n" + "="*40)
|
||||
|
||||
# ==========================================
|
||||
# DEFINISI PIPELINE
|
||||
# ==========================================
|
||||
pipeline = ImbPipeline([
|
||||
('smote', SMOTE(random_state=42)),
|
||||
('selector', SelectKBest(score_func=chi2, k=2000)),
|
||||
|
|
@ -79,9 +68,6 @@ pipeline = ImbPipeline([
|
|||
))
|
||||
])
|
||||
|
||||
# ==========================================
|
||||
# SETTING GRID SEARCH
|
||||
# ==========================================
|
||||
# param_grid = {
|
||||
# 'clf__learning_rate': [0.1, 0.2],
|
||||
# 'clf__max_depth': [5, 7],
|
||||
|
|
@ -106,9 +92,6 @@ grid_search = GridSearchCV(
|
|||
verbose=2
|
||||
)
|
||||
|
||||
# ==========================================
|
||||
# EKSEKUSI TRAINING
|
||||
# ==========================================
|
||||
print(f"\n🔥 MULAI TRAINING... (Dimensi Awal: {X_train.shape})")
|
||||
start_time = time.time()
|
||||
|
||||
|
|
@ -117,9 +100,6 @@ grid_search.fit(X_train, y_train)
|
|||
duration = time.time() - start_time
|
||||
print(f"\n✅ SELESAI! Waktu proses: {duration/60:.2f} menit")
|
||||
|
||||
# ==========================================
|
||||
# EVALUASI
|
||||
# ==========================================
|
||||
best_model = grid_search.best_estimator_
|
||||
|
||||
print("\n" + "="*40)
|
||||
|
|
@ -129,7 +109,6 @@ print(grid_search.best_params_)
|
|||
|
||||
y_pred = best_model.predict(X_test)
|
||||
|
||||
# Inverse Transform Label
|
||||
y_test_label = le.inverse_transform(y_test)
|
||||
y_pred_label = le.inverse_transform(y_pred)
|
||||
|
||||
|
|
@ -139,9 +118,6 @@ print(classification_report(y_test_label, y_pred_label))
|
|||
print("\nConfusion Matrix:")
|
||||
print(confusion_matrix(y_test_label, y_pred_label))
|
||||
|
||||
# ==========================================
|
||||
# SIMPAN MODEL
|
||||
# ==========================================
|
||||
MODEL_DIR = PROJECT_ROOT / "models"
|
||||
MODEL_DIR.mkdir(exist_ok=True)
|
||||
model_path = MODEL_DIR / "xgboost_scenario3.pkl"
|
||||
|
|
|
|||
|
|
@ -2,82 +2,58 @@ import joblib
|
|||
import os
|
||||
from sklearn.feature_selection import SelectKBest, chi2
|
||||
|
||||
# ==========================================
|
||||
# KONFIGURASI
|
||||
# ==========================================
|
||||
base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
|
||||
# Input (Kita butuh Data Train hasil SMOTE dan Data Test asli)
|
||||
input_X_train = 'new_X_train_smote.pkl'
|
||||
input_y_train = 'new_y_train_smote.pkl'
|
||||
input_X_test = 'X_test_tfidf.pkl' # Test set asli (belum diapa-apakan selain TFIDF)
|
||||
input_X_test = 'X_test_tfidf.pkl'
|
||||
|
||||
# Output
|
||||
# output_X_train = 'data/chi2/X_train_chi2.pkl'
|
||||
# output_X_test = 'data/chi2/X_test_chi2.pkl'
|
||||
# output_selector = 'data/chi2/chisquare_selector.pkl' # Simpan logikanya
|
||||
output_X_train = 'X_train_chi2.pkl'
|
||||
output_X_test = 'X_test_chi2.pkl'
|
||||
output_selector = 'chisquare_selector.pkl' # Simpan logikanya
|
||||
output_selector = 'chisquare_selector.pkl'
|
||||
|
||||
# JUMLAH FITUR YANG INGIN DIAMBIL (Parameter K)
|
||||
# Silakan ubah angka ini. 1000 adalah angka start yang bagus untuk Skripsi S1.
|
||||
# Jika fitur awal Anda < 1000, ubah jadi 'all' atau angka lebih kecil (misal 500).
|
||||
K_FEATURES = 1000
|
||||
|
||||
print("--- MEMULAI FEATURE SELECTION (CHI-SQUARE) ---")
|
||||
|
||||
try:
|
||||
# 1. Load Data
|
||||
print("1. Memuat data...")
|
||||
# Load Train (SMOTE)
|
||||
X_train = joblib.load(os.path.join(base_dir, input_X_train))
|
||||
y_train = joblib.load(os.path.join(base_dir, input_y_train))
|
||||
|
||||
# Load Test (TF-IDF Asli)
|
||||
# Kita butuh ini agar dimensi Test sama dengan Train nanti
|
||||
X_test = joblib.load(os.path.join(base_dir, input_X_test))
|
||||
|
||||
print(f" - Dimensi Awal Train: {X_train.shape}")
|
||||
print(f" - Dimensi Awal Test: {X_test.shape}")
|
||||
|
||||
# Cek jumlah fitur total
|
||||
total_features = X_train.shape[1]
|
||||
print(f" - Total kata/fitur saat ini: {total_features}")
|
||||
|
||||
# Validasi K
|
||||
if isinstance(K_FEATURES, int) and K_FEATURES > total_features:
|
||||
print(f" ⚠️ WARNING: Target k={K_FEATURES} lebih besar dari total fitur ({total_features}). Mengambil semua fitur.")
|
||||
k_final = 'all'
|
||||
else:
|
||||
k_final = K_FEATURES
|
||||
|
||||
# 2. Proses Chi-Square
|
||||
print(f"\n2. Menjalankan Chi-Square (Mengambil Top {k_final} Fitur)...")
|
||||
|
||||
# Inisialisasi SelectKBest dengan skor func chi2
|
||||
selector = SelectKBest(score_func=chi2, k=k_final)
|
||||
|
||||
# FIT hanya pada Data Train! (Pelajari mana kata penting dari data latih)
|
||||
selector.fit(X_train, y_train)
|
||||
|
||||
# TRANSFORM pada Train DAN Test
|
||||
X_train_selected = selector.transform(X_train)
|
||||
X_test_selected = selector.transform(X_test)
|
||||
|
||||
# 3. Validasi Hasil
|
||||
print("\n3. Hasil Seleksi:")
|
||||
print(f" - Dimensi Train Baru: {X_train_selected.shape}")
|
||||
print(f" - Dimensi Test Baru: {X_test_selected.shape}")
|
||||
|
||||
# Menampilkan beberapa skor fitur (opsional, untuk info saja)
|
||||
print(" - Proses seleksi selesai. Dimensi kolom (fitur) telah berkurang.")
|
||||
|
||||
# 4. Simpan Data
|
||||
print("\n4. Menyimpan hasil...")
|
||||
joblib.dump(X_train_selected, output_X_train)
|
||||
joblib.dump(X_test_selected, output_X_test)
|
||||
joblib.dump(selector, output_selector) # Penting untuk prediksi data baru nanti
|
||||
joblib.dump(selector, output_selector)
|
||||
|
||||
print("="*40)
|
||||
print(f"SUKSES! Data siap untuk Training XGBoost.")
|
||||
|
|
|
|||
|
|
@ -4,58 +4,44 @@ from imblearn.over_sampling import SMOTE
|
|||
from collections import Counter
|
||||
import os
|
||||
|
||||
# ==========================================
|
||||
# KONFIGURASI
|
||||
# ==========================================
|
||||
# Gunakan relative path agar aman (sama seperti sebelumnya)
|
||||
base_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
# Input files (hasil dari TF-IDF sebelumnya)
|
||||
input_X = 'X_train_tfidf.pkl'
|
||||
input_y = 'y_train.pkl'
|
||||
|
||||
# Output files (hasil SMOTE)
|
||||
output_X = 'new_X_train_smote.pkl'
|
||||
output_y = 'new_y_train_smote.pkl'
|
||||
|
||||
print("--- MEMULAI PROSES SMOTE (Skenario 3) ---")
|
||||
|
||||
try:
|
||||
# 1. Load Data TF-IDF (Data Latih Saja)
|
||||
print("1. Memuat data latih TF-IDF...")
|
||||
# Cek apakah file ada di folder yang sama atau perlu path khusus
|
||||
if os.path.exists(os.path.join(base_dir, input_X)):
|
||||
X_train = joblib.load(os.path.join(base_dir, input_X))
|
||||
y_train = joblib.load(os.path.join(base_dir, input_y))
|
||||
else:
|
||||
# Fallback jika file ada di current directory
|
||||
X_train = joblib.load(input_X)
|
||||
y_train = joblib.load(input_y)
|
||||
|
||||
print(f" - Dimensi Awal: {X_train.shape}")
|
||||
print(f" - Distribusi Kelas Awal: {Counter(y_train)}")
|
||||
# Contoh output: {0: 1964, 1: 485, 2: 303} (tergantung mapping label encoder)
|
||||
|
||||
# 2. Eksekusi SMOTE
|
||||
print("\n2. Menjalankan SMOTE (Synthetic Minority Over-sampling)...")
|
||||
print(" (Sedang membuat data sintetis untuk kelas minoritas...)")
|
||||
|
||||
smote = SMOTE(random_state=42)
|
||||
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
|
||||
|
||||
# 3. Validasi Hasil
|
||||
print("\n3. Validasi Hasil SMOTE:")
|
||||
print(f" - Dimensi Setelah SMOTE: {X_train_resampled.shape}")
|
||||
print(f" - Distribusi Kelas Baru: {Counter(y_train_resampled)}")
|
||||
|
||||
# Pastikan semua kelas jumlahnya sama
|
||||
counts = list(Counter(y_train_resampled).values())
|
||||
if len(set(counts)) == 1:
|
||||
print(" ✅ SUCCESS: Dataset sekarang SEIMBANG!")
|
||||
else:
|
||||
print(" ⚠️ WARNING: Dataset belum seimbang sempurna.")
|
||||
|
||||
# 4. Simpan Data SMOTE
|
||||
print("\n4. Menyimpan data hasil SMOTE...")
|
||||
joblib.dump(X_train_resampled, output_X)
|
||||
joblib.dump(y_train_resampled, output_y)
|
||||
|
|
|
|||
|
|
@ -14,16 +14,11 @@ from nltk.corpus import stopwords
|
|||
from nltk.tokenize import word_tokenize
|
||||
from nltk.stem import WordNetLemmatizer
|
||||
import os
|
||||
# Download NLTK resources (Cukup sekali run)
|
||||
# nltk.download('punkt')
|
||||
# nltk.download('stopwords')
|
||||
# nltk.download('wordnet')
|
||||
|
||||
|
||||
class ReviewScraper:
|
||||
def __init__(self):
|
||||
options = Options()
|
||||
# options.add_argument("--headless")
|
||||
options.add_argument("--start-maximized")
|
||||
options.add_argument("--disable-blink-features=AutomationControlled")
|
||||
options.add_experimental_option(
|
||||
|
|
@ -43,7 +38,6 @@ class ReviewScraper:
|
|||
|
||||
def get_review_data(self, container, source_url) -> dict:
|
||||
try:
|
||||
# 1. Username
|
||||
username = "Anonymous"
|
||||
user_elem = container.find(
|
||||
'span', attrs={'data-testid': 'proName'})
|
||||
|
|
@ -53,7 +47,6 @@ class ReviewScraper:
|
|||
if user_elem:
|
||||
username = user_elem.text
|
||||
|
||||
# 2. Rating (Ambil dari aria-label bintang)
|
||||
rating = "5"
|
||||
rating_elem = container.find(
|
||||
'div', attrs={'data-testid': 'icnStarRating'})
|
||||
|
|
@ -63,7 +56,6 @@ class ReviewScraper:
|
|||
except:
|
||||
pass
|
||||
|
||||
# 3. Ulasan Text
|
||||
ulasan = ""
|
||||
ulasan_elem = container.find(
|
||||
'span', attrs={'data-testid': 'lblItemUlasan'})
|
||||
|
|
@ -72,7 +64,6 @@ class ReviewScraper:
|
|||
if ulasan_elem:
|
||||
ulasan = ulasan_elem.text
|
||||
|
||||
# Fallback jika ulasan kosong
|
||||
if not ulasan:
|
||||
paragraphs = container.find_all('p')
|
||||
for p in paragraphs:
|
||||
|
|
@ -80,7 +71,6 @@ class ReviewScraper:
|
|||
ulasan = p.text
|
||||
break
|
||||
|
||||
# 4. Tanggal
|
||||
waktu_komentar = "Unknown"
|
||||
date_elem = container.find(
|
||||
'p', class_=re.compile(r'timestamp|date', re.I))
|
||||
|
|
@ -92,7 +82,6 @@ class ReviewScraper:
|
|||
waktu_komentar = span.text
|
||||
break
|
||||
|
||||
# VALIDASI: Jangan simpan jika kosong
|
||||
if not ulasan:
|
||||
return None
|
||||
|
||||
|
|
@ -118,13 +107,10 @@ class ReviewScraper:
|
|||
"""
|
||||
print(f" ...Mencoba {action} filter Bintang {rating}...")
|
||||
|
||||
# Scroll agar elemen masuk viewport
|
||||
self.driver.execute_script("window.scrollBy(0, 400);")
|
||||
time.sleep(1)
|
||||
|
||||
# STRATEGI XPATH
|
||||
strategies = [
|
||||
# Spesifik Tokped baru
|
||||
f"//label[contains(@for, 'rating') and .//text()='{rating}']",
|
||||
f"//label[.//text()='{rating}' and .//*[name()='img' or name()='svg']]",
|
||||
f"//*[text()='Rating']/ancestor::div[2]//label[contains(., '{rating}')]",
|
||||
|
|
@ -134,38 +120,31 @@ class ReviewScraper:
|
|||
for attempt in range(max_retries):
|
||||
found_element = None
|
||||
|
||||
# Coba cari elemen dengan salah satu strategi
|
||||
for xpath in strategies:
|
||||
try:
|
||||
# Timeout dipendekkan ke 2 detik agar cepat skip jika tidak ada
|
||||
found_element = WebDriverWait(self.driver, 2).until(
|
||||
EC.presence_of_element_located((By.XPATH, xpath))
|
||||
)
|
||||
|
||||
# Cek apakah visible & clickable
|
||||
if found_element.is_displayed():
|
||||
# Cek apakah disabled (kelas CSS atau atribut)
|
||||
if "disabled" in found_element.get_attribute("class") or found_element.get_attribute("disabled"):
|
||||
print(
|
||||
f" [SKIP] Filter Bintang {rating} ada tapi DISABLED (Non-aktif).")
|
||||
return False
|
||||
|
||||
# Jika elemen ketemu, siap diklik
|
||||
break
|
||||
else:
|
||||
found_element = None # Ketemu di DOM tapi hidden
|
||||
found_element = None
|
||||
except TimeoutException:
|
||||
continue # Coba strategi xpath berikutnya
|
||||
continue
|
||||
|
||||
# HASIL PENCARIAN
|
||||
if found_element:
|
||||
try:
|
||||
# KLIK!
|
||||
self.driver.execute_script(
|
||||
"arguments[0].click();", found_element)
|
||||
print(
|
||||
f" [SUKSES] Filter Bintang {rating} berhasil di-{action}!")
|
||||
time.sleep(3) # Tunggu loading data
|
||||
time.sleep(3)
|
||||
return True
|
||||
except Exception as click_error:
|
||||
if attempt < max_retries - 1:
|
||||
|
|
@ -178,8 +157,6 @@ class ReviewScraper:
|
|||
f" [ERROR] Gagal klik filter setelah retry: {click_error}")
|
||||
return False
|
||||
else:
|
||||
# PENTING: Jika di attempt pertama tidak ketemu di semua strategi,
|
||||
# asumsikan filter TIDAK ADA. Jangan retry.
|
||||
print(
|
||||
f" [SKIP] Filter Bintang {rating} TIDAK DITEMUKAN (Mungkin 0 ulasan). Lanjut.")
|
||||
return False
|
||||
|
|
@ -199,7 +176,6 @@ class ReviewScraper:
|
|||
containers = soup.find_all("article")
|
||||
|
||||
if not containers:
|
||||
# Double check: kadang loading lambat
|
||||
time.sleep(2)
|
||||
soup = BeautifulSoup(self.driver.page_source, "html.parser")
|
||||
containers = soup.find_all(
|
||||
|
|
@ -214,7 +190,6 @@ class ReviewScraper:
|
|||
for container in containers:
|
||||
review_data = self.get_review_data(container, url)
|
||||
if review_data:
|
||||
# Validasi Rating sesuai Filter
|
||||
if current_rating_context != "ALL" and review_data['Rating'] != current_rating_context:
|
||||
continue
|
||||
|
||||
|
|
@ -229,12 +204,11 @@ class ReviewScraper:
|
|||
else:
|
||||
print(f" . Halaman {page_number} tidak ada data baru.")
|
||||
empty_page_count += 1
|
||||
if empty_page_count >= 2: # Stop jika 2 halaman berturut-turut zonk
|
||||
if empty_page_count >= 2:
|
||||
print(
|
||||
" [STOP] 2 halaman tanpa data baru. Pindah filter.")
|
||||
break
|
||||
|
||||
# Navigasi Next Button
|
||||
try:
|
||||
next_button = self.driver.find_element(
|
||||
By.CSS_SELECTOR, "button[aria-label^='Laman berikutnya']")
|
||||
|
|
@ -256,35 +230,27 @@ class ReviewScraper:
|
|||
self.driver.execute_script("window.scrollBy(0, 800);")
|
||||
time.sleep(2)
|
||||
|
||||
# TARGET: Negatif (1,2) & Netral (3)
|
||||
target_filters = ['1', '2', '3']
|
||||
|
||||
for rating in target_filters:
|
||||
# 1. KLIK FILTER
|
||||
success = self.toggle_filter(rating, action="CHECK")
|
||||
|
||||
if success:
|
||||
# 2. SCRAPE
|
||||
self.scrape_pages_current_view(
|
||||
url, current_rating_context=rating)
|
||||
|
||||
# 3. UNCHECK (PENTING: Gunakan logic toggle yang sama)
|
||||
# Scroll dikit ke atas biar tombol filter kelihatan lagi
|
||||
self.driver.execute_script("window.scrollBy(0, -300);")
|
||||
time.sleep(1)
|
||||
|
||||
uncheck_success = self.toggle_filter(
|
||||
rating, action="UNCHECK")
|
||||
if not uncheck_success:
|
||||
# Jika gagal uncheck, refresh page adalah jalan ninja
|
||||
print(
|
||||
" [REFRESH] Gagal uncheck, refresh halaman untuk reset filter...")
|
||||
self.driver.refresh()
|
||||
time.sleep(4)
|
||||
self.driver.execute_script("window.scrollBy(0, 800);")
|
||||
else:
|
||||
# Jika toggle CHECK gagal/tidak ketemu -> LANJUT ke rating berikutnya
|
||||
# Tidak perlu scrape, tidak perlu uncheck
|
||||
continue
|
||||
|
||||
time.sleep(1)
|
||||
|
|
@ -306,23 +272,18 @@ class ReviewScraper:
|
|||
self.driver.quit()
|
||||
|
||||
if self.data:
|
||||
# 1. Siapkan Data Baru
|
||||
df_new = pd.DataFrame(self.data)
|
||||
df_new = self.label_data(df_new)
|
||||
|
||||
filename = 'new_dataset_fix_balanced.csv'
|
||||
|
||||
# 2. Cek apakah file sudah ada (Smart Merge Logic)
|
||||
if os.path.exists(filename):
|
||||
try:
|
||||
print(f"\n[INFO] File '{filename}' ditemukan. Membaca data lama...")
|
||||
df_old = pd.read_csv(filename)
|
||||
|
||||
# Gabungkan data lama dan baru
|
||||
df_combined = pd.concat([df_old, df_new], ignore_index=True)
|
||||
|
||||
# 3. Hapus Duplikat
|
||||
# Kita anggap duplikat jika Username, Review (yang sudah dibersihkan), dan Tanggal sama persis
|
||||
total_before = len(df_combined)
|
||||
df_combined.drop_duplicates(subset=['Username', 'Cleaned_Review', 'Date'], keep='first', inplace=True)
|
||||
total_after = len(df_combined)
|
||||
|
|
@ -339,7 +300,6 @@ class ReviewScraper:
|
|||
print(f"\n[INFO] File '{filename}' belum ada. Membuat file baru.")
|
||||
df_final = df_new
|
||||
|
||||
# 4. Simpan Hasil Akhir
|
||||
print("\n=== TOTAL DATASET SETELAH UPDATE ===")
|
||||
print(df_final['Sentiment'].value_counts())
|
||||
|
||||
|
|
|
|||
|
|
@ -1,24 +1,17 @@
|
|||
import pandas as pd
|
||||
|
||||
# 1. Load data
|
||||
df = pd.read_csv('new_final_dataset.csv')
|
||||
|
||||
# 2. Pisahkan tiap kelas
|
||||
df_pos = df[df['Sentiment'] == 'positif']
|
||||
df_neg = df[df['Sentiment'] == 'negatif']
|
||||
df_net = df[df['Sentiment'] == 'netral']
|
||||
|
||||
# 3. Hitung target (Jumlah Negatif + Netral)
|
||||
target_count = len(df_neg) + len(df_net) # Hasilnya 1622
|
||||
target_count = len(df_neg) + len(df_net)
|
||||
|
||||
# 4. Ambil sampel acak dari kelas positif sebanyak target_count
|
||||
df_pos_trimmed = df_pos.sample(n=target_count, random_state=42)
|
||||
|
||||
# 5. Gabungkan kembali semua data
|
||||
df_final = pd.concat([df_pos_trimmed, df_neg, df_net])
|
||||
|
||||
# 6. Acak urutan data agar tidak mengumpul
|
||||
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)
|
||||
|
||||
# Simpan hasil
|
||||
df_final.to_csv('trimmed_sentiment_dataset.csv', index=False)
|
||||
|
|
@ -3,29 +3,22 @@ import seaborn as sns
|
|||
import numpy as np
|
||||
from sklearn.metrics import confusion_matrix
|
||||
|
||||
# Data Confusion Matrix dari Skenario 3 (Pipeline + SMOTE)
|
||||
# Baris: Aktual, Kolom: Prediksi
|
||||
data_cm = np.array([
|
||||
[146, 34, 19], # Aktual Negatif
|
||||
[60, 36, 28], # Aktual Netral
|
||||
[29, 16, 280] # Aktual Positif
|
||||
[146, 34, 19],
|
||||
[60, 36, 28],
|
||||
[29, 16, 280]
|
||||
])
|
||||
|
||||
# Label kategori
|
||||
labels = ['Negatif', 'Netral', 'Positif']
|
||||
|
||||
# Membuat plot
|
||||
plt.figure(figsize=(8, 6))
|
||||
sns.set(font_scale=1.2) # Mengatur ukuran font
|
||||
sns.set(font_scale=1.2)
|
||||
|
||||
# Membuat heatmap
|
||||
ax = sns.heatmap(data_cm, annot=True, fmt='d', cmap='Blues',
|
||||
xticklabels=labels, yticklabels=labels)
|
||||
|
||||
# Menambahkan label dan judul
|
||||
plt.xlabel('Prediksi', fontsize=14, labelpad=15)
|
||||
plt.ylabel('Aktual', fontsize=14, labelpad=15)
|
||||
plt.title('Confusion Matrix Skenario 1 (Baseline)', fontsize=16, pad=20)
|
||||
|
||||
# Menampilkan plot
|
||||
plt.show()
|
||||
|
|
@ -1,7 +1,6 @@
|
|||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
# Data dari hasil eksperimen Anda
|
||||
scenarios = ['Skenario 1\n(Baseline)', 'Skenario 2\n(Tuned)', 'Skenario 3\n(Full Optimized)']
|
||||
accuracy = [0.78, 0.79, 0.77]
|
||||
macro_f1 = [0.65, 0.66, 0.66]
|
||||
|
|
@ -12,12 +11,10 @@ width = 0.25
|
|||
|
||||
fig, ax = plt.subplots(figsize=(10, 6))
|
||||
|
||||
# Membuat bar chart
|
||||
rects1 = ax.bar(x - width, accuracy, width, label='Accuracy', color='#3498db')
|
||||
rects2 = ax.bar(x, macro_f1, width, label='Macro Avg F1-Score', color='#2ecc71')
|
||||
rects3 = ax.bar(x + width, recall_netral, width, label='Recall Netral', color='#e74c3c')
|
||||
|
||||
# Menambahkan teks dan label
|
||||
ax.set_ylabel('Scores')
|
||||
ax.set_title('Perbandingan Performa Model XGBoost antar Skenario')
|
||||
ax.set_xticks(x)
|
||||
|
|
@ -25,7 +22,6 @@ ax.set_xticklabels(scenarios)
|
|||
ax.legend(loc='lower right')
|
||||
ax.set_ylim(0, 1.0)
|
||||
|
||||
# Menambahkan label angka di atas bar
|
||||
def autolabel(rects):
|
||||
for rect in rects:
|
||||
height = rect.get_height()
|
||||
|
|
|
|||
Loading…
Reference in New Issue