177 lines
7.3 KiB
Python
177 lines
7.3 KiB
Python
"""
|
|
STEP 2 — SMOTE + XGBoost + Grid Search CV
|
|
==========================================
|
|
Urutan yang benar:
|
|
Data masuk → SMOTE (dalam fold CV) → XGBoost
|
|
|
|
Mengapa SMOTE di DALAM pipeline CV (bukan di luar)?
|
|
- Jika SMOTE dijalankan di luar CV (sekali sebelum fit), sampel sintetis
|
|
dari data validasi bisa "bocor" ke data train setiap fold → metrik CV
|
|
terlalu optimis, tidak merepresentasikan performa pada data baru.
|
|
- Dengan SMOTE di dalam pipeline, tiap fold:
|
|
1. Data train fold di-resample dengan SMOTE
|
|
2. Data validasi fold TIDAK disentuh SMOTE
|
|
→ evaluasi CV lebih jujur dan reliable.
|
|
|
|
Chi-Square sudah selesai di step1 (fit pada distribusi asli),
|
|
jadi di sini kita TIDAK perlu SelectKBest lagi.
|
|
|
|
Input : output dari step1_feature_selection.py
|
|
Output : best_model_step2.pkl, grid_search_results.csv
|
|
"""
|
|
|
|
import joblib
|
|
import sys
|
|
import time
|
|
import numpy as np
|
|
import pandas as pd
|
|
from pathlib import Path
|
|
from xgboost import XGBClassifier
|
|
from sklearn.model_selection import GridSearchCV, StratifiedKFold
|
|
from sklearn.metrics import classification_report, confusion_matrix
|
|
from imblearn.pipeline import Pipeline as ImbPipeline
|
|
from imblearn.over_sampling import SMOTE
|
|
|
|
# ── Path configuration ─────────────────────────────────────────────────────────
# All locations are relative paths, so they resolve against the directory the
# script is launched from.
SCRIPT_DIR = Path("robust_data")
SEL_DIR = SCRIPT_DIR.joinpath("selected")  # step-1 artifacts are read from here
MODEL_DIR = SCRIPT_DIR.joinpath("models")  # this step writes its outputs here
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Logical artifact name → pickle file inside SEL_DIR. Insertion order is the
# order in which the artifacts are loaded and reported.
PATHS = {
    key: SEL_DIR / filename
    for key, filename in {
        "X_train": "X_train_selected.pkl",
        "y_train": "y_train.pkl",
        "X_test": "X_test_selected.pkl",
        "y_test": "y_test.pkl",
        "le": "label_encoder.pkl",
    }.items()
}
|
|
|
|
# ── Load the artifacts produced by step 1 ──────────────────────────────────────
print("=" * 55, "STEP 2 — SMOTE + XGBOOST + GRID SEARCH CV", "=" * 55, sep="\n")

# NOTE(review): joblib.load unpickles whatever the file contains — only point
# PATHS at artifacts you generated yourself.
data = {}
for artifact, artifact_path in PATHS.items():
    if artifact_path.exists():
        data[artifact] = joblib.load(artifact_path)
        print(f"✅ Loaded: {artifact_path.name}")
    else:
        # Stop at the first missing artifact; nothing below can run without it.
        print(f"❌ File tidak ditemukan: {artifact_path}")
        print(" Pastikan step1_feature_selection.py sudah dijalankan.")
        sys.exit(1)

# Unpack into the module-level names used by the rest of the script.
X_train, y_train, X_test, y_test, le = (
    data[key] for key in ("X_train", "y_train", "X_test", "y_test", "le")
)

# Feature-matrix shapes after the chi-square selection done in step 1.
print(
    f"\nDimensi X_train (post chi2): {X_train.shape}",
    f"Dimensi X_test (post chi2): {X_test.shape}",
    sep="\n",
)
|
|
|
|
# ── Class proportions ──────────────────────────────────────────────────────────
def print_proportion(y, title, le):
    """Print the number and percentage of samples for every class in *y*.

    Args:
        y: Sequence or array of encoded class labels.
        title: Heading printed in square brackets above the table.
        le: Object exposing ``inverse_transform``; it is used to map the
            encoded values back to their original labels (presumably the
            fitted label encoder saved by step 1 — confirm against caller).

    Returns:
        None. The table is written to standard output.
    """
    classes, counts = np.unique(y, return_counts=True)
    total = len(y)  # hoisted: identical for every row

    print(f"\n[{title}]")
    if total == 0:
        # Nothing to tabulate; also avoids handing an empty array to the encoder.
        return

    # Decode all classes with a single encoder call instead of one per class.
    labels = le.inverse_transform(classes)
    for label, count in zip(labels, counts):
        # str() keeps the width-10 padding working for non-string labels.
        print(f" {str(label):10}: {count:6,} sampel ({count / total * 100:.2f}%)")
|
|
|
|
# Class balance of the training labels as loaded, i.e. before any resampling.
print_proportion(y_train, "PROPORSI TRAIN (sebelum SMOTE)", le)

# ── Pipeline: SMOTE → XGBoost ──────────────────────────────────────────────────
# SMOTE is a pipeline step (not a one-off preprocessing call) so that, during
# cross-validation, it is fitted on each fold's training split only.
#
# `use_label_encoder` is deliberately NOT passed: it is a retired XGBoost
# argument, and on the 2.x line (the one that provides `device`) it is just
# forwarded as an unknown booster parameter, emitting a
# "Parameters: { use_label_encoder } are not used" warning on every fit.
pipeline = ImbPipeline([
    ("smote", SMOTE(random_state=42)),
    ("clf", XGBClassifier(
        objective="multi:softprob",
        num_class=len(np.unique(y_train)),  # one output per distinct train label
        random_state=42,
        eval_metric="mlogloss",
        tree_method="hist",  # faster on large data
        device="cpu",
    )),
])
|
|
|
|
# ── Hyper-parameter grid ───────────────────────────────────────────────────────
# Keys use the "<step>__<param>" form so they reach the "clf" pipeline step.
param_grid = {
    "clf__learning_rate": [0.01, 0.1, 0.2],
    "clf__max_depth": [3, 5, 7],
    "clf__n_estimators": [100, 200],
    "clf__subsample": [0.8, 1.0],
    "clf__colsample_bytree": [0.8, 1.0],
}

# Single source of truth for the fold count: it feeds both the workload report
# below and the splitter, so the two can never disagree.
N_SPLITS = 5

# Size of the full Cartesian product of the grid.
total_combinations = int(np.prod([len(values) for values in param_grid.values()]))
print(f"\nTotal kombinasi parameter : {total_combinations}")
print(f"CV folds : {N_SPLITS}")
print(f"Total fit : {total_combinations * N_SPLITS}")

# ── StratifiedKFold — keeps the class proportions in every fold ────────────────
cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring="f1_macro",  # main metric for imbalanced multi-class data
    n_jobs=-1,
    verbose=2,
    refit=True,  # retrain automatically with the best parameters
    return_train_score=True,
)
|
|
|
|
# ── Training ───────────────────────────────────────────────────────────────────
print(f"\n🔥 MULAI GRID SEARCH... (dimensi input: {X_train.shape})")

# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments during a long search (time.time() can be).
start_time = time.perf_counter()
grid_search.fit(X_train, y_train)
duration = time.perf_counter() - start_time  # seconds

print(f"\n✅ SELESAI! Waktu: {duration/60:.2f} menit")
|
|
|
|
# ── Best hyper-parameters ──────────────────────────────────────────────────────
print("", "=" * 55, "PARAMETER TERBAIK", "=" * 55, sep="\n")

# One line per tuned parameter, names padded to a common width.
for param_name, param_value in grid_search.best_params_.items():
    print(f" {param_name:35}: {param_value}")

# Cross-validated score of the winning combination (scoring is f1_macro).
print(f"\n F1-macro CV (terbaik) : {grid_search.best_score_:.4f}")
|
|
|
|
# ── Save every CV result to CSV ────────────────────────────────────────────────
cv_path = MODEL_DIR / "grid_search_results.csv"

# One row per parameter combination, ordered by rank_test_score.
cv_results_df = pd.DataFrame(grid_search.cv_results_).sort_values("rank_test_score")
cv_results_df.to_csv(cv_path, index=False)

print(f"\n📊 Hasil semua kombinasi CV disimpan: {cv_path}")
|
|
|
|
# ── Evaluation on the test data ────────────────────────────────────────────────
print("", "=" * 55, "EVALUASI PADA DATA TEST", "=" * 55, sep="\n")

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)


def _as_str_labels(encoded):
    """Decode encoded labels with ``le`` and cast the result to ``str``."""
    return le.inverse_transform(encoded).astype(str)


# Everything handed to the metric functions is cast to str; this prevents the
# "TypeError: '<'" raised when strings are compared with float/NaN labels.
y_test_label = _as_str_labels(y_test)
y_pred_label = _as_str_labels(y_pred)
str_classes = le.classes_.astype(str)

print("\nClassification Report:")
print(classification_report(y_test_label, y_pred_label, labels=str_classes))

print("Confusion Matrix:")
cm = confusion_matrix(y_test_label, y_pred_label, labels=str_classes)
# Rows are the actual classes, columns the predicted ones.
row_names = [f"Aktual: {c}" for c in str_classes]
col_names = [f"Pred: {c}" for c in str_classes]
cm_df = pd.DataFrame(cm, index=row_names, columns=col_names)
print(cm_df.to_string())
|
|
|
|
# ── Save the model ─────────────────────────────────────────────────────────────
# Persists the refitted best pipeline found by the grid search.
model_path = MODEL_DIR.joinpath("xgboost_scenario3.pkl")
joblib.dump(best_model, model_path)

print(
    f"\n💾 Model disimpan: {model_path}",
    "\n✅ STEP 2 SELESAI — Lanjut ke step3_evaluation.py untuk analisis lengkap",
    sep="\n",
)