TIFNGK_E41222719/new_pipeline/chi2.py

127 lines
5.1 KiB
Python

"""
STEP 1 — Feature Selection with Chi-Square
==========================================
Correct ordering:
Chi-Square MUST run BEFORE SMOTE and BEFORE GridSearch.
Rationale:
- Chi-Square measures the association between the original features and the
  original target (the real distribution).
- If Chi-Square runs AFTER SMOTE, features are selected from synthetic data
  that does not represent the true distribution → selection bias.
- The results are saved to disk so that step2 and step3 can load them directly.
Output:
- X_train_selected.pkl → training data after feature selection
- X_test_selected.pkl → test data after feature selection (transform only!)
- selector.pkl → the fitted SelectKBest object
- selected_feature_report.csv → chi2 score and p-value of each selected feature
- y_train.pkl, y_test.pkl, label_encoder.pkl → unchanged copies of the labels
  and the label encoder, so later steps do not need the tokenize folder
"""
import joblib
import sys
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.sparse import issparse
from sklearn.feature_selection import SelectKBest, chi2
# ── Path configuration ─────────────────────────────────────────────────────────
# NOTE: despite its name, SCRIPT_DIR is resolved relative to the current
# working directory, not to the location of this file.
SCRIPT_DIR = Path("robust_data")
TOK_DIR = SCRIPT_DIR.joinpath("tokenize")
OUT_DIR = SCRIPT_DIR.joinpath("selected")
# Create the output folder up front so later dumps cannot fail on a missing dir.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# (logical name, file name) pairs; every file lives in TOK_DIR.
_INPUT_FILES = (
    ("X_train", "X_train_tfidf.pkl"),
    ("y_train", "y_train.pkl"),
    ("X_test", "X_test_tfidf.pkl"),
    ("y_test", "y_test.pkl"),
    ("le", "label_encoder.pkl"),
)
PATHS = {key: TOK_DIR / filename for key, filename in _INPUT_FILES}

# Number of top-scoring features to keep.
K_FEATURES = 2000
# ── Load data ──────────────────────────────────────────────────────────────────
banner = "=" * 55
print(banner)
print("STEP 1 — CHI-SQUARE FEATURE SELECTION")
print(banner)

# Load every required artifact; abort with exit code 1 on the first missing
# file. NOTE: joblib.load unpickles its input, so only trusted files should be
# placed in the input folder.
data = {}
for name, path in PATHS.items():
    if path.exists():
        data[name] = joblib.load(path)
        print(f"✅ Loaded: {path.name}")
        continue
    print(f"❌ File tidak ditemukan: {path}")
    sys.exit(1)

X_train, y_train = data["X_train"], data["y_train"]
X_test, y_test = data["X_test"], data["y_test"]
le = data["le"]

print(f"\nDimensi awal X_train : {X_train.shape}")
print(f"Dimensi awal X_test : {X_test.shape}")
# ── Validation: chi-square requires non-negative values ────────────────────────
# TF-IDF weights are >= 0 by definition; this check is only a safety net.
min_val = X_train.min() if issparse(X_train) else np.min(X_train)
if min_val < 0:
    print(f"\n⚠️ Nilai minimum: {min_val:.4f} — menggeser ke non-negatif.")
    # SciPy sparse matrices do not support subtracting a non-zero scalar
    # (it raises NotImplementedError because the result would be dense), so
    # convert to dense arrays explicitly before shifting.
    # NOTE: this can use a lot of memory for a large vocabulary.
    if issparse(X_train):
        X_train = X_train.toarray()
    if issparse(X_test):
        X_test = X_test.toarray()
    # Apply the same offset (derived from the training data only) to both
    # sets so that train and test stay on the same scale.
    X_train = X_train - min_val
    X_test = X_test - min_val
else:
    print(f"\n✅ Nilai minimum {min_val:.4f} — sudah non-negatif, aman untuk Chi-Square.")
# ── Fit SelectKBest on the ORIGINAL training data (before SMOTE) ───────────────
# Clamp k to the number of available features: asking SelectKBest for more
# features than exist raises ValueError on older scikit-learn releases and
# only emits a warning on newer ones.
n_features_in = X_train.shape[1]
k_effective = min(K_FEATURES, n_features_in)
if k_effective < K_FEATURES:
    print(
        f"\n⚠️ K_FEATURES={K_FEATURES} melebihi jumlah fitur "
        f"({n_features_in}) — memakai k={k_effective}."
    )

print(f"\nFit SelectKBest(chi2, k={k_effective}) pada X_train asli...")
selector = SelectKBest(score_func=chi2, k=k_effective)
# Fit ONLY on X_train (original distribution, not synthetic samples).
selector.fit(X_train, y_train)

# Apply the fitted selector to both sets; the test set is never used to fit.
X_train_sel = selector.transform(X_train)
X_test_sel = selector.transform(X_test)
print(f"Dimensi setelah seleksi X_train : {X_train_sel.shape}")
print(f"Dimensi setelah seleksi X_test : {X_test_sel.shape}")
# ── Score report for the selected features ─────────────────────────────────────
support_mask = selector.get_support()
all_scores = selector.scores_
all_pvalues = selector.pvalues_

# One row per selected feature, highest chi-square score first.
report_df = pd.DataFrame({
    "fitur_index": np.where(support_mask)[0],
    "chi2_score": all_scores[support_mask],
    "p_value": all_pvalues[support_mask],
}).sort_values("chi2_score", ascending=False).reset_index(drop=True)

report_path = OUT_DIR / "selected_feature_report.csv"
report_df.to_csv(report_path, index=False)
print("\nTop 10 fitur berdasarkan skor Chi-Square:")
print(report_df.head(10).to_string(index=False))
print(f"\nLaporan lengkap disimpan: {report_path}")

# Derive the counts from the actual support mask rather than from the
# K_FEATURES constant, so the summary stays correct (and non-negative) when
# the data has fewer than K_FEATURES features.
# NOTE(review): SelectKBest drops features ranked below the top k by score;
# that does not by itself establish that they are statistically
# insignificant, as the message below states — confirm the wording.
n_selected = int(support_mask.sum())
n_dropped = X_train.shape[1] - n_selected
print(f"\nFitur dibuang : {n_dropped:,} (tidak signifikan secara statistik)")
print(f"Fitur tersisa : {n_selected:,}")
# ── Persist outputs ────────────────────────────────────────────────────────────
# The labels and the label encoder are copied alongside the selected matrices
# so that step2/step3 do not need access to the tokenize folder.
_artifacts = {
    "X_train_selected.pkl": X_train_sel,
    "X_test_selected.pkl": X_test_sel,
    "selector.pkl": selector,
    "y_train.pkl": y_train,
    "y_test.pkl": y_test,
    "label_encoder.pkl": le,
}
for _filename, _obj in _artifacts.items():
    joblib.dump(_obj, OUT_DIR / _filename)

print(f"\n💾 Semua output disimpan di: {OUT_DIR}")
print("\n✅ STEP 1 SELESAI — Lanjut ke step2_gridsearch.py")