# TIFNGK_E41222719/src/flow_3/process_chisquare.py
#
# 67 lines
# 2.2 KiB
# Python

import joblib
import os
from sklearn.feature_selection import SelectKBest, chi2
# Project root, resolved as two directory levels above this script's folder.
base_dir = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
)

# File names of the pickled inputs for this step.
input_X_train, input_y_train, input_X_test = (
    "new_X_train_smote.pkl",
    "new_y_train_smote.pkl",
    "X_test_tfidf.pkl",
)

# File names of the pickled artefacts produced by this step.
output_X_train, output_X_test, output_selector = (
    "X_train_chi2.pkl",
    "X_test_chi2.pkl",
    "chisquare_selector.pkl",
)

# Number of top-scoring features requested from the chi-square selector.
K_FEATURES = 1000
# Chi-square feature selection step.
#
# Loads the training matrix, training labels and test matrix from base_dir,
# fits SelectKBest(chi2) on the training data, applies the same column
# selection to the test matrix, and stores both reduced matrices plus the
# fitted selector. Judging by the file names, the inputs are the output of a
# SMOTE step (train) and a TF-IDF step (test).
#
# Any failure is reported on stdout; the exception is not re-raised.
print("--- MEMULAI FEATURE SELECTION (CHI-SQUARE) ---")
try:
    print("1. Memuat data...")
    X_train = joblib.load(os.path.join(base_dir, input_X_train))
    y_train = joblib.load(os.path.join(base_dir, input_y_train))
    X_test = joblib.load(os.path.join(base_dir, input_X_test))
    print(f" - Dimensi Awal Train: {X_train.shape}")
    print(f" - Dimensi Awal Test: {X_test.shape}")

    total_features = X_train.shape[1]
    print(f" - Total kata/fitur saat ini: {total_features}")

    # SelectKBest rejects an integer k larger than the number of columns,
    # so fall back to keeping every feature in that case.
    if isinstance(K_FEATURES, int) and K_FEATURES > total_features:
        print(f" ⚠️ WARNING: Target k={K_FEATURES} lebih besar dari total fitur ({total_features}). Mengambil semua fitur.")
        k_final = 'all'
    else:
        k_final = K_FEATURES

    print(f"\n2. Menjalankan Chi-Square (Mengambil Top {k_final} Fitur)...")
    selector = SelectKBest(score_func=chi2, k=k_final)
    # Fit on the training split only; the test split is merely transformed so
    # that no test information influences which columns are kept.
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    print("\n3. Hasil Seleksi:")
    print(f" - Dimensi Train Baru: {X_train_selected.shape}")
    print(f" - Dimensi Test Baru: {X_test_selected.shape}")
    print(" - Proses seleksi selesai. Dimensi kolom (fitur) telah berkurang.")

    print("\n4. Menyimpan hasil...")
    # Write the artefacts under base_dir, the same directory the inputs were
    # read from, instead of the current working directory. Otherwise the
    # output location would depend on where the script was launched.
    train_out_path = os.path.join(base_dir, output_X_train)
    test_out_path = os.path.join(base_dir, output_X_test)
    selector_out_path = os.path.join(base_dir, output_selector)
    joblib.dump(X_train_selected, train_out_path)
    joblib.dump(X_test_selected, test_out_path)
    joblib.dump(selector, selector_out_path)

    print("=" * 40)
    print("SUKSES! Data siap untuk Training XGBoost.")
    print(f"File Train: {train_out_path}")
    print(f"File Test: {test_out_path}")
    print("=" * 40)
except FileNotFoundError as e:
    # A missing input file means an earlier pipeline step did not produce it.
    print(f"ERROR: File tidak ditemukan ({e}). Pastikan script sebelumnya (SMOTE) sukses.")
except Exception as e:
    # Top-level boundary of the script: report any other failure.
    print(f"ERROR: Terjadi kesalahan: {e}")