# File-viewer metadata (extraction residue): 67 lines, 2.2 KiB, Python.
import os

import joblib
from sklearn.feature_selection import SelectKBest, chi2

# Chi-square feature selection step.
#
# Loads the training matrix, training labels and test matrix from base_dir,
# fits a SelectKBest(chi2) selector on the training data only, applies it to
# both matrices, and stores the reduced matrices plus the fitted selector.

# Directory two levels above the folder that contains this script
# (presumably the project root -- confirm against the repository layout).
base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))

# Input artifact file names, resolved against base_dir when loaded.
input_X_train = 'new_X_train_smote.pkl'
input_y_train = 'new_y_train_smote.pkl'
input_X_test = 'X_test_tfidf.pkl'

# Output artifact file names.
output_X_train = 'X_train_chi2.pkl'
output_X_test = 'X_test_chi2.pkl'
output_selector = 'chisquare_selector.pkl'

# Outputs are written under base_dir, the same directory the inputs are read
# from, so the result location does not depend on the current working
# directory the script happens to be launched from.
output_X_train_path = os.path.join(base_dir, output_X_train)
output_X_test_path = os.path.join(base_dir, output_X_test)
output_selector_path = os.path.join(base_dir, output_selector)

# Number of top-scoring features to keep.
K_FEATURES = 1000

print("--- MEMULAI FEATURE SELECTION (CHI-SQUARE) ---")

try:
    print("1. Memuat data...")
    X_train = joblib.load(os.path.join(base_dir, input_X_train))
    y_train = joblib.load(os.path.join(base_dir, input_y_train))

    X_test = joblib.load(os.path.join(base_dir, input_X_test))

    print(f" - Dimensi Awal Train: {X_train.shape}")
    print(f" - Dimensi Awal Test: {X_test.shape}")

    total_features = X_train.shape[1]
    print(f" - Total kata/fitur saat ini: {total_features}")

    # An integer k larger than the number of columns is replaced by 'all'.
    # The isinstance guard lets a non-integer setting pass through unchanged.
    if isinstance(K_FEATURES, int) and K_FEATURES > total_features:
        print(f" ⚠️ WARNING: Target k={K_FEATURES} lebih besar dari total fitur ({total_features}). Mengambil semua fitur.")
        k_final = 'all'
    else:
        k_final = K_FEATURES

    print(f"\n2. Menjalankan Chi-Square (Mengambil Top {k_final} Fitur)...")

    # NOTE(review): chi2 expects non-negative feature values -- confirm the
    # upstream matrices satisfy this.
    selector = SelectKBest(score_func=chi2, k=k_final)

    # Fit on the training data only; the test matrix is merely transformed,
    # so it has no influence on which features are kept.
    selector.fit(X_train, y_train)

    X_train_selected = selector.transform(X_train)
    X_test_selected = selector.transform(X_test)

    print("\n3. Hasil Seleksi:")
    print(f" - Dimensi Train Baru: {X_train_selected.shape}")
    print(f" - Dimensi Test Baru: {X_test_selected.shape}")

    print(" - Proses seleksi selesai. Dimensi kolom (fitur) telah berkurang.")

    print("\n4. Menyimpan hasil...")
    joblib.dump(X_train_selected, output_X_train_path)
    joblib.dump(X_test_selected, output_X_test_path)
    joblib.dump(selector, output_selector_path)

    print("="*40)
    print("SUKSES! Data siap untuk Training XGBoost.")
    print(f"File Train: {output_X_train}")
    print(f"File Test: {output_X_test}")
    print("="*40)

except FileNotFoundError as e:
    # A missing input file is reported with a hint about the preceding step.
    print(f"ERROR: File tidak ditemukan ({e}). Pastikan script sebelumnya (SMOTE) sukses.")
except Exception as e:
    # Top-level script boundary: report any other failure instead of a traceback.
    print(f"ERROR: Terjadi kesalahan: {e}")