TIFNGK_E41222719/src/utils/splitting_data.py

45 lines
1.4 KiB
Python

import pandas as pd
from sklearn.model_selection import train_test_split
# Input dataset and the name of the label column used for stratification.
nama_file = 'robust_data/dataset/dataset_fix_preprocessed.csv'
col_sentiment = 'Sentiment'


def normalize_labels(df: pd.DataFrame, kolom: str) -> pd.DataFrame:
    """Return a copy of *df* whose *kolom* labels are normalized and non-empty.

    Rows with a missing label are dropped before the string conversion,
    because ``astype(str)`` can turn a missing value into the literal text
    ``"nan"``, which would then be counted and stratified as a real class.
    The remaining labels are stripped and lower-cased; rows whose label is
    empty after stripping are dropped as well. The input frame is not
    modified and the original index values are kept.

    Args:
        df: Frame containing the label column.
        kolom: Name of the label column.

    Returns:
        A new DataFrame holding only the rows that carry a usable label.

    Raises:
        KeyError: If *kolom* is not a column of *df*.
    """
    labeled = df.dropna(subset=[kolom]).copy()
    labeled[kolom] = labeled[kolom].astype(str).str.strip().str.lower()
    return labeled[labeled[kolom] != ""]


# Script body: load the dataset, clean the labels, remove duplicate rows,
# split 80/20 stratified on the label column, print the class distribution
# of both parts and write them to CSV files in the working directory.
try:
    raw = pd.read_csv(nama_file)
    df = normalize_labels(raw, col_sentiment)

    # Make the data loss visible instead of silently shrinking the dataset.
    n_unlabeled = len(raw) - len(df)
    if n_unlabeled:
        print(f"Peringatan: {n_unlabeled} baris tanpa label '{col_sentiment}' dibuang.")

    print("Distribusi data keseluruhan sebelum splitting:")
    print(df[col_sentiment].value_counts())
    print("-" * 30)

    # Deduplicate after normalization so rows that differ only in label
    # casing/whitespace collapse into one.
    df_clean = df.drop_duplicates().reset_index(drop=True)

    train, test = train_test_split(
        df_clean,
        test_size=0.20,
        stratify=df_clean[col_sentiment],  # keep class proportions in both parts
        random_state=42,  # fixed seed for a reproducible split
        shuffle=True,
    )

    print("\n" + "="*35)
    print(f"HASIL SPLITTING AKHIR (Total: {len(df_clean)})")
    print("="*35)

    print(f"\n[1] DATA LATIH (80%) - Total: {len(train)} baris")
    dist_train = train[col_sentiment].value_counts()
    for kelas, jumlah in dist_train.items():
        print(f" - {kelas.upper()}: {jumlah} baris")

    print(f"\n[2] DATA UJI (20%) - Total: {len(test)} baris")
    dist_test = test[col_sentiment].value_counts()
    for kelas, jumlah in dist_test.items():
        print(f" - {kelas.upper()}: {jumlah} baris")

    train.to_csv('data_train_80.csv', index=False)
    test.to_csv('data_test_20.csv', index=False)
    print("\nSUKSES: File 'data_train_80.csv' dan 'data_test_20.csv' telah dibuat.")
except Exception as e:
    # Top-level boundary: report the failure in the same format as before.
    print(f"Terjadi kesalahan: {e}")
    # When executed as a script, signal the failure through the exit status so
    # a calling pipeline does not mistake a failed split for a successful one.
    # On import the behaviour stays "print and continue".
    if __name__ == "__main__":
        raise SystemExit(1) from e