45 lines
1.4 KiB
Python
45 lines
1.4 KiB
Python
import pandas as pd
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
# Input dataset and the name of its label column.
nama_file = 'robust_data/dataset/dataset_fix_preprocessed.csv'
col_sentiment = 'Sentiment'


def normalize_labels(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Return a copy of *df* whose label column is ready for stratification.

    Rows with a missing label are dropped first: ``astype(str)`` would
    otherwise turn them into the literal string ``"nan"``, which would then
    be treated as a real class by the stratified split. The remaining labels
    are stripped of surrounding whitespace and lower-cased.

    Args:
        df: Raw dataset; it is not modified.
        column: Name of the label column.

    Returns:
        A new DataFrame containing only labelled rows, with normalized
        labels. The original index values are kept.

    Raises:
        ValueError: If *column* is not a column of *df*.
    """
    if column not in df.columns:
        raise ValueError(f"Kolom '{column}' tidak ditemukan di dataset.")

    cleaned = df.dropna(subset=[column]).copy()
    cleaned[column] = cleaned[column].astype(str).str.strip().str.lower()
    return cleaned


def print_distribution(labels: pd.Series) -> None:
    """Print one line per class with its row count, most frequent first."""
    for kelas, jumlah in labels.value_counts().items():
        print(f" - {kelas.upper()}: {jumlah} baris")


def main() -> None:
    """Split the dataset 80/20 with stratification and write both parts.

    Reads ``nama_file``, normalizes the ``col_sentiment`` labels, removes
    exact duplicate rows, performs a stratified split with a fixed seed and
    writes ``data_train_80.csv`` and ``data_test_20.csv`` to the current
    working directory.

    Raises:
        SystemExit: With status 1 after printing the error message if any
            step fails, so that calling shells/pipelines can detect it.
    """
    try:
        raw = pd.read_csv(nama_file)
        df = normalize_labels(raw, col_sentiment)

        # Surface silently-unusable rows instead of hiding them.
        dropped = len(raw) - len(df)
        if dropped:
            print(f"Peringatan: {dropped} baris tanpa label '{col_sentiment}' dibuang.")

        print("Distribusi data keseluruhan sebelum splitting:")
        print(df[col_sentiment].value_counts())
        print("-" * 30)

        # Deduplicate after normalization so rows that differ only in label
        # casing/whitespace collapse and cannot leak across train and test.
        df_clean = df.drop_duplicates().reset_index(drop=True)

        train, test = train_test_split(
            df_clean,
            test_size=0.20,
            stratify=df_clean[col_sentiment],
            random_state=42,
            shuffle=True,
        )

        print("\n" + "=" * 35)
        print(f"HASIL SPLITTING AKHIR (Total: {len(df_clean)})")
        print("=" * 35)

        print(f"\n[1] DATA LATIH (80%) - Total: {len(train)} baris")
        print_distribution(train[col_sentiment])

        print(f"\n[2] DATA UJI (20%) - Total: {len(test)} baris")
        print_distribution(test[col_sentiment])

        train.to_csv('data_train_80.csv', index=False)
        test.to_csv('data_test_20.csv', index=False)
        print("\nSUKSES: File 'data_train_80.csv' dan 'data_test_20.csv' telah dibuat.")

    except Exception as e:
        # Top-level boundary: keep the friendly message, but signal failure
        # through the exit status instead of finishing with status 0.
        print(f"Terjadi kesalahan: {e}")
        raise SystemExit(1) from e


if __name__ == "__main__":
    main()