TIFNGK_E41222719/new_pipeline/eda.py

36 lines
991 B
Python

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Exploratory data analysis for the trimmed sentiment dataset: class balance,
# missing values, numeric feature distributions, and outlier boxplots.
df = pd.read_csv('robust_data/dataset/trimmed_sentiment_dataset.csv')

# Name of the target column (adjust to match the dataset).
target_col = 'label'
if target_col not in df.columns:
    # Fail early with an actionable message instead of a bare KeyError later.
    raise KeyError(
        f"Target column {target_col!r} not found; "
        f"available columns: {list(df.columns)}"
    )

# 1. Class distribution (to detect class imbalance).
class_counts = df[target_col].value_counts()
print("Distribusi kelas:")
print(class_counts)
# Same distribution expressed as percentages.
print(df[target_col].value_counts(normalize=True) * 100)
class_counts.plot(kind='bar', title='Distribusi Kelas')
plt.tight_layout()
plt.show()

# 2. Missing-value check: only columns with at least one null are printed.
missing = df.isnull().sum()
print("\nNilai hilang per kolom:")
print(missing[missing > 0])

# 3. Distribution of the numeric features.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
if num_cols:
    # DataFrame.hist raises ValueError when there are no numeric columns to
    # plot, so skip this section for datasets without any.
    df[num_cols].hist(bins=20, figsize=(14, 8))
    plt.suptitle('Distribusi Fitur Numerik')
    plt.tight_layout()
    plt.show()

# 4. Boxplots for outlier detection (first 6 numeric columns only).
for col in num_cols[:6]:
    plt.figure(figsize=(6, 3))
    sns.boxplot(x=df[col])
    plt.title(f'Boxplot: {col}')
    plt.show()