# File: TIFNGK_E41222719/data_collect.py (file-viewer header: 82 lines, 2.3 KiB, Python)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
# Load the dataset (change the path to match your own file).
df = pd.read_csv('robust_data/dataset/trimmed_sentiment_dataset.csv')

# Dataset size as (rows, columns).
print(f"Shape: {df.shape}")

# Preview the first 5 rows.
print(df.head())

# Summary of column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()

# Descriptive statistics for every column, numeric or not.
print(df.describe(include='all'))
# Name of the target column (adjust to match the dataset).
target_col = 'label'

# 1. Class distribution (used to spot class imbalance).
class_counts = df[target_col].value_counts()
print("Distribusi kelas:")
print(class_counts)
print(df[target_col].value_counts(normalize=True) * 100)  # share of each class, in percent
class_counts.plot(kind='bar', title='Distribusi Kelas')
plt.tight_layout()
plt.show()

# 2. Missing-value check: print only the columns that have at least one null.
missing = df.isnull().sum()
print("\nNilai hilang per kolom:")
print(missing[missing > 0])

# 3. Distribution of the numeric features.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
# DataFrame.hist raises ValueError when it is given no numeric columns, so the
# histogram grid is skipped for datasets that have none.
if num_cols:
    df[num_cols].hist(bins=20, figsize=(14, 8))
    plt.suptitle('Distribusi Fitur Numerik')
    plt.tight_layout()
    plt.show()

# 4. Boxplots for outlier detection (first 6 numeric columns only).
for col in num_cols[:6]:
    plt.figure(figsize=(6, 3))
    sns.boxplot(x=df[col])
    plt.title(f'Boxplot: {col}')
    plt.show()
# Split the features from the target.
X = df.drop(columns=[target_col])
y = df[target_col].copy()

# Identify feature columns by dtype.
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# 1. Impute missing values.
imp_num = SimpleImputer(strategy='median')  # numeric: median (robust to outliers)
imp_cat = SimpleImputer(strategy='most_frequent')  # categorical: mode
# SimpleImputer rejects an input with zero columns, so each imputer only runs
# when its column group is non-empty.
if num_cols:
    X[num_cols] = imp_num.fit_transform(X[num_cols])
if cat_cols:
    X[cat_cols] = imp_cat.fit_transform(X[cat_cols])

# 2. Label-encode the categorical columns.
# NOTE(review): every object/category column is encoded this way, so a
# free-text column (if the dataset has one) would get one integer per distinct
# text — confirm that is intended.
le_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    le_dict[col] = le  # kept so the encoding can be inverse-transformed later

# 3. Encode the target when it is stored as strings (object dtype).
if y.dtype == 'object':
    le_target = LabelEncoder()
    y = le_target.fit_transform(y)

print(f"Fitur setelah preprocessing: {X.shape}")
print(X.head())