E41222753_NinikYuniarsih_Ju.../scripts/plot_dataset_distribution.py

84 lines
2.9 KiB
Python

#!/usr/bin/env python3
"""Plot class distribution before and after undersampling.
Usage: python scripts/plot_dataset_distribution.py
"""
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
def safe_read_csv(path):
    """Load a CSV file into a DataFrame, exiting the program if it is missing.

    Args:
        path: Filesystem path of the CSV file to load.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv(path)``.

    Raises:
        SystemExit: With exit status 1 when ``path`` is not an existing
            regular file. A "File not found" message is printed first.
    """
    # isfile (rather than exists) also rejects directories; those would
    # otherwise pass the guard and fail inside pd.read_csv with an error
    # that does not mention the real problem.
    if not os.path.isfile(path):
        print(f"File not found: {path}")
        sys.exit(1)
    return pd.read_csv(path)
def plot_distribution_scatter(before_df, after_df, label_col='paket_jurusan'):
    """Draw side-by-side 2-D PCA scatter plots of two datasets, coloured by class.

    Feature columns are those of ``before_df`` whose name starts with
    ``'nilai_'``. Both frames are stacked, standardised and projected with a
    single PCA fit so the two panels share one coordinate system. The figure
    is saved as ``dataset_scatter_comparison.png`` in the ``data`` directory
    next to this script's directory (created if missing) and then shown with
    ``plt.show()``.

    Args:
        before_df: DataFrame holding the data before under-sampling.
        after_df: DataFrame holding the data after under-sampling. It must
            contain the same ``nilai_*`` columns as ``before_df``.
        label_col: Name of the class-label column present in both frames.

    Raises:
        ValueError: If ``before_df`` has no column starting with ``'nilai_'``.
    """
    # Feature columns are recognised purely by their 'nilai_' name prefix.
    numeric_cols = [c for c in before_df.columns if c.startswith('nilai_')]
    if not numeric_cols:
        raise ValueError(
            "No feature columns starting with 'nilai_' found in before_df."
        )

    # Fit scaler and PCA on the stacked data so both panels use one projection.
    combined = pd.concat(
        [before_df[numeric_cols], after_df[numeric_cols]], ignore_index=True
    )
    combined_scaled = StandardScaler().fit_transform(combined.values)
    combined_2d = PCA(n_components=2).fit_transform(combined_scaled)

    # Rows were stacked before-then-after, so split back at len(before_df).
    n_before = len(before_df)
    before_2d = combined_2d[:n_before]
    after_2d = combined_2d[n_before:]

    # Colour indices come from the union of classes in BOTH frames, so a
    # class keeps the same colour in each panel even when one frame lacks
    # some classes.
    all_classes = sorted(
        pd.concat(
            [before_df[label_col], after_df[label_col]], ignore_index=True
        ).unique()
    )
    cmap = plt.get_cmap('tab10')

    fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharex=True, sharey=True)

    def draw_scatter(ax, points, labels, title):
        """Scatter ``points`` on ``ax``, one colour and legend entry per class."""
        for i, cls in enumerate(all_classes):
            # Plain NumPy boolean mask for indexing the PCA coordinate array.
            mask = (labels == cls).to_numpy()
            if not mask.any():
                # Class absent from this panel: no points, no legend entry.
                continue
            ax.scatter(points[mask, 0], points[mask, 1], s=40, alpha=0.8,
                       color=cmap(i % 10), label=f'Kelas {cls}')
        ax.set_title(title)
        ax.set_xlabel('PCA 1')
        ax.set_ylabel('PCA 2')
        ax.legend(loc='best', fontsize='small')

    draw_scatter(axes[0], before_2d, before_df[label_col], 'Sebelum Under-sampling')
    draw_scatter(axes[1], after_2d, after_df[label_col], 'Sesudah Under-sampling')

    fig.suptitle('Sebaran Titik Data (2D PCA) — Sebelum vs Sesudah Under-sampling')
    # Restrict the layout rectangle so the axes stay clear of the suptitle.
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])

    out_dir = os.path.join(os.path.dirname(__file__), '..', 'data')
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, 'dataset_scatter_comparison.png')
    fig.savefig(out_path, dpi=150)
    print(f"Saved scatter comparison plot to: {out_path}")
    plt.show()
def main():
    """Load the before/after datasets and plot their PCA scatter comparison.

    Reads ``dataset_smakom.csv`` and ``dataset_smakom_final.csv`` from the
    ``data`` directory one level above this script's directory, checks that
    both contain the ``paket_jurusan`` label column, and passes them to
    ``plot_distribution_scatter``.

    Raises:
        SystemExit: With exit status 1 when the label column is missing from
            either file (``safe_read_csv`` likewise exits when a file cannot
            be found).
    """
    # abspath matters when __file__ is a bare relative filename (e.g. the
    # script is started from inside its own directory on an interpreter that
    # does not absolutise it): dirname(dirname('x.py')) is '' and the data
    # directory would then resolve relative to the current directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    repo_root = os.path.dirname(script_dir)
    data_dir = os.path.join(repo_root, 'data')

    before_df = safe_read_csv(os.path.join(data_dir, 'dataset_smakom.csv'))
    after_df = safe_read_csv(os.path.join(data_dir, 'dataset_smakom_final.csv'))

    label_col = 'paket_jurusan'
    if label_col not in before_df.columns or label_col not in after_df.columns:
        print('Expected column "paket_jurusan" not found in one of the files.')
        sys.exit(1)

    plot_distribution_scatter(before_df, after_df, label_col=label_col)
# Run the plotting workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()