#!/usr/bin/env python3
"""Plot class distribution before and after undersampling.

Both datasets are projected onto a shared 2D PCA plane (fitted on the
standardised ``nilai_*`` feature columns of the two datasets combined) and
drawn as two side-by-side scatter panels coloured by class label.

Usage:
    python scripts/plot_dataset_distribution.py
"""
import os
import sys

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Columns whose name starts with this prefix are used as PCA features.
_FEATURE_PREFIX = 'nilai_'


def _data_dir():
    """Return the absolute ``data`` directory one level above this script's directory."""
    # abspath first: on Python < 3.9 ``__file__`` of the main script can be a
    # relative path, and dirname() of a bare file name is the empty string.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(os.path.dirname(script_dir), 'data')


def safe_read_csv(path):
    """Read ``path`` as CSV, exiting with status 1 if the file does not exist.

    Args:
        path: Filesystem path of the CSV file.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    try:
        return pd.read_csv(path)
    except FileNotFoundError:
        print(f"File not found: {path}", file=sys.stderr)
        sys.exit(1)


def _draw_class_scatter(ax, points, labels, title):
    """Scatter ``points`` (n x 2 array) on ``ax`` with one colour per class in ``labels``."""
    cmap = plt.get_cmap('tab10')
    # Drop missing labels before sorting: NaN mixed with string labels makes
    # sorted() raise TypeError, and NaN never matches ``==`` anyway.
    classes = sorted(labels.dropna().unique())
    for i, cls in enumerate(classes):
        # Plain boolean array so the rows are selected by position.
        mask = (labels == cls).to_numpy()
        ax.scatter(
            points[mask, 0],
            points[mask, 1],
            s=40,
            alpha=0.8,
            color=cmap(i % 10),  # tab10 has 10 colours; wrap around beyond that
            label=f'Kelas {cls}',
        )
    ax.set_title(title)
    ax.set_xlabel('PCA 1')
    ax.set_ylabel('PCA 2')
    ax.legend(loc='best', fontsize='small')


def plot_distribution_scatter(before_df, after_df, label_col='paket_jurusan'):
    """Plot both datasets in a shared 2D PCA space, save the figure and show it.

    The figure is written to ``dataset_scatter_comparison.png`` inside the
    ``data`` directory and then displayed with ``plt.show()``.

    Args:
        before_df: DataFrame holding the data before under-sampling.
        after_df: DataFrame holding the data after under-sampling.
        label_col: Name of the class-label column present in both frames.

    Raises:
        ValueError: If ``before_df`` has no column starting with ``nilai_``.
        KeyError: If ``after_df`` lacks one of the feature columns.
    """
    # Feature columns are identified by name prefix (excludes name and label).
    numeric_cols = [c for c in before_df.columns if str(c).startswith(_FEATURE_PREFIX)]
    if not numeric_cols:
        raise ValueError(f"No feature columns starting with '{_FEATURE_PREFIX}' found.")
    missing = [c for c in numeric_cols if c not in after_df.columns]
    if missing:
        raise KeyError(f"Feature columns missing from the second dataset: {missing}")

    # Fit scaler and PCA on both datasets together so the two panels share
    # one projection and are directly comparable.
    combined = pd.concat([before_df[numeric_cols], after_df[numeric_cols]], ignore_index=True)
    combined_scaled = StandardScaler().fit_transform(combined.values)
    combined_2d = PCA(n_components=2).fit_transform(combined_scaled)

    # Rows keep their concat order: the "before" rows come first.
    n_before = len(before_df)
    before_2d = combined_2d[:n_before]
    after_2d = combined_2d[n_before:]

    fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharex=True, sharey=True)
    _draw_class_scatter(axes[0], before_2d, before_df[label_col], 'Sebelum Under-sampling')
    _draw_class_scatter(axes[1], after_2d, after_df[label_col], 'Sesudah Under-sampling')

    fig.suptitle('Sebaran Titik Data (2D PCA) — Sebelum vs Sesudah Under-sampling')
    # Leave headroom at the top of the layout rectangle for the suptitle.
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])

    out_dir = _data_dir()
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, 'dataset_scatter_comparison.png')
    fig.savefig(out_path, dpi=150)
    print(f"Saved scatter comparison plot to: {out_path}")

    plt.show()


def main():
    """Load the before/after CSV files from ``data`` and plot their comparison."""
    label_col = 'paket_jurusan'
    data_dir = _data_dir()

    before_df = safe_read_csv(os.path.join(data_dir, 'dataset_smakom.csv'))
    after_df = safe_read_csv(os.path.join(data_dir, 'dataset_smakom_final.csv'))

    if label_col not in before_df.columns or label_col not in after_df.columns:
        print('Expected column "paket_jurusan" not found in one of the files.', file=sys.stderr)
        sys.exit(1)

    plot_distribution_scatter(before_df, after_df, label_col=label_col)


if __name__ == '__main__':
    main()