E41222753_NinikYuniarsih_Ju.../scripts/plot_dataset_distribution.py

#!/usr/bin/env python3
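"""Plot a 2D PCA scatter of the dataset before and after under-sampling.

Reads data/dataset_smakom.csv (before) and data/dataset_smakom_final.csv
(after), projects the ``nilai_*`` feature columns onto two principal
components, colours the points by the ``paket_jurusan`` class and saves the
figure as data/dataset_scatter_comparison.png.

Usage: python scripts/plot_dataset_distribution.py
"""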
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


def safe_read_csv(path):
    """Load a CSV file into a DataFrame, exiting the program if it is missing.

    Args:
        path: Filesystem path of the CSV file to read.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv(path)``.

    Raises:
        SystemExit: With exit code 1 when ``path`` is not an existing regular
            file (this includes the case where ``path`` is a directory).
    """
    # isfile() rather than exists(): a directory "exists" too, but handing it
    # to read_csv would crash with an unrelated OS error instead of the
    # friendly message below.
    if not os.path.isfile(path):
        # Diagnostics belong on stderr so they do not mix with normal output.
        print(f"File not found: {path}", file=sys.stderr)
        sys.exit(1)
    return pd.read_csv(path)


def plot_distribution_scatter(before_df, after_df, label_col='paket_jurusan'):
    """Draw side-by-side 2D PCA scatter plots of two datasets and save them.

    The feature columns of both frames are concatenated, standardised and
    projected with a single PCA fit, so the two panels share one coordinate
    system and can be compared directly.

    Args:
        before_df: DataFrame with the data before under-sampling. Its columns
            whose names start with ``nilai_`` are used as features.
        after_df: DataFrame with the data after under-sampling. It must
            contain the same ``nilai_*`` columns as ``before_df``.
        label_col: Name of the class-label column used to colour the points.

    Raises:
        ValueError: If ``before_df`` has no column starting with ``nilai_``.

    Side effects:
        Saves ``dataset_scatter_comparison.png`` in the ``data`` directory
        that sits beside this script's directory, prints the saved path and
        opens the figure with ``plt.show()``.
    """
    # Feature columns are picked by name prefix; str() keeps the check safe
    # when a frame has non-string column labels.
    numeric_cols = [c for c in before_df.columns if str(c).startswith('nilai_')]
    if not numeric_cols:
        raise ValueError(
            "No feature columns starting with 'nilai_' found in the dataset."
        )

    # Fit the scaler and PCA on both datasets together so that both panels
    # use the same projection.
    combined = pd.concat(
        [before_df[numeric_cols], after_df[numeric_cols]], ignore_index=True
    )
    combined_scaled = StandardScaler().fit_transform(combined.values)
    combined_2d = PCA(n_components=2).fit_transform(combined_scaled)

    # Rows were stacked as [before, after]; split them back by position.
    n_before = len(before_df)
    before_2d = combined_2d[:n_before]
    after_2d = combined_2d[n_before:]

    # Build ONE class -> colour assignment from the labels of both datasets.
    # Assigning colours per panel would give the same class different colours
    # whenever a class is present in only one of the two datasets.
    all_labels = pd.concat(
        [before_df[label_col], after_df[label_col]], ignore_index=True
    )
    cmap = plt.get_cmap('tab10')
    class_colors = [
        (cls, cmap(i % 10))  # tab10 has 10 colours; wrap around beyond that
        for i, cls in enumerate(sorted(all_labels.dropna().unique()))
    ]

    fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharex=True, sharey=True)

    def draw_scatter(ax, points, labels, title):
        """Scatter ``points`` on ``ax``, one colour per class in ``labels``."""
        for cls, color in class_colors:
            # Convert to a plain boolean array so the mask is applied by
            # position, independent of the Series index.
            mask = (labels == cls).to_numpy()
            if not mask.any():
                # Class absent from this panel: keep it out of the legend.
                continue
            ax.scatter(points[mask, 0], points[mask, 1], s=40, alpha=0.8,
                       color=color, label=f'Kelas {cls}')
        ax.set_title(title)
        ax.set_xlabel('PCA 1')
        ax.set_ylabel('PCA 2')
        ax.legend(loc='best', fontsize='small')

    draw_scatter(axes[0], before_2d, before_df[label_col], 'Sebelum Under-sampling')
    draw_scatter(axes[1], after_2d, after_df[label_col], 'Sesudah Under-sampling')

    fig.suptitle('Sebaran Titik Data (2D PCA) — Sebelum vs Sesudah Under-sampling')
    # Leave headroom at the top of the figure for the suptitle.
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])

    # abspath() first: __file__ may be a bare relative filename, in which case
    # dirname() alone would not identify the script's directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    out_dir = os.path.join(os.path.dirname(script_dir), 'data')
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, 'dataset_scatter_comparison.png')
    fig.savefig(out_path, dpi=150)
    print(f"Saved scatter comparison plot to: {out_path}")

    plt.show()


def main():
    """Load the before/after datasets and plot their PCA scatter comparison.

    Reads ``dataset_smakom.csv`` and ``dataset_smakom_final.csv`` from the
    ``data`` directory beside this script's directory, checks that both
    contain the ``paket_jurusan`` label column, then calls
    ``plot_distribution_scatter``.

    Raises:
        SystemExit: With exit code 1 when the label column is missing from
            either file.
    """
    label_col = 'paket_jurusan'

    # abspath() first: when __file__ is a bare relative filename (script
    # launched from inside its own directory on Python < 3.9), two dirname()
    # calls collapse to '' and the data directory would resolve to the wrong
    # place.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    repo_root = os.path.dirname(script_dir)
    data_dir = os.path.join(repo_root, 'data')
    before_path = os.path.join(data_dir, 'dataset_smakom.csv')
    after_path = os.path.join(data_dir, 'dataset_smakom_final.csv')

    before_df = safe_read_csv(before_path)
    after_df = safe_read_csv(after_path)

    if label_col not in before_df.columns or label_col not in after_df.columns:
        print('Expected column "paket_jurusan" not found in one of the files.',
              file=sys.stderr)
        sys.exit(1)

    plot_distribution_scatter(before_df, after_df, label_col=label_col)


# Run the plotting pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()