84 lines
2.9 KiB
Python
84 lines
2.9 KiB
Python
#!/usr/bin/env python3
|
|
"""Plot class distribution before and after undersampling.
|
|
|
|
Usage: python scripts/plot_dataset_distribution.py
|
|
"""
|
|
import os
|
|
import sys
|
|
import pandas as pd
|
|
import matplotlib.pyplot as plt
|
|
from sklearn.decomposition import PCA
|
|
from sklearn.preprocessing import StandardScaler
|
|
|
|
|
|
def safe_read_csv(path):
    """Load a CSV file into a DataFrame, exiting the script if it is missing.

    Args:
        path: Filesystem path of the CSV file to read.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv(path)``.

    Raises:
        SystemExit: With exit status 1 when ``path`` is not an existing
            regular file.
    """
    # isfile() rather than exists(): a directory passes an existence check
    # and would then fail inside pandas with a far less helpful error.
    if not os.path.isfile(path):
        # Diagnostics belong on stderr so they never mix with normal output.
        print(f"File not found: {path}", file=sys.stderr)
        sys.exit(1)
    return pd.read_csv(path)
|
|
|
|
|
|
def _draw_class_scatter(ax, points, labels, title):
    """Draw one scatter panel with a separate colour and legend entry per class.

    Args:
        ax: Matplotlib axes to draw on.
        points: Two-column array of projected coordinates, one row per sample.
        labels: pandas Series of class labels in the same row order as
            ``points``.
        title: Title shown above the panel.
    """
    cmap = plt.get_cmap('tab10')
    for i, cls in enumerate(sorted(labels.unique())):
        # Convert to a plain boolean array so rows are selected by position,
        # independent of whatever index the label Series carries.
        mask = (labels == cls).to_numpy()
        # tab10 provides ten colours; wrap around when there are more classes.
        ax.scatter(points[mask, 0], points[mask, 1], s=40, alpha=0.8,
                   color=cmap(i % 10), label=f'Kelas {cls}')
    ax.set_title(title)
    ax.set_xlabel('PCA 1')
    ax.set_ylabel('PCA 2')
    ax.legend(loc='best', fontsize='small')


def plot_distribution_scatter(before_df, after_df, label_col='paket_jurusan'):
    """Plot side-by-side 2D PCA scatters of the data before and after under-sampling.

    Feature columns are the columns of ``before_df`` whose name starts with
    ``nilai_``. Both frames are stacked, standardised and projected with a
    single scaler/PCA fit so the two panels share one coordinate system. The
    figure is saved as ``dataset_scatter_comparison.png`` in the ``data``
    directory next to this script's directory, then shown with ``plt.show()``.

    Args:
        before_df: DataFrame holding the data before under-sampling.
        after_df: DataFrame holding the data after under-sampling.
        label_col: Name of the class-label column used to colour the points.

    Raises:
        ValueError: If fewer than two ``nilai_`` feature columns or fewer
            than two rows are available (a 2-component PCA needs both).
        KeyError: If ``after_df`` lacks one of the feature columns, or either
            frame lacks ``label_col``.
    """
    # Select feature columns by prefix (excludes name and label columns).
    # Non-string column names are skipped instead of crashing on .startswith.
    numeric_cols = [c for c in before_df.columns
                    if isinstance(c, str) and c.startswith('nilai_')]

    # Fail early with a readable message; otherwise scikit-learn raises a
    # ValueError that does not mention the expected column naming.
    n_rows = len(before_df) + len(after_df)
    if len(numeric_cols) < 2 or n_rows < 2:
        raise ValueError(
            "Need at least 2 'nilai_*' feature columns and 2 rows for a 2D PCA; "
            f"got {len(numeric_cols)} column(s) and {n_rows} row(s)."
        )

    # Fit the scaler and PCA on both datasets together so the projection is
    # consistent between the two panels.
    combined = pd.concat([before_df[numeric_cols], after_df[numeric_cols]],
                         ignore_index=True)
    combined_scaled = StandardScaler().fit_transform(combined.values)
    combined_2d = PCA(n_components=2).fit_transform(combined_scaled)

    # Rows were stacked "before" first, so split back at that boundary.
    n_before = len(before_df)
    before_2d = combined_2d[:n_before]
    after_2d = combined_2d[n_before:]

    fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharex=True, sharey=True)
    _draw_class_scatter(axes[0], before_2d, before_df[label_col],
                        'Sebelum Under-sampling')
    _draw_class_scatter(axes[1], after_2d, after_df[label_col],
                        'Sesudah Under-sampling')

    fig.suptitle('Sebaran Titik Data (2D PCA) — Sebelum vs Sesudah Under-sampling')
    # Leave room at the top of the figure for the suptitle.
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])

    # Resolve the output directory from the script's absolute location so the
    # result (and the printed path) does not depend on the working directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    out_dir = os.path.normpath(os.path.join(script_dir, '..', 'data'))
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, 'dataset_scatter_comparison.png')
    fig.savefig(out_path, dpi=150)
    print(f"Saved scatter comparison plot to: {out_path}")

    plt.show()
|
|
|
|
|
|
def main():
    """Load the before/after datasets and plot their PCA scatter comparison.

    Reads ``dataset_smakom.csv`` (before) and ``dataset_smakom_final.csv``
    (after) from the ``data`` directory located one level above this script's
    directory, checks that both contain the ``paket_jurusan`` label column,
    and passes them to ``plot_distribution_scatter``.

    Raises:
        SystemExit: With exit status 1 when the label column is missing from
            either file (``safe_read_csv`` also exits when a file is absent).
    """
    label_col = 'paket_jurusan'

    # Anchor on the absolute script path: with a bare relative __file__
    # (e.g. "plot.py"), dirname(dirname(...)) collapses to '' and the data
    # directory would wrongly resolve inside the script's own folder.
    script_path = os.path.abspath(__file__)
    repo_root = os.path.dirname(os.path.dirname(script_path))
    data_dir = os.path.join(repo_root, 'data')

    before_df = safe_read_csv(os.path.join(data_dir, 'dataset_smakom.csv'))
    after_df = safe_read_csv(os.path.join(data_dir, 'dataset_smakom_final.csv'))

    if any(label_col not in frame.columns for frame in (before_df, after_df)):
        # Diagnostics go to stderr so they never mix with normal output.
        print('Expected column "paket_jurusan" not found in one of the files.',
              file=sys.stderr)
        sys.exit(1)

    plot_distribution_scatter(before_df, after_df, label_col=label_col)
|
|
|
|
|
|
# Run the plotting workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|