# Job recommendation system: cosine-similarity k-nearest-neighbors over
# student profiles (GPA, major, skills, interests), evaluated with 5-fold CV.
import warnings
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder

warnings.filterwarnings('ignore')


class JobRecommendationSystem:
    """Content-based job recommender.

    Encodes each student profile (GPA, major, skills, interests) as a
    numeric feature vector and recommends jobs by majority vote among the
    most cosine-similar training profiles.
    """

    def __init__(self):
        # handle_unknown='ignore' makes unseen categories encode to all-zero
        # rows at predict time instead of raising.
        self.skill_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.interest_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.major_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.job_encoder = LabelEncoder()
        self.ipk_scaler = MinMaxScaler()
        self.is_fitted = False

    def preprocess_data(self, df):
        """Return a cleaned copy of *df* for feature engineering.

        Column names are stripped; the free-text feature columns are
        stripped and lowercased so equal values encode identically; the
        'dream job' label is stripped but keeps its original casing.
        """
        data = df.copy()  # avoid mutating the caller's frame
        data.columns = data.columns.str.strip()
        for col in ('keterampilan', 'minat', 'Jurusan'):
            data[col] = data[col].str.strip().str.lower()
        data['dream job'] = data['dream job'].str.strip()
        return data

    def create_features(self, data, fit=False):
        """Build the feature matrix: scaled GPA followed by one-hot
        encodings of skills, interests and major (in that column order).

        Args:
            data: preprocessed DataFrame with columns 'IPK',
                'keterampilan', 'minat' and 'Jurusan'.
            fit: when True, fit the scaler/encoders on *data*;
                otherwise transform only.

        Returns:
            2-D numpy array with one row per student.
        """
        pairs = [
            (self.ipk_scaler, data['IPK']),
            (self.skill_encoder, data['keterampilan']),
            (self.interest_encoder, data['minat']),
            (self.major_encoder, data['Jurusan']),
        ]
        parts = []
        for transformer, column in pairs:
            values = column.values.reshape(-1, 1)
            if fit:
                parts.append(transformer.fit_transform(values))
            else:
                parts.append(transformer.transform(values))
        return np.hstack(parts)

    def fit(self, X_train, y_train):
        """Fit encoders on the training profiles and cache the features.

        Args:
            X_train: DataFrame of preprocessed feature columns.
            y_train: Series of 'dream job' labels aligned with X_train.
        """
        self.train_features = self.create_features(X_train, fit=True)
        # NOTE(review): train_jobs is never read by predict()/predict_top_k()
        # (they use the raw names below); kept for backward compatibility.
        self.train_jobs = self.job_encoder.fit_transform(y_train)
        self.train_job_names = y_train.values
        self.is_fitted = True

    def _neighbor_job_counts(self, sim_scores, k):
        """Tally dream-job labels of the *k* most similar training rows.

        Counter preserves first-seen order, so downstream max()/sorted()
        tie-breaking favors the job encountered closest first.
        """
        top_k_indices = np.argsort(sim_scores)[::-1][:k]
        return Counter(self.train_job_names[idx] for idx in top_k_indices)

    def predict(self, X_test, k=3):
        """Return one recommended job per test row.

        Each row gets the most frequent job among its *k* nearest training
        neighbors (cosine similarity).

        Raises:
            ValueError: if fit() has not been called.
        """
        if not self.is_fitted:
            raise ValueError("Model belum di-fit!")
        test_features = self.create_features(X_test, fit=False)
        similarities = cosine_similarity(test_features, self.train_features)
        predictions = []
        for sim_scores in similarities:
            counts = self._neighbor_job_counts(sim_scores, k)
            predictions.append(max(counts, key=counts.get))
        return predictions

    def predict_top_k(self, X_test, k=3, top_jobs=3):
        """Return up to *top_jobs* jobs per test row, ranked by how often
        each job appears among the row's *k* nearest training neighbors.

        Raises:
            ValueError: if fit() has not been called.
        """
        if not self.is_fitted:
            raise ValueError("Model belum di-fit!")
        test_features = self.create_features(X_test, fit=False)
        similarities = cosine_similarity(test_features, self.train_features)
        predictions = []
        for sim_scores in similarities:
            counts = self._neighbor_job_counts(sim_scores, k)
            ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
            predictions.append([job for job, _ in ranked[:top_jobs]])
        return predictions
def evaluate_model(df, n_splits=5, k_neighbors=5):
    """Run n-fold cross-validation of the recommender.

    For each fold, fits a fresh JobRecommendationSystem and measures exact,
    partial (substring) and top-3 accuracy; prints per-fold results and
    returns the list of per-fold metric dicts.
    """
    preprocessor = JobRecommendationSystem()
    processed = preprocessor.preprocess_data(df)

    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    feature_cols = ['IPK', 'Jurusan', 'keterampilan', 'minat']
    fold_results = []
    banner = "=" * 80

    print(banner)
    print("EVALUASI 5-FOLD CROSS-VALIDATION DENGAN COSINE SIMILARITY")
    print(banner)
    print()

    for fold, (train_idx, test_idx) in enumerate(splitter.split(processed), 1):
        print(f"Processing Fold {fold}...")

        train_data = processed.iloc[train_idx]
        test_data = processed.iloc[test_idx]
        X_train, y_train = train_data[feature_cols], train_data['dream job']
        X_test, y_test = test_data[feature_cols], test_data['dream job']

        recommender = JobRecommendationSystem()
        recommender.fit(X_train, y_train)
        top1 = recommender.predict(X_test, k=k_neighbors)
        top3 = recommender.predict_top_k(X_test, k=k_neighbors, top_jobs=3)

        n_test = len(y_test)

        # Exact match, case-insensitive.
        accuracy = sum(
            p.lower() == a.lower() for p, a in zip(top1, y_test)
        ) / n_test

        # Hit if the true job appears anywhere in the top-3 list.
        top3_accuracy = sum(
            any(p.lower() == a.lower() for p in preds)
            for preds, a in zip(top3, y_test)
        ) / n_test

        # Substring overlap in either direction counts as a partial hit.
        partial_accuracy = sum(
            p.lower() in a.lower() or a.lower() in p.lower()
            for p, a in zip(top1, y_test)
        ) / n_test

        fold_results.append({
            'fold': fold,
            'train_size': len(train_data),
            'test_size': len(test_data),
            'exact_accuracy': accuracy,
            'partial_accuracy': partial_accuracy,
            'top3_accuracy': top3_accuracy,
            'predictions': top1,
            'actual': y_test.tolist(),
        })

        print(f"Fold {fold} Results:")
        print(f" Train Size: {len(train_data)}, Test Size: {len(test_data)}")
        print(f" Exact Accuracy: {accuracy:.3f} ({accuracy*100:.1f}%)")
        print(f" Partial Accuracy: {partial_accuracy:.3f} ({partial_accuracy*100:.1f}%)")
        print(f" Top-3 Accuracy: {top3_accuracy:.3f} ({top3_accuracy*100:.1f}%)")
        print()

    return fold_results
def analyze_results(fold_results):
    """Summarize cross-validation folds.

    Prints mean/std of each metric, a per-fold table, and the most frequent
    misprediction pairs; returns a summary dict (means, stds, raw folds).
    """
    line = "=" * 80
    print(line)
    print("RINGKASAN HASIL 5-FOLD CROSS-VALIDATION")
    print(line)

    exact_accs = [r['exact_accuracy'] for r in fold_results]
    partial_accs = [r['partial_accuracy'] for r in fold_results]
    top3_accs = [r['top3_accuracy'] for r in fold_results]

    print("\nRata-rata Metrics:")
    print(f"Exact Accuracy: {np.mean(exact_accs):.3f} ± {np.std(exact_accs):.3f}")
    print(f"Partial Accuracy: {np.mean(partial_accs):.3f} ± {np.std(partial_accs):.3f}")
    print(f"Top-3 Accuracy: {np.mean(top3_accs):.3f} ± {np.std(top3_accs):.3f}")

    print("\nDetail per Fold:")
    print(f"{'Fold':<6} {'Exact':<8} {'Partial':<8} {'Top-3':<8}")
    print("-" * 35)
    for r in fold_results:
        print(f"{r['fold']:<6} {r['exact_accuracy']:.3f} {r['partial_accuracy']:.3f} {r['top3_accuracy']:.3f}")

    print("\nAnalisis Kesalahan Prediksi:")
    all_predictions = []
    all_actual = []
    for r in fold_results:
        all_predictions.extend(r['predictions'])
        all_actual.extend(r['actual'])

    # Tally mispredictions as "actual → predicted" pairs (case-insensitive match).
    error_counts = {}
    for pred, actual in zip(all_predictions, all_actual):
        if pred.lower() != actual.lower():
            pair = f"'{actual}' → '{pred}'"
            error_counts[pair] = error_counts.get(pair, 0) + 1

    print("\nKesalahan Prediksi Terbanyak:")
    ranked_errors = sorted(error_counts.items(), key=lambda kv: kv[1], reverse=True)
    for pair, count in ranked_errors[:10]:
        print(f" {pair}: {count} kali")

    return {
        'mean_exact_accuracy': np.mean(exact_accs),
        'std_exact_accuracy': np.std(exact_accs),
        'mean_partial_accuracy': np.mean(partial_accs),
        'std_partial_accuracy': np.std(partial_accs),
        'mean_top3_accuracy': np.mean(top3_accs),
        'std_top3_accuracy': np.std(top3_accs),
        'fold_results': fold_results,
    }
def main():
    """Entry point: load student.csv, run 5-fold CV, print a summary."""
    try:
        df = pd.read_csv('student.csv')
    except FileNotFoundError:
        # Guard clause: nothing to evaluate without the dataset.
        print("File student.csv tidak ditemukan!")
        return

    print(f"Data berhasil dimuat: {len(df)} records")
    print(f"Kolom: {list(df.columns)}")
    print()

    fold_results = evaluate_model(df, n_splits=5, k_neighbors=5)
    summary = analyze_results(fold_results)

    bar = "=" * 80
    print("\n" + bar)
    print("KESIMPULAN")
    print(bar)
    print("Model Cosine Similarity dengan 5-fold CV menunjukkan:")
    print(f"• Exact Match Accuracy: {summary['mean_exact_accuracy']:.1%} ± {summary['std_exact_accuracy']:.1%}")
    print(f"• Partial Match Accuracy: {summary['mean_partial_accuracy']:.1%} ± {summary['std_partial_accuracy']:.1%}")
    print(f"• Top-3 Accuracy: {summary['mean_top3_accuracy']:.1%} ± {summary['std_top3_accuracy']:.1%}")
    print()
    print("Metode ini menggunakan kesamaan profil mahasiswa (IPK, jurusan, keterampilan, minat)")
    print("untuk merekomendasikan pekerjaan berdasarkan tetangga terdekat.")


if __name__ == "__main__":
    main()