import warnings
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import KFold

warnings.filterwarnings('ignore')


class JobRecommendationSystem:
    """Content-based job recommender.

    Encodes student profiles (IPK, major, skill, interest) into a numeric
    feature matrix and recommends jobs by majority vote among the k most
    cosine-similar training profiles.
    """

    def __init__(self):
        # handle_unknown='ignore' makes transform() emit an all-zero row for
        # categories unseen during fit instead of raising.
        self.skill_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.interest_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.major_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.job_encoder = LabelEncoder()
        self.ipk_scaler = MinMaxScaler()
        self.is_fitted = False

    def preprocess_data(self, df):
        """Return a cleaned copy of *df* for feature engineering.

        Strips column names, lower-cases the categorical text columns and
        strips the job labels. The input frame is not modified.
        """
        data = df.copy()  # avoid mutating the caller's DataFrame
        data.columns = data.columns.str.strip()
        data['keterampilan'] = data['keterampilan'].str.strip().str.lower()
        data['minat'] = data['minat'].str.strip().str.lower()
        data['Jurusan'] = data['Jurusan'].str.strip().str.lower()
        data['dream job'] = data['dream job'].str.strip()
        return data

    def create_features(self, data, fit=False):
        """Build the feature matrix: scaled IPK + one-hot skill/interest/major.

        With ``fit=True`` the encoders and scaler are (re)fitted on *data*;
        otherwise the previously fitted transformers are applied.
        """
        # Encoders expect 2-D input, hence the reshape to a single column.
        skills = data['keterampilan'].values.reshape(-1, 1)
        interests = data['minat'].values.reshape(-1, 1)
        majors = data['Jurusan'].values.reshape(-1, 1)
        ipk = data['IPK'].values.reshape(-1, 1)

        if fit:
            skill_features = self.skill_encoder.fit_transform(skills)
            interest_features = self.interest_encoder.fit_transform(interests)
            major_features = self.major_encoder.fit_transform(majors)
            ipk_features = self.ipk_scaler.fit_transform(ipk)
        else:
            skill_features = self.skill_encoder.transform(skills)
            interest_features = self.interest_encoder.transform(interests)
            major_features = self.major_encoder.transform(majors)
            ipk_features = self.ipk_scaler.transform(ipk)

        return np.hstack([
            ipk_features,
            skill_features,
            interest_features,
            major_features,
        ])

    def fit(self, X_train, y_train):
        """Fit the feature transformers on *X_train* and memorise the
        training profiles and their job labels."""
        self.train_features = self.create_features(X_train, fit=True)
        self.train_jobs = self.job_encoder.fit_transform(y_train)
        self.train_job_names = y_train.values
        self.is_fitted = True

    def _neighbor_jobs(self, sim_scores, k):
        """Job labels of the k most similar training rows, best first."""
        top_k_indices = np.argsort(sim_scores)[::-1][:k]
        return [self.train_job_names[idx] for idx in top_k_indices]

    def predict(self, X_test, k=3):
        """Predict one job per test row (majority vote over k neighbours).

        Raises ValueError if called before :meth:`fit`.
        """
        if not self.is_fitted:
            raise ValueError("Model belum di-fit!")

        test_features = self.create_features(X_test, fit=False)
        similarities = cosine_similarity(test_features, self.train_features)

        predictions = []
        for sim_scores in similarities:
            neighbor_jobs = self._neighbor_jobs(sim_scores, k)
            # Counter.most_common is stable, so ties resolve to the job seen
            # first among the neighbours — same result as the original
            # max(counts, key=counts.get) over an insertion-ordered dict.
            predictions.append(Counter(neighbor_jobs).most_common(1)[0][0])
        return predictions

    def predict_top_k(self, X_test, k=3, top_jobs=3):
        """Predict up to *top_jobs* jobs per test row, ranked by how often
        each job occurs among the k nearest neighbours.

        Raises ValueError if called before :meth:`fit`.
        """
        if not self.is_fitted:
            raise ValueError("Model belum di-fit!")

        test_features = self.create_features(X_test, fit=False)
        similarities = cosine_similarity(test_features, self.train_features)

        predictions = []
        for sim_scores in similarities:
            neighbor_jobs = self._neighbor_jobs(sim_scores, k)
            ranked = Counter(neighbor_jobs).most_common(top_jobs)
            predictions.append([job for job, _count in ranked])
        return predictions


def evaluate_model(df, n_splits=5, k_neighbors=5):
    """Evaluate the recommender with n-fold cross-validation.

    For each fold, reports exact (top-1), partial (substring) and top-3
    accuracy. Returns the list of per-fold result dicts.
    """
    # Preprocessing is stateless, so a throwaway instance supplies it.
    model = JobRecommendationSystem()
    processed_data = model.preprocess_data(df)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_results = []

    print("=" * 80)
    print("EVALUASI 5-FOLD CROSS-VALIDATION DENGAN COSINE SIMILARITY")
    print("=" * 80)
    print()

    for fold, (train_idx, test_idx) in enumerate(kf.split(processed_data), 1):
        print(f"Processing Fold {fold}...")

        train_data = processed_data.iloc[train_idx]
        test_data = processed_data.iloc[test_idx]
        X_train = train_data[['IPK', 'Jurusan', 'keterampilan', 'minat']]
        y_train = train_data['dream job']
        X_test = test_data[['IPK', 'Jurusan', 'keterampilan', 'minat']]
        y_test = test_data['dream job']

        # A fresh model per fold so no state leaks between folds.
        fold_model = JobRecommendationSystem()
        fold_model.fit(X_train, y_train)
        predictions_top1 = fold_model.predict(X_test, k=k_neighbors)
        predictions_top3 = fold_model.predict_top_k(X_test, k=k_neighbors, top_jobs=3)

        # Exact match: case-insensitive top-1 accuracy.
        exact_matches = sum(1 for pred, actual in zip(predictions_top1, y_test)
                            if pred.lower() == actual.lower())
        accuracy = exact_matches / len(y_test)

        # Top-3 accuracy: the true job appears anywhere in the 3 suggestions.
        top3_matches = 0
        for pred_list, actual in zip(predictions_top3, y_test):
            if any(pred.lower() == actual.lower() for pred in pred_list):
                top3_matches += 1
        top3_accuracy = top3_matches / len(y_test)

        # Partial match: either label is a substring of the other (hit rate).
        partial_matches = 0
        for pred, actual in zip(predictions_top1, y_test):
            pred_lower = pred.lower()
            actual_lower = actual.lower()
            if pred_lower in actual_lower or actual_lower in pred_lower:
                partial_matches += 1
        partial_accuracy = partial_matches / len(y_test)

        fold_results.append({
            'fold': fold,
            'train_size': len(train_data),
            'test_size': len(test_data),
            'exact_accuracy': accuracy,
            'partial_accuracy': partial_accuracy,
            'top3_accuracy': top3_accuracy,
            'predictions': predictions_top1,
            'actual': y_test.tolist(),
        })

        print(f"Fold {fold} Results:")
        print(f" Train Size: {len(train_data)}, Test Size: {len(test_data)}")
        print(f" Exact Accuracy: {accuracy:.3f} ({accuracy*100:.1f}%)")
        print(f" Partial Accuracy: {partial_accuracy:.3f} ({partial_accuracy*100:.1f}%)")
        print(f" Top-3 Accuracy: {top3_accuracy:.3f} ({top3_accuracy*100:.1f}%)")
        print()

    return fold_results


def analyze_results(fold_results):
    """Summarise cross-validation results and print an error analysis.

    Returns a dict with mean/std of each metric plus the raw fold results.
    """
    print("=" * 80)
    print("RINGKASAN HASIL 5-FOLD CROSS-VALIDATION")
    print("=" * 80)

    exact_accs = [result['exact_accuracy'] for result in fold_results]
    partial_accs = [result['partial_accuracy'] for result in fold_results]
    top3_accs = [result['top3_accuracy'] for result in fold_results]

    print(f"\nRata-rata Metrics:")
    print(f"Exact Accuracy: {np.mean(exact_accs):.3f} ± {np.std(exact_accs):.3f}")
    print(f"Partial Accuracy: {np.mean(partial_accs):.3f} ± {np.std(partial_accs):.3f}")
    print(f"Top-3 Accuracy: {np.mean(top3_accs):.3f} ± {np.std(top3_accs):.3f}")

    print(f"\nDetail per Fold:")
    print(f"{'Fold':<6} {'Exact':<8} {'Partial':<8} {'Top-3':<8}")
    print("-" * 35)
    for result in fold_results:
        print(f"{result['fold']:<6} {result['exact_accuracy']:.3f} {result['partial_accuracy']:.3f} {result['top3_accuracy']:.3f}")

    # Pool predictions from every fold for the error analysis.
    print(f"\nAnalisis Kesalahan Prediksi:")
    all_predictions = []
    all_actual = []
    for result in fold_results:
        all_predictions.extend(result['predictions'])
        all_actual.extend(result['actual'])

    # Count each wrong (actual -> predicted) pair across all folds.
    error_analysis = {}
    for pred, actual in zip(all_predictions, all_actual):
        if pred.lower() != actual.lower():
            key = f"'{actual}' → '{pred}'"
            error_analysis[key] = error_analysis.get(key, 0) + 1

    print("\nKesalahan Prediksi Terbanyak:")
    sorted_errors = sorted(error_analysis.items(), key=lambda x: x[1], reverse=True)
    for error, count in sorted_errors[:10]:
        print(f" {error}: {count} kali")

    return {
        'mean_exact_accuracy': np.mean(exact_accs),
        'std_exact_accuracy': np.std(exact_accs),
        'mean_partial_accuracy': np.mean(partial_accs),
        'std_partial_accuracy': np.std(partial_accs),
        'mean_top3_accuracy': np.mean(top3_accs),
        'std_top3_accuracy': np.std(top3_accs),
        'fold_results': fold_results,
    }


def main():
    """Entry point: load student.csv, run the CV evaluation, print a summary."""
    try:
        df = pd.read_csv('student.csv')
        print(f"Data berhasil dimuat: {len(df)} records")
        print(f"Kolom: {list(df.columns)}")
        print()
    except FileNotFoundError:
        print("File student.csv tidak ditemukan!")
        return

    fold_results = evaluate_model(df, n_splits=5, k_neighbors=5)
    summary = analyze_results(fold_results)

    print("\n" + "=" * 80)
    print("KESIMPULAN")
    print("=" * 80)
    print(f"Model Cosine Similarity dengan 5-fold CV menunjukkan:")
    print(f"• Exact Match Accuracy: {summary['mean_exact_accuracy']:.1%} ± {summary['std_exact_accuracy']:.1%}")
    print(f"• Partial Match Accuracy: {summary['mean_partial_accuracy']:.1%} ± {summary['std_partial_accuracy']:.1%}")
    print(f"• Top-3 Accuracy: {summary['mean_top3_accuracy']:.1%} ± {summary['std_top3_accuracy']:.1%}")
    print()
    print("Metode ini menggunakan kesamaan profil mahasiswa (IPK, jurusan, keterampilan, minat)")
    print("untuk merekomendasikan pekerjaan berdasarkan tetangga terdekat.")


if __name__ == "__main__":
    main()