"""Train and evaluate an RBF-kernel SVM sentiment classifier for Indonesian text.

Running this module as a script preprocesses a labelled CSV, grid-searches
``C``/``gamma``, reports stratified k-fold metrics, trains a final model on
all data and writes the model (pickle) and metrics (JSON) to disk.
"""

import json
import os
import pickle
import re
import warnings

import nltk
import numpy as np
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.svm import SVC

warnings.filterwarnings('ignore')
nltk.data.path.append(os.path.join(os.getcwd(), "nltk_data"))

# Download NLTK requirements
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Number of folds used by the stratified cross-validation in main().
N_FOLDS = 5

# Patterns used by SentimentAnalyzer.text_cleaning, compiled once.
_URL_RE = re.compile(r'http\S+|www\S+')
_EMAIL_RE = re.compile(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+')
_MENTION_HASHTAG_RE = re.compile(r'@\w+|#\w+')
_STANDALONE_NUMBER_RE = re.compile(r'\b\d+\b')
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_REPEATED_CHAR_RE = re.compile(r'(.)\1{2,}')
_WHITESPACE_RE = re.compile(r'\s+')


def _build_vectorizer():
    """Return a fresh TF-IDF vectorizer with the configuration used everywhere in this file."""
    return TfidfVectorizer(
        max_features=5000,
        ngram_range=(1, 2),
        lowercase=True,
        strip_accents='unicode',
    )


class SentimentAnalyzer:
    """Preprocessing pipeline plus TF-IDF/SVM model for binary sentiment (0 = negative, 1 = positive)."""

    def __init__(self):
        """Create the stemmer, stopword set, vectorizer, SVM and normalization dictionary."""
        # Indonesian stemmer
        factory = StemmerFactory()
        self.stemmer = factory.create_stemmer()

        # Indonesian stopwords
        stop_factory = StopWordRemoverFactory()
        self.stop_words = set(stop_factory.get_stop_words())

        # Additional informal/slang stopwords
        additional_stopwords = {
            'yg', 'dgn', 'nya', 'kalo', 'kalau', 'udah', 'udh', 'dah', 'lg', 'lagi',
            'banget', 'bgt', 'emang', 'memang', 'sih', 'aja', 'doang', 'nih', 'nah',
            'lah', 'deh', 'dong', 'kok', 'ya', 'yah', 'wkwk', 'haha', 'hihi', 'huhu',
            'hehe',
        }
        self.stop_words.update(additional_stopwords)

        # Keep negation words out of the stopword set so they are not discarded
        # (they carry the meaning of phrases such as "tidak bagus", "bukan salah").
        negation_words = {
            'tidak', 'bukan', 'belum', 'jangan', 'tak', 'tanpa', 'kurang', 'jarang',
            'hampir', 'nyaris',
        }
        self.stop_words -= negation_words

        # TF-IDF Vectorizer
        self.vectorizer = _build_vectorizer()

        # SVM Model (RBF kernel)
        self.model = SVC(kernel='rbf', C=1.0, gamma=1, probability=True)

        # Normalization dictionary: informal/abbreviated token -> standard form.
        # NOTE: lookups are per single token after tokenizing(), which drops
        # one-character and non-alphabetic tokens, so keys such as 'g', 'w',
        # 'u', 'jgn2' and 'the best' are never matched by the current pipeline.
        self.normalization_dict = {
            'yg': 'yang', 'dgn': 'dengan', 'krn': 'karena', 'krna': 'karena',
            'tp': 'tapi', 'tpi': 'tapi', 'gk': 'tidak', 'ga': 'tidak',
            'gak': 'tidak', 'ngga': 'tidak', 'nggak': 'tidak', 'g': 'tidak',
            'tdk': 'tidak', 'gitu': 'begitu', 'gt': 'begitu', 'gmn': 'bagaimana',
            'gimana': 'bagaimana', 'dmn': 'dimana', 'kmn': 'kemana', 'knp': 'kenapa',
            'knapa': 'kenapa', 'org': 'orang', 'orng': 'orang', 'tmn': 'teman',
            'temen': 'teman', 'bgmn': 'bagaimana', 'bgt': 'banget', 'banget': 'sangat',
            'bener': 'benar', 'bnr': 'benar', 'bnyk': 'banyak', 'bnyak': 'banyak',
            'udh': 'sudah', 'udah': 'sudah', 'dah': 'sudah', 'telah': 'sudah',
            'blm': 'belum', 'blom': 'belum', 'msh': 'masih', 'msih': 'masih',
            'lg': 'lagi', 'lgi': 'lagi', 'skrg': 'sekarang', 'skrang': 'sekarang',
            'skg': 'sekarang', 'nanti': 'nanti', 'ntar': 'nanti', 'tar': 'nanti',
            'bsk': 'besok', 'besok': 'besok', 'kmrn': 'kemarin', 'kmarin': 'kemarin',
            'hrs': 'harus', 'kudu': 'harus', 'mesti': 'harus', 'bs': 'bisa',
            'bsa': 'bisa', 'isa': 'bisa', 'biar': 'agar', 'spy': 'agar',
            'supaya': 'agar', 'kalo': 'kalau', 'klo': 'kalau', 'jd': 'jadi',
            'jadi': 'menjadi', 'jdnya': 'jadinya', 'jadinya': 'akhirnya', 'jg': 'juga',
            'jga': 'juga', 'jgn': 'jangan', 'jngn': 'jangan', 'jgn2': 'jangan-jangan',
            'aj': 'saja', 'aja': 'saja', 'doang': 'saja', 'aje': 'saja',
            'cm': 'cuma', 'cuma': 'hanya', 'cman': 'hanya', 'ckp': 'cukup',
            'cukup': 'cukup', 'krg': 'kurang', 'kurang': 'kurang', 'emg': 'memang',
            'emang': 'memang', 'mmg': 'memang', 'sbnrnya': 'sebenarnya',
            'sbenernya': 'sebenarnya', 'pdhl': 'padahal', 'pdahal': 'padahal',
            'wlpn': 'walaupun', 'walaupun': 'walaupun', 'meskipun': 'walaupun',
            'walau': 'walaupun', 'aplg': 'apalagi', 'apalagi': 'apalagi',
            'mgkn': 'mungkin', 'mungkin': 'mungkin', 'mgkin': 'mungkin',
            'kyknya': 'kayaknya', 'kyaknya': 'kayaknya', 'kayaknya': 'sepertinya',
            'kyk': 'seperti', 'kayak': 'seperti', 'ky': 'seperti', 'sprt': 'seperti',
            'kaya': 'seperti', 'sy': 'saya', 'gw': 'saya', 'gue': 'saya',
            'gua': 'saya', 'w': 'saya', 'aku': 'saya', 'ak': 'saya',
            'km': 'kamu', 'kmu': 'kamu', 'lu': 'kamu', 'lo': 'kamu',
            'elu': 'kamu', 'elo': 'kamu', 'u': 'kamu', 'dy': 'dia',
            'dia': 'dia', 'mrk': 'mereka', 'mreka': 'mereka', 'tololl': 'bodoh',
            'tolol': 'bodoh', 'qt': 'kita', 'qta': 'kita', 'seneng': 'senang',
            'suka': 'suka', 'sk': 'suka', 'kesel': 'kesal', 'binun': 'bingung',
            'males': 'malas', 'capek': 'capek', 'cape': 'capek', 'lelah': 'lelah',
            'tired': 'lelah', 'stress': 'stres', 'mantul': 'mantap', 'keren': 'keren',
            'gokil': 'keren', 'ajib': 'keren', 'top': 'bagus', 'the best': 'terbaik',
            'terbaik': 'terbaik', 'terburuk': 'terburuk', 'worst': 'terburuk',
            'best': 'terbaik', 'good': 'bagus', 'bad': 'buruk', 'nice': 'bagus',
            'awesome': 'keren', 'amazing': 'menakjubkan', 'terrible': 'buruk',
            'horrible': 'mengerikan', 'excellent': 'sangat bagus', 'perfect': 'sempurna',
            'ok': 'baik', 'oke': 'baik', 'okay': 'baik', 'fine': 'baik',
            'standard': 'standar', 'ajg': 'anjing', 'anjg': 'anjing', 'tw': 'tau',
            'kek': 'seperti',
        }

    def text_cleaning(self, text):
        """Strip URLs, e-mail addresses, mentions/hashtags, standalone numbers and punctuation.

        Returns an empty string for missing values (``pd.isna``). Runs of three
        or more identical characters are collapsed to two, and whitespace is
        normalized to single spaces.
        """
        if pd.isna(text):
            return ""
        text = str(text)
        text = _URL_RE.sub('', text)
        # E-mail addresses must be removed BEFORE mentions: otherwise the
        # mention pattern eats "@domain" and leaves the local part and TLD
        # behind as noise tokens.
        text = _EMAIL_RE.sub('', text)
        text = _MENTION_HASHTAG_RE.sub('', text)
        text = _STANDALONE_NUMBER_RE.sub('', text)
        text = _PUNCTUATION_RE.sub(' ', text)
        text = _REPEATED_CHAR_RE.sub(r'\1\1', text)
        return _WHITESPACE_RE.sub(' ', text).strip()

    def case_folding(self, text):
        """Return ``text`` lower-cased."""
        return text.lower()

    def tokenizing(self, text):
        """Split on whitespace, keeping only alphabetic tokens longer than one character."""
        return [token for token in text.split() if len(token) > 1 and token.isalpha()]

    def remove_stopwords(self, tokens):
        """Drop tokens that are stopwords or have two characters or fewer."""
        return [word for word in tokens if word not in self.stop_words and len(word) > 2]

    def normalization(self, tokens):
        """Map each token through the normalization dictionary (single pass, unknown tokens unchanged)."""
        lookup = self.normalization_dict
        return [lookup.get(token, token) for token in tokens]

    def stemming(self, tokens):
        """Stem the tokens with the Sastrawi stemmer and return the resulting token list."""
        stemmed_text = self.stemmer.stem(' '.join(tokens))
        return stemmed_text.split()

    def preprocess_text(self, text, show_steps=False):
        """Run the full preprocessing pipeline on one text.

        Order: 1. cleaning, 2. case folding, 3. tokenizing, 4. normalization,
        5. stopword removal, 6. stemming.

        Returns the final space-joined string, or ``(final_text, steps)`` when
        ``show_steps`` is true, where ``steps`` holds every intermediate result.
        """
        cleaned = self.text_cleaning(text)
        casefolded = self.case_folding(cleaned)
        tokens = self.tokenizing(casefolded)
        normalized = self.normalization(tokens)
        no_stopwords = self.remove_stopwords(normalized)
        stemmed = self.stemming(no_stopwords)
        final_text = ' '.join(stemmed)

        if not show_steps:
            return final_text

        steps = {
            'cleaned': cleaned,
            'casefolded': casefolded,
            'tokenized': tokens,
            'normalized': normalized,
            'no_stopwords': no_stopwords,
            'stemmed': stemmed,
            'original': text,
            'final': final_text,
        }
        return final_text, steps

    def load_and_preprocess_data(self, filepath):
        """Read a CSV with ``text`` and ``sentiment`` columns and add ``processed_text``.

        Falls back to latin-1 when the file is not valid UTF-8. Rows whose
        processed text is empty are dropped; ``sentiment`` is cast to int.
        """
        print(f"Loading dataset from {filepath}...")
        try:
            df = pd.read_csv(filepath, encoding='utf-8')
        except UnicodeDecodeError:
            print("⚠ UTF-8 gagal, mencoba encoding latin-1...")
            df = pd.read_csv(filepath, encoding='latin-1')

        print("Preprocessing texts...")
        df['processed_text'] = df['text'].apply(self.preprocess_text)
        # .copy() so the column assignment below targets an independent frame
        # instead of a filtered view (chained-assignment hazard).
        df = df[df['processed_text'].str.len() > 0].copy()
        df['sentiment'] = df['sentiment'].astype(int)
        print("Preprocessing complete.")
        return df

    def print_confusion_matrix(self, y_test, y_pred, title="Confusion Matrix"):
        """Print the binary confusion matrix (labels 0 and 1) in a readable layout."""
        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()

        print(f"\n{title}")
        print("="*60)
        print("\nDetail Metrik dari Confusion Matrix:")
        print(f" * True Positive (TP) : {tp:<5} (Prediksi: Positif, Aktual: Positif)")
        print(f" * True Negative (TN) : {tn:<5} (Prediksi: Negatif, Aktual: Negatif)")
        print(f" * False Positive (FP) : {fp:<5} (Prediksi: Positif, Aktual: Negatif) -> Error Tipe I")
        print(f" * False Negative (FN) : {fn:<5} (Prediksi: Negatif, Aktual: Positif) -> Error Tipe II")

        print("\nMatriks Konfusi (Visual):")
        print(" Prediksi Negatif | Prediksi Positif")
        print("---------------------------------------------------------")
        print(f"Aktual Negatif (0) | {tn:<10} | {fp:<10}")
        print(f"Aktual Positif (1) | {fn:<10} | {tp:<10}")
        print("---------------------------------------------------------")

    def train_and_evaluate_model(self, df):
        """Fit the vectorizer and SVM on an 80/20 stratified split and return test accuracy."""
        print("\n" + "="*60)
        print("TRAINING MODEL DENGAN DATA ORIGINAL")
        print("="*60)

        X = df['processed_text']
        y = df['sentiment']

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # TF-IDF vectorization: fit on the training part only
        X_train_tfidf = self.vectorizer.fit_transform(X_train)
        X_test_tfidf = self.vectorizer.transform(X_test)

        # Train SVM
        print("Training SVM model...")
        self.model.fit(X_train_tfidf, y_train)
        print("✓ Training selesai!")

        # Evaluate
        y_pred = self.model.predict(X_test_tfidf)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"\nModel Accuracy: {accuracy:.4f}")
        self.print_confusion_matrix(y_test, y_pred, "Confusion Matrix")
        return accuracy

    def predict_sentiment(self, text):
        """Classify one raw text.

        Returns a dict with ``sentiment`` ("Positif"/"Negatif", or
        "Tidak dapat menentukan" when nothing is left after preprocessing),
        ``confidence`` and the two class probabilities as plain floats.
        """
        processed_text = self.preprocess_text(text)
        if not processed_text.strip():
            return {
                'sentiment': 'Tidak dapat menentukan',
                'confidence': 0.0,
                'probability_negative': 0.5,
                'probability_positive': 0.5,
            }

        text_tfidf = self.vectorizer.transform([processed_text])
        prediction = self.model.predict(text_tfidf)[0]
        probability = self.model.predict_proba(text_tfidf)[0]

        # Look probabilities up by class label instead of assuming that
        # column 0 is class 0 and column 1 is class 1.
        proba_by_class = dict(zip(self.model.classes_, probability))

        return {
            'sentiment': "Positif" if prediction == 1 else "Negatif",
            'confidence': float(max(probability)),
            'probability_negative': float(proba_by_class.get(0, 0.0)),
            'probability_positive': float(proba_by_class.get(1, 0.0)),
        }

    def save_model(self, filepath='sentiment_model.pkl'):
        """Pickle the model, vectorizer, stemmer, stopwords and normalization dictionary."""
        model_data = {
            'model': self.model,
            'vectorizer': self.vectorizer,
            'stemmer': self.stemmer,
            'stop_words': self.stop_words,
            'normalization_dict': self.normalization_dict,
        }
        with open(filepath, 'wb') as f:
            pickle.dump(model_data, f)
        print(f"\n✓ Model saved to {filepath}")

    def load_model(self, filepath='sentiment_model.pkl'):
        """Restore the state written by ``save_model``.

        SECURITY: unpickling executes arbitrary code contained in the file;
        only load model files from a trusted source.
        """
        with open(filepath, 'rb') as f:
            model_data = pickle.load(f)

        self.model = model_data['model']
        self.vectorizer = model_data['vectorizer']
        self.stemmer = model_data['stemmer']
        self.stop_words = model_data['stop_words']
        self.normalization_dict = model_data['normalization_dict']
        print(f"✓ Model loaded from {filepath}")


def _print_cm_table(tn, fp, fn, tp):
    """Print the 2x2 confusion-matrix table used by the fold and aggregated reports."""
    print(f" {'':<22} | {'Pred Negatif':<14} | {'Pred Positif':<14}")
    print(" " + "-" * 57)
    print(f" {'Aktual Negatif (0)':<22} | {tn:<14} | {fp:<14}")
    print(f" {'Aktual Positif (1)':<22} | {fn:<14} | {tp:<14}")
    print(" " + "-" * 57)


def _print_class_table(neg, pos, weighted, total):
    """Print the per-class metric table.

    ``neg``/``pos`` are ``(precision, recall, f1, support)`` tuples and
    ``weighted`` is ``(precision, recall, f1)``.
    """
    rule = f" {'-'*12}-+-{'-'*12}-+-{'-'*12}-+-{'-'*12}-+-{'-'*8}"
    print(f" {'Kelas':<12} | {'Precision':>12} | {'Recall':>12} | {'F1-Score':>12} | {'Support':>8}")
    print(rule)
    for label, (prec, rec, f1, support) in (('Negatif', neg), ('Positif', pos)):
        print(f" {label:<12} | {prec*100:>11.2f}% | {rec*100:>11.2f}% | {f1*100:>11.2f}% | {support:>8}")
    print(rule)
    prec_w, rec_w, f1_w = weighted
    print(f" {'Weighted Avg':<12} | {prec_w*100:>11.2f}% | {rec_w*100:>11.2f}% | {f1_w*100:>11.2f}% | {total:>8}")


def _print_fold_weighted(neg, pos, sup_neg, sup_pos, total, weighted):
    """Print the support-weighted average derivation for one metric of one fold."""
    print(f" Weighted Avg = ({neg:.6f}×{sup_neg} + {pos:.6f}×{sup_pos}) / {total}")
    # Parentheses added around the sum: the whole sum is divided by the total.
    print(f" = ({neg*sup_neg:.4f} + {pos*sup_pos:.4f}) / {total}")
    print(f" = {weighted*100:.2f}%")


def _print_agg_weighted(prefix, neg, pos, sup_neg, sup_pos, total, weighted):
    """Print the support-weighted average derivation for one aggregated metric."""
    print(" Weighted Avg:")
    print(f" = ({prefix}_Neg × Sup_Neg + {prefix}_Pos × Sup_Pos) / Total")
    print(f" = ({neg:.6f} × {sup_neg} + {pos:.6f} × {sup_pos}) / {total}")
    print(f" = ({neg*sup_neg:.4f} + {pos*sup_pos:.4f}) / {total}")
    print(f" = {neg*sup_neg + pos*sup_pos:.4f} / {total}")
    print(f" = {weighted*100:.2f}%")


def _run_grid_search(texts, labels):
    """Grid-search ``C`` and ``gamma`` with 3-fold CV on an 80% stratified split.

    Returns ``(best_C, best_gamma, fitted_vectorizer)``.
    """
    print("\n" + "="*60)
    print(" GRID SEARCH - PENCARIAN C & GAMMA TERBAIK")
    print("="*60)

    # Temporary 80:20 split used only for the grid search; the 20% part is
    # intentionally left unused here.
    X_gs_train, _, y_gs_train, _ = train_test_split(
        texts, labels, test_size=0.2, random_state=42, stratify=labels
    )

    # NOTE(review): the vectorizer is fitted on the whole grid-search training
    # split before the inner 3-fold CV, so IDF statistics leak between the
    # inner folds. Left as-is to keep the selected parameters unchanged.
    gs_vectorizer = _build_vectorizer()
    X_gs_tfidf = gs_vectorizer.fit_transform(X_gs_train)

    param_grid = {
        'C': [0.1, 1, 10, 100],
        'gamma': [0.001, 0.01, 0.1, 1],
    }

    print("Parameter yang diuji:")
    print(f" C : {param_grid['C']}")
    print(f" gamma : {param_grid['gamma']}")
    print(" CV : 3-fold (untuk efisiensi)")
    print(f" Total kombinasi: {len(param_grid['C']) * len(param_grid['gamma'])} kombinasi × 3 fold")
    print("\nProses grid search sedang berjalan, harap tunggu...")

    # probability=True is not needed for accuracy scoring: predict() does not
    # use the probability calibration, which only adds an internal CV per fit.
    grid_search = GridSearchCV(
        SVC(kernel='rbf'),
        param_grid,
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_gs_tfidf, y_gs_train)

    best_C = grid_search.best_params_['C']
    best_gamma = grid_search.best_params_['gamma']
    best_cv_score = grid_search.best_score_

    print("\n✓ Grid Search selesai!")
    print(f"{'─'*45}")
    print(" Hasil Grid Search:")
    print(f" Best C : {best_C}")
    print(f" Best gamma : {best_gamma}")
    print(f" Best CV Score : {best_cv_score*100:.2f}% (rata-rata 3-fold)")
    print(f"{'─'*45}")

    # Table of every tested combination
    print("\n Ringkasan seluruh kombinasi:")
    print(f" {'C':<8} | {'gamma':<8} | {'CV Accuracy':>12}")
    print(f" {'-'*8}-+-{'-'*8}-+-{'-'*12}")
    gs_results = grid_search.cv_results_
    for c_val, g_val, score in zip(
            gs_results['param_C'],
            gs_results['param_gamma'],
            gs_results['mean_test_score']):
        marker = " ◄ TERBAIK" if (c_val == best_C and g_val == best_gamma) else ""
        print(f" {str(c_val):<8} | {str(g_val):<8} | {score*100:>11.2f}%{marker}")
    print(f"{'─'*45}")

    return best_C, best_gamma, gs_vectorizer


def _evaluate_fold(fold_idx, X_train, X_test, y_train, y_test, best_C, best_gamma):
    """Train and evaluate one cross-validation fold, print its report, return its metrics dict."""
    print("\n\n" + "#"*70)
    print(f" FOLD {fold_idx} / {N_FOLDS}")
    print("#"*70)

    print(f"Jumlah Data Training : {len(X_train)}")
    print(f"Jumlah Data Testing : {len(X_test)}")

    # TF-IDF is fitted ONLY on this fold's training data
    fold_vectorizer = _build_vectorizer()
    X_train_tfidf = fold_vectorizer.fit_transform(X_train)
    X_test_tfidf = fold_vectorizer.transform(X_test)

    # Only predict() is used below, so probability calibration is skipped.
    fold_model = SVC(kernel='rbf', C=best_C, gamma=best_gamma)
    fold_model.fit(X_train_tfidf, y_train)
    y_pred = fold_model.predict(X_test_tfidf)

    acc = accuracy_score(y_test, y_pred)
    # labels=[0, 1] keeps the matrix 2x2 (and the report keyed by both class
    # names) even if a class is absent from this fold's labels/predictions.
    report = classification_report(
        y_test, y_pred,
        labels=[0, 1],
        target_names=['Negatif', 'Positif'],
        output_dict=True
    )
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = (int(v) for v in cm.ravel())
    total = tp + tn + fp + fn

    print(f"\nConfusion Matrix (Fold {fold_idx}):")
    _print_cm_table(tn, fp, fn, tp)
    print(f" TP={tp}, TN={tn}, FP={fp}, FN={fn}")
    print(f" Total data uji = TP+TN+FP+FN = {tp}+{tn}+{fp}+{fn} = {total}")

    # Per-class values
    prec_neg = report['Negatif']['precision']
    rec_neg = report['Negatif']['recall']
    f1_neg = report['Negatif']['f1-score']
    sup_neg = int(report['Negatif']['support'])

    prec_pos = report['Positif']['precision']
    rec_pos = report['Positif']['recall']
    f1_pos = report['Positif']['f1-score']
    sup_pos = int(report['Positif']['support'])

    prec_w = report['weighted avg']['precision']
    rec_w = report['weighted avg']['recall']
    f1_w = report['weighted avg']['f1-score']

    # Step-by-step derivation of every metric
    print(f"\n{'─'*60}")
    print(f" PERHITUNGAN METRIK EVALUASI (Fold {fold_idx})")
    print(f"{'─'*60}")

    print("\n▶ PRECISION")
    print(" Rumus : TP / (TP + FP) [per kelas]")
    print(f" Negatif: TN / (TN + FN) = {tn} / ({tn}+{fn}) = {tn}/{tn+fn} = {prec_neg*100:.2f}%")
    print(f" Positif: TP / (TP + FP) = {tp} / ({tp}+{fp}) = {tp}/{tp+fp} = {prec_pos*100:.2f}%")
    _print_fold_weighted(prec_neg, prec_pos, sup_neg, sup_pos, total, prec_w)

    print("\n▶ RECALL")
    print(" Rumus : TP / (TP + FN) [per kelas]")
    print(f" Negatif: TN / (TN + FP) = {tn} / ({tn}+{fp}) = {tn}/{tn+fp} = {rec_neg*100:.2f}%")
    print(f" Positif: TP / (TP + FN) = {tp} / ({tp}+{fn}) = {tp}/{tp+fn} = {rec_pos*100:.2f}%")
    _print_fold_weighted(rec_neg, rec_pos, sup_neg, sup_pos, total, rec_w)

    print("\n▶ F1-SCORE")
    print(" Rumus : 2 × (Precision × Recall) / (Precision + Recall) [per kelas]")
    print(f" Negatif: 2×({prec_neg:.6f}×{rec_neg:.6f}) / ({prec_neg:.6f}+{rec_neg:.6f})")
    print(f" = 2×{prec_neg*rec_neg:.6f} / {prec_neg+rec_neg:.6f} = {f1_neg*100:.2f}%")
    print(f" Positif: 2×({prec_pos:.6f}×{rec_pos:.6f}) / ({prec_pos:.6f}+{rec_pos:.6f})")
    print(f" = 2×{prec_pos*rec_pos:.6f} / {prec_pos+rec_pos:.6f} = {f1_pos*100:.2f}%")
    _print_fold_weighted(f1_neg, f1_pos, sup_neg, sup_pos, total, f1_w)

    print("\n▶ ACCURACY")
    print(" Rumus : (TP + TN) / (TP + TN + FP + FN)")
    print(f" = ({tp} + {tn}) / ({tp}+{tn}+{fp}+{fn})")
    print(f" = {tp+tn} / {total}")
    print(f" = {acc*100:.2f}%")

    print(f"\n{'─'*60}")
    print(f" RINGKASAN HASIL AKHIR (Fold {fold_idx})")
    print(f"{'─'*60}")
    _print_class_table(
        (prec_neg, rec_neg, f1_neg, sup_neg),
        (prec_pos, rec_pos, f1_pos, sup_pos),
        (prec_w, rec_w, f1_w),
        total,
    )
    print(f"\n Accuracy: {acc*100:.2f}%")
    print(f"{'─'*60}")

    return {
        'Fold': fold_idx,
        'Accuracy': acc,
        'Precision': prec_w,
        'Recall': rec_w,
        'F1-Score': f1_w,
        'TP': tp, 'TN': tn, 'FP': fp, 'FN': fn,
    }


def _summarize_folds(fold_results):
    """Print the per-fold summary table and return the averaged metrics."""
    rule = f" {'-'*6}-+-{'-'*10}-+-{'-'*10}-+-{'-'*10}-+-{'-'*10}"

    print("\n\n" + "="*70)
    print(" RINGKASAN PERFORMA SEMUA FOLD K-FOLD CROSS VALIDATION")
    print("="*70)
    print(f" {'Fold':<6} | {'Accuracy':>10} | {'Precision':>10} | {'Recall':>10} | {'F1-Score':>10}")
    print(rule)
    for r in fold_results:
        print(f" {r['Fold']:<6} | {r['Accuracy']*100:>9.2f}% | {r['Precision']*100:>9.2f}% | {r['Recall']*100:>9.2f}% | {r['F1-Score']*100:>9.2f}%")
    print(rule)

    accuracies = [r['Accuracy'] for r in fold_results]
    avg_acc = np.mean(accuracies)
    avg_prec = np.mean([r['Precision'] for r in fold_results])
    avg_rec = np.mean([r['Recall'] for r in fold_results])
    avg_f1 = np.mean([r['F1-Score'] for r in fold_results])
    std_acc = np.std(accuracies)

    print(f" {'Rata-rata':<6} | {avg_acc*100:>9.2f}% | {avg_prec*100:>9.2f}% | {avg_rec*100:>9.2f}% | {avg_f1*100:>9.2f}%")
    print(f" {'Std Dev':<6} | {std_acc*100:>9.2f}% | {'':>10} | {'':>10} | {'':>10}")
    print("="*70)

    return {
        'accuracy': avg_acc,
        'precision': avg_prec,
        'recall': avg_rec,
        'f1_score': avg_f1,
        'std_accuracy': std_acc,
    }


def _print_aggregated_report(fold_results):
    """Print metrics derived from the confusion matrix summed over all folds."""
    agg_tp = sum(r['TP'] for r in fold_results)
    agg_tn = sum(r['TN'] for r in fold_results)
    agg_fp = sum(r['FP'] for r in fold_results)
    agg_fn = sum(r['FN'] for r in fold_results)
    agg_total = agg_tp + agg_tn + agg_fp + agg_fn

    # Support per class (all actual samples of the class across folds)
    agg_sup_neg = agg_tn + agg_fp
    agg_sup_pos = agg_tp + agg_fn

    # Per-class metrics from the aggregated matrix
    agg_prec_neg = agg_tn / (agg_tn + agg_fn) if (agg_tn + agg_fn) > 0 else 0
    agg_prec_pos = agg_tp / (agg_tp + agg_fp) if (agg_tp + agg_fp) > 0 else 0
    agg_rec_neg = agg_tn / (agg_tn + agg_fp) if (agg_tn + agg_fp) > 0 else 0
    agg_rec_pos = agg_tp / (agg_tp + agg_fn) if (agg_tp + agg_fn) > 0 else 0
    agg_f1_neg = (2 * agg_prec_neg * agg_rec_neg / (agg_prec_neg + agg_rec_neg)
                  if (agg_prec_neg + agg_rec_neg) > 0 else 0)
    agg_f1_pos = (2 * agg_prec_pos * agg_rec_pos / (agg_prec_pos + agg_rec_pos)
                  if (agg_prec_pos + agg_rec_pos) > 0 else 0)

    # Support-weighted averages
    agg_prec_w = (agg_prec_neg * agg_sup_neg + agg_prec_pos * agg_sup_pos) / agg_total
    agg_rec_w = (agg_rec_neg * agg_sup_neg + agg_rec_pos * agg_sup_pos) / agg_total
    agg_f1_w = (agg_f1_neg * agg_sup_neg + agg_f1_pos * agg_sup_pos) / agg_total
    agg_acc = (agg_tp + agg_tn) / agg_total

    print("\n\n" + "="*70)
    print(" CONFUSION MATRIX GABUNGAN SEMUA FOLD (AGGREGATED)")
    print("="*70)
    print(f" (Merupakan jumlah TP, TN, FP, FN dari seluruh {N_FOLDS} fold)")
    print()
    _print_cm_table(agg_tn, agg_fp, agg_fn, agg_tp)
    print(f" TP = {agg_tp}, TN = {agg_tn}, FP = {agg_fp}, FN = {agg_fn}")
    print(f" Total keseluruhan = TP+TN+FP+FN = {agg_tp}+{agg_tn}+{agg_fp}+{agg_fn} = {agg_total}")
    print()
    print(" Asal nilai:")
    for r in fold_results:
        print(f" Fold {r['Fold']}: TP={r['TP']}, TN={r['TN']}, FP={r['FP']}, FN={r['FN']}")
    print(f" {'─'*50}")
    print(f" Total : TP={agg_tp}, TN={agg_tn}, FP={agg_fp}, FN={agg_fn}")

    print(f"\n{'─'*70}")
    print(" PERHITUNGAN METRIK DARI CONFUSION MATRIX GABUNGAN")
    print(f"{'─'*70}")

    print("\n▶ PRECISION (per kelas dari CM gabungan)")
    print(" Rumus : Prediksi benar kelas X / semua prediksi kelas X")
    print(f" Negatif: TN / (TN + FN) = {agg_tn} / ({agg_tn}+{agg_fn}) = {agg_tn}/{agg_tn+agg_fn} = {agg_prec_neg*100:.2f}%")
    print(f" Positif: TP / (TP + FP) = {agg_tp} / ({agg_tp}+{agg_fp}) = {agg_tp}/{agg_tp+agg_fp} = {agg_prec_pos*100:.2f}%")
    _print_agg_weighted('Prec', agg_prec_neg, agg_prec_pos, agg_sup_neg, agg_sup_pos, agg_total, agg_prec_w)

    print("\n▶ RECALL (per kelas dari CM gabungan)")
    print(" Rumus : Prediksi benar kelas X / semua data aktual kelas X")
    print(f" Negatif: TN / (TN + FP) = {agg_tn} / ({agg_tn}+{agg_fp}) = {agg_tn}/{agg_tn+agg_fp} = {agg_rec_neg*100:.2f}%")
    print(f" Positif: TP / (TP + FN) = {agg_tp} / ({agg_tp}+{agg_fn}) = {agg_tp}/{agg_tp+agg_fn} = {agg_rec_pos*100:.2f}%")
    _print_agg_weighted('Rec', agg_rec_neg, agg_rec_pos, agg_sup_neg, agg_sup_pos, agg_total, agg_rec_w)

    print("\n▶ F1-SCORE (per kelas dari CM gabungan)")
    print(" Rumus : 2 × (Precision × Recall) / (Precision + Recall) [per kelas]")
    for label, prec, rec, f1 in (
            ('Negatif', agg_prec_neg, agg_rec_neg, agg_f1_neg),
            ('Positif', agg_prec_pos, agg_rec_pos, agg_f1_pos)):
        print(f" {label}:")
        print(f" = 2 × ({prec:.6f} × {rec:.6f}) / ({prec:.6f} + {rec:.6f})")
        print(f" = 2 × {prec*rec:.6f} / {prec+rec:.6f}")
        print(f" = {2*prec*rec:.6f} / {prec+rec:.6f}")
        print(f" = {f1*100:.2f}%")
    _print_agg_weighted('F1', agg_f1_neg, agg_f1_pos, agg_sup_neg, agg_sup_pos, agg_total, agg_f1_w)

    print("\n▶ ACCURACY (dari CM gabungan)")
    print(" Rumus : (TP + TN) / (TP + TN + FP + FN)")
    print(f" = ({agg_tp} + {agg_tn}) / ({agg_tp}+{agg_tn}+{agg_fp}+{agg_fn})")
    print(f" = {agg_tp+agg_tn} / {agg_total}")
    print(f" = {agg_acc*100:.2f}%")

    print(f"\n{'─'*70}")
    print(f" TABEL HASIL EVALUASI (CM Gabungan {N_FOLDS} Fold)")
    print(f"{'─'*70}")
    _print_class_table(
        (agg_prec_neg, agg_rec_neg, agg_f1_neg, agg_sup_neg),
        (agg_prec_pos, agg_rec_pos, agg_f1_pos, agg_sup_pos),
        (agg_prec_w, agg_rec_w, agg_f1_w),
        agg_total,
    )
    print(f"\n Accuracy (CM Gabungan): {agg_acc*100:.2f}%")
    print(f"{'='*70}")


def _print_mean_derivation(title, symbol, values, mean):
    """Print the step-by-step mean of one metric over the folds."""
    print(f"\n▶ RATA-RATA {title}")
    print(f" Rumus : ({symbol}_F1 + {symbol}_F2 + ... + {symbol}_F{N_FOLDS}) / {N_FOLDS}")
    terms = ' + '.join(f'{v*100:.2f}%' for v in values)
    print(f" = ({terms}) / {N_FOLDS}")
    print(f" = {sum(values)*100:.2f}% / {N_FOLDS}")
    print(f" = {mean*100:.2f}%")


def _print_average_report(fold_results, averages):
    """Print the derivation and final table of the k-fold averaged metrics."""
    avg_acc = averages['accuracy']
    avg_prec = averages['precision']
    avg_rec = averages['recall']
    avg_f1 = averages['f1_score']
    std_acc = averages['std_accuracy']

    print(f"\n{'─'*70}")
    print(f" PERHITUNGAN RATA-RATA METRIK K-FOLD ({N_FOLDS} FOLD)")
    print(f"{'─'*70}")

    _print_mean_derivation('ACCURACY', 'Acc', [r['Accuracy'] for r in fold_results], avg_acc)
    print(f" Std Dev Accuracy = {std_acc*100:.2f}%")
    _print_mean_derivation('PRECISION (Weighted Avg)', 'Prec', [r['Precision'] for r in fold_results], avg_prec)
    _print_mean_derivation('RECALL (Weighted Avg)', 'Rec', [r['Recall'] for r in fold_results], avg_rec)
    _print_mean_derivation('F1-SCORE (Weighted Avg)', 'F1', [r['F1-Score'] for r in fold_results], avg_f1)

    print(f"\n{'─'*70}")
    print(f" TABEL RATA-RATA AKHIR ({N_FOLDS}-Fold Cross Validation)")
    print(f"{'─'*70}")
    print(f" {'Metrik':<20} | {'Nilai Rata-rata':>16} | {'Std Dev':>10}")
    print(f" {'-'*20}-+-{'-'*16}-+-{'-'*10}")
    print(f" {'Accuracy':<20} | {avg_acc*100:>15.2f}% | {std_acc*100:>9.2f}%")
    print(f" {'Precision (W.Avg)':<20} | {avg_prec*100:>15.2f}% | {'─':>10}")
    print(f" {'Recall (W.Avg)':<20} | {avg_rec*100:>15.2f}% | {'─':>10}")
    print(f" {'F1-Score (W.Avg)':<20} | {avg_f1*100:>15.2f}% | {'─':>10}")
    print(f"{'─'*70}")


def _alpha_statistics(model):
    """Print and return statistics of |dual_coef_| (the SVM's alpha values)."""
    dual_coefs = model.dual_coef_
    # dual_coef_ is a scipy sparse matrix when the SVC was fitted on sparse
    # input and a dense ndarray otherwise; handle both.
    if hasattr(dual_coefs, 'toarray'):
        dual_coefs = dual_coefs.toarray()
    alpha_values = np.abs(np.asarray(dual_coefs)).flatten()

    n_sv = model.support_vectors_.shape[0]
    avg_alpha = float(np.mean(alpha_values))
    min_alpha = float(np.min(alpha_values))
    max_alpha = float(np.max(alpha_values))

    print(f"\n{'─'*55}")
    print(" NILAI ALPHA (Lagrange Multiplier) MODEL SVM FINAL")
    print(f"{'─'*55}")
    print(" Keterangan : α_i diperoleh dari |dual_coef_| model")
    print(f" Jumlah Support Vector : {n_sv}")
    print(f" Rata-rata α : {avg_alpha:.6f}")
    print(f" α Minimum : {min_alpha:.6f}")
    print(f" α Maksimum : {max_alpha:.6f}")
    print(f"{'─'*55}")
    print(f" ★ Gunakan nilai Rata-rata α = {avg_alpha:.6f}")
    print(" sebagai given value untuk hitungan manual SVM di skripsi.")
    print(f"{'─'*55}")

    return {
        'average': avg_alpha,
        'minimum': min_alpha,
        'maximum': max_alpha,
        'n_support_vectors': int(n_sv),
    }


def main():
    """Run the full pipeline: preprocess, grid search, k-fold evaluation, final training, persistence."""
    print("\n" + "="*60)
    print(f"SENTIMENT ANALYSIS - {N_FOLDS}-FOLD CROSS VALIDATION")
    print("="*60)

    analyzer = SentimentAnalyzer()

    # 1. Load data
    df = analyzer.load_and_preprocess_data('data_mbg_labelled.csv')

    # Save processed data for dashboard
    print("\nSaving processed data to mbg_processed.csv...")
    df.to_csv('mbg_processed.csv', index=False, encoding='utf-8')
    print("✓ Processed data saved successfully!")

    # Grid search: find the best C & gamma once, before the k-fold evaluation
    best_C, best_gamma, gs_vectorizer = _run_grid_search(df['processed_text'], df['sentiment'])

    # Apply the best parameters to the analyzer (replaced by the final model below)
    analyzer.model = SVC(kernel='rbf', C=best_C, gamma=best_gamma, probability=True)
    analyzer.vectorizer = gs_vectorizer
    print(f"\n✓ Model SVM diperbarui dengan C={best_C}, gamma={best_gamma}")

    # Stratified k-fold cross validation
    print("\n\n" + "="*70)
    print(f" K-FOLD CROSS VALIDATION ({N_FOLDS} FOLD)")
    print("="*70)

    X = np.array(df['processed_text'])
    y = np.array(df['sentiment'])

    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
    fold_results = [
        _evaluate_fold(
            fold_idx,
            X[train_idx], X[test_idx],
            y[train_idx], y[test_idx],
            best_C, best_gamma,
        )
        for fold_idx, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1)
    ]

    # Reports over all folds
    averages = _summarize_folds(fold_results)
    _print_aggregated_report(fold_results)
    _print_average_report(fold_results, averages)

    # Best fold (highest accuracy; the first one wins on ties)
    best_fold = max(fold_results, key=lambda r: r['Accuracy'])
    print(f"\n Fold terbaik (accuracy tertinggi): Fold {best_fold['Fold']}")
    print(f" Accuracy : {best_fold['Accuracy']*100:.2f}%")
    print(f" Precision : {best_fold['Precision']*100:.2f}%")
    print(f" Recall : {best_fold['Recall']*100:.2f}%")
    print(f" F1-Score : {best_fold['F1-Score']*100:.2f}%")

    # Retrain on ALL data to produce the final model
    print(f"\n{'='*70}")
    print(" TRAINING MODEL FINAL DENGAN SELURUH DATA")
    print(f"{'='*70}")
    print(f" Parameter SVM: C={best_C}, gamma={best_gamma}")
    print(f" Jumlah total data : {len(X)}")

    final_vectorizer = _build_vectorizer()
    X_final_tfidf = final_vectorizer.fit_transform(X)

    # probability=True is required here: predict_sentiment() calls predict_proba.
    final_model = SVC(kernel='rbf', C=best_C, gamma=best_gamma, probability=True)
    final_model.fit(X_final_tfidf, y)
    print("✓ Training model final selesai!")

    # Replace the analyzer's model & vectorizer with the final ones
    analyzer.model = final_model
    analyzer.vectorizer = final_vectorizer

    # Alpha (Lagrange multiplier) values of the final SVM
    alpha_stats = _alpha_statistics(analyzer.model)

    # Save metrics to JSON
    metrics = {
        'k_fold': N_FOLDS,
        'best_C': best_C,
        'best_gamma': best_gamma,
        'kfold_results': [
            {
                'fold': r['Fold'],
                'accuracy': float(r['Accuracy']),
                'precision': float(r['Precision']),
                'recall': float(r['Recall']),
                'f1_score': float(r['F1-Score']),
                'TP': int(r['TP']),
                'TN': int(r['TN']),
                'FP': int(r['FP']),
                'FN': int(r['FN']),
            }
            for r in fold_results
        ],
        'average': {
            'accuracy': float(averages['accuracy']),
            'precision': float(averages['precision']),
            'recall': float(averages['recall']),
            'f1_score': float(averages['f1_score']),
            'std_accuracy': float(averages['std_accuracy']),
        },
        'best_fold': int(best_fold['Fold']),
        'alpha': alpha_stats,
    }

    with open('model_metrics.json', 'w', encoding='utf-8') as f:
        json.dump(metrics, f, indent=4)
    print("✓ Model metrics saved to model_metrics.json")

    # Save the final model
    analyzer.save_model('sentiment_model.pkl')
    print("✓ Model final berhasil disimpan ke sentiment_model.pkl!")


if __name__ == "__main__":
    main()