# Source: analisis-sentimen-program-m.../sentiment_model.py (793 lines, 35 KiB, Python)
#
# NOTE: the repository viewer flagged this file as containing ambiguous Unicode
# characters, i.e. characters that might be confused with other characters.
# If that is intentional the warning can be ignored; otherwise reveal them in
# the viewer (Escape button) and check them.

import pandas as pd
import numpy as np
import re
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import nltk
import warnings
import os
# Silence every warning category for the whole process.
# NOTE(review): this also hides pandas/scikit-learn warnings that may point at
# real problems — confirm that a blanket filter is intended.
warnings.filterwarnings('ignore')
# Let NLTK additionally search ./nltk_data, resolved against the current
# working directory at import time.
nltk.data.path.append(os.path.join(os.getcwd(), "nltk_data"))
# Download NLTK requirements: fetch the "punkt" tokenizer data only when the
# local lookup fails.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
class SentimentAnalyzer:
    """Binary sentiment classifier for Indonesian text (TF-IDF + RBF-kernel SVM).

    The instance owns the whole pipeline: text preprocessing (cleaning, case
    folding, tokenizing, slang normalization, stopword removal, stemming),
    the TF-IDF vectorizer and the SVM model. Labels are expected to be
    ``0`` (negative) and ``1`` (positive).
    """

    def __init__(self):
        """Create the stemmer, stopword set, vectorizer, model and slang dictionary."""
        # Indonesian stemmer
        factory = StemmerFactory()
        self.stemmer = factory.create_stemmer()
        # Indonesian stopwords
        stop_factory = StopWordRemoverFactory()
        self.stop_words = set(stop_factory.get_stop_words())
        # Extra, informal stopwords
        additional_stopwords = {
            'yg', 'dgn', 'nya', 'kalo', 'kalau', 'udah', 'udh', 'dah',
            'lg', 'lagi', 'banget', 'bgt', 'emang', 'memang', 'sih',
            'aja', 'doang', 'nih', 'nah', 'lah', 'deh', 'dong', 'kok',
            'ya', 'yah', 'wkwk', 'haha', 'hihi', 'huhu', 'hehe'
        }
        self.stop_words.update(additional_stopwords)
        # Take negation words out of the stopword set so they are kept
        # (needed to preserve the meaning of phrases such as "tidak bagus",
        # "bukan salah", etc.)
        negation_words = {
            'tidak', 'bukan', 'belum', 'jangan', 'tak', 'tanpa',
            'kurang', 'jarang', 'hampir', 'nyaris'
        }
        self.stop_words -= negation_words
        # TF-IDF vectorizer
        self.vectorizer = TfidfVectorizer(
            max_features=5000,
            ngram_range=(1, 2),
            lowercase=True,
            strip_accents='unicode'
        )
        # SVM model (RBF kernel); probability=True enables predict_proba
        self.model = SVC(kernel='rbf', C=1.0, gamma=1, probability=True)
        # Indonesian slang / abbreviation normalization dictionary.
        # Lookups are single-pass: a value that is itself a key is not
        # normalized again.
        self.normalization_dict = {
            'yg': 'yang', 'dgn': 'dengan', 'krn': 'karena', 'krna': 'karena',
            'tp': 'tapi', 'tpi': 'tapi', 'gk': 'tidak', 'ga': 'tidak',
            'gak': 'tidak', 'ngga': 'tidak', 'nggak': 'tidak', 'g': 'tidak',
            'tdk': 'tidak', 'gitu': 'begitu', 'gt': 'begitu', 'gmn': 'bagaimana',
            'gimana': 'bagaimana', 'dmn': 'dimana',
            'kmn': 'kemana',
            'knp': 'kenapa', 'knapa': 'kenapa', 'org': 'orang', 'orng': 'orang',
            'tmn': 'teman', 'temen': 'teman', 'bgmn': 'bagaimana', 'bgt': 'banget',
            'banget': 'sangat', 'bener': 'benar', 'bnr': 'benar', 'bnyk': 'banyak',
            'bnyak': 'banyak', 'udh': 'sudah', 'udah': 'sudah', 'dah': 'sudah',
            'telah': 'sudah', 'blm': 'belum', 'blom': 'belum', 'msh': 'masih',
            'msih': 'masih', 'lg': 'lagi', 'lgi': 'lagi', 'skrg': 'sekarang',
            'skrang': 'sekarang', 'skg': 'sekarang', 'nanti': 'nanti',
            'ntar': 'nanti', 'tar': 'nanti', 'bsk': 'besok', 'besok': 'besok',
            'kmrn': 'kemarin', 'kmarin': 'kemarin', 'hrs': 'harus',
            'kudu': 'harus', 'mesti': 'harus', 'bs': 'bisa', 'bsa': 'bisa',
            'isa': 'bisa', 'biar': 'agar', 'spy': 'agar', 'supaya': 'agar',
            'kalo': 'kalau', 'klo': 'kalau', 'jd': 'jadi', 'jadi': 'menjadi',
            'jdnya': 'jadinya', 'jadinya': 'akhirnya', 'jg': 'juga', 'jga': 'juga',
            'jgn': 'jangan', 'jngn': 'jangan', 'jgn2': 'jangan-jangan',
            'aj': 'saja', 'aja': 'saja', 'doang': 'saja', 'aje': 'saja',
            'cm': 'cuma', 'cuma': 'hanya', 'cman': 'hanya', 'ckp': 'cukup',
            'cukup': 'cukup', 'krg': 'kurang', 'kurang': 'kurang', 'emg': 'memang',
            'emang': 'memang', 'mmg': 'memang', 'sbnrnya': 'sebenarnya',
            'sbenernya': 'sebenarnya', 'pdhl': 'padahal', 'pdahal': 'padahal',
            'wlpn': 'walaupun', 'walaupun': 'walaupun', 'meskipun': 'walaupun',
            'walau': 'walaupun', 'aplg': 'apalagi', 'apalagi': 'apalagi',
            'mgkn': 'mungkin', 'mungkin': 'mungkin', 'mgkin': 'mungkin',
            'kyknya': 'kayaknya', 'kyaknya': 'kayaknya', 'kayaknya': 'sepertinya',
            'kyk': 'seperti', 'kayak': 'seperti', 'ky': 'seperti', 'sprt': 'seperti',
            'kaya': 'seperti', 'sy': 'saya', 'gw': 'saya', 'gue': 'saya',
            'gua': 'saya', 'w': 'saya', 'aku': 'saya', 'ak': 'saya', 'km': 'kamu',
            'kmu': 'kamu', 'lu': 'kamu', 'lo': 'kamu', 'elu': 'kamu', 'elo': 'kamu',
            'u': 'kamu', 'dy': 'dia', 'dia': 'dia', 'mrk': 'mereka',
            'mreka': 'mereka', 'tololl': 'bodoh', 'tolol': 'bodoh',
            'qt': 'kita', 'qta': 'kita', 'seneng': 'senang', 'suka': 'suka',
            'sk': 'suka', 'kesel': 'kesal', 'binun': 'bingung', 'males': 'malas',
            'capek': 'capek', 'cape': 'capek', 'lelah': 'lelah', 'tired': 'lelah',
            'stress': 'stres', 'mantul': 'mantap', 'keren': 'keren', 'gokil': 'keren',
            'ajib': 'keren', 'top': 'bagus', 'the best': 'terbaik',
            'terbaik': 'terbaik', 'terburuk': 'terburuk', 'worst': 'terburuk',
            'best': 'terbaik', 'good': 'bagus', 'bad': 'buruk', 'nice': 'bagus',
            'awesome': 'keren', 'amazing': 'menakjubkan', 'terrible': 'buruk',
            'horrible': 'mengerikan', 'excellent': 'sangat bagus', 'perfect': 'sempurna',
            'ok': 'baik', 'oke': 'baik', 'okay': 'baik', 'fine': 'baik', 'standard': 'standar',
            'ajg':'anjing', 'anjg':'anjing', 'tw':'tau', 'kek':'seperti'
        }

    def text_cleaning(self, text):
        """Strip URLs, e-mail addresses, mentions, hashtags, numbers and punctuation.

        Args:
            text: Raw input; anything ``pd.isna`` treats as missing yields "".

        Returns:
            The cleaned string with whitespace collapsed and trimmed.
        """
        if pd.isna(text):
            return ""
        text = str(text)
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        # E-mail addresses must be removed BEFORE mentions: the mention
        # pattern would otherwise eat the "@domain" part and leave
        # "user.com" behind. The pattern requires a dotted domain so that
        # "word@mention" is still handled as a mention.
        text = re.sub(r'\b[\w.+-]+@[\w-]+(?:\.[\w-]+)+\b', '', text)
        text = re.sub(r'@\w+|#\w+', '', text)
        text = re.sub(r'\b\d+\b', '', text)
        text = re.sub(r'[^\w\s]', ' ', text)
        # Collapse runs of 3+ identical characters to two ("bagusss" -> "baguss")
        text = re.sub(r'(.)\1{2,}', r'\1\1', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def case_folding(self, text):
        """Return ``text`` lower-cased."""
        return text.lower()

    def tokenizing(self, text):
        """Split on whitespace and keep purely alphabetic tokens.

        One-letter tokens are discarded unless they are keys of the
        normalization dictionary (e.g. 'g' -> 'tidak'); dropping those here
        would make the corresponding dictionary entries unreachable and
        silently lose negations.
        """
        return [
            token for token in text.split()
            if token.isalpha()
            and (len(token) > 1 or token in self.normalization_dict)
        ]

    def remove_stopwords(self, tokens):
        """Drop stopwords and tokens of two characters or fewer."""
        return [word for word in tokens if word not in self.stop_words and len(word) > 2]

    def normalization(self, tokens):
        """Replace slang/abbreviated tokens using the normalization dictionary."""
        return [self.normalization_dict.get(token, token) for token in tokens]

    def stemming(self, tokens):
        """Stem the tokens with the configured stemmer and return a token list."""
        stemmed_text = self.stemmer.stem(' '.join(tokens))
        return stemmed_text.split()

    def preprocess_text(self, text, show_steps=False):
        """Run the full preprocessing pipeline on one text.

        Order of the steps:
        1. Cleaning
        2. Case Folding
        3. Tokenizing
        4. Normalization
        5. Stopwords Removal
        6. Stemming

        Args:
            text: Raw input text.
            show_steps: When true, also return the intermediate results.

        Returns:
            The final space-joined string, or ``(final_text, steps)`` when
            ``show_steps`` is true, where ``steps`` maps step names to the
            intermediate values.
        """
        steps = {}
        # Step 1: Cleaning
        cleaned = self.text_cleaning(text)
        if show_steps:
            steps['cleaned'] = cleaned
        # Step 2: Case Folding
        casefolded = self.case_folding(cleaned)
        if show_steps:
            steps['casefolded'] = casefolded
        # Step 3: Tokenizing
        tokens = self.tokenizing(casefolded)
        if show_steps:
            steps['tokenized'] = tokens
        # Step 4: Normalization
        normalized = self.normalization(tokens)
        if show_steps:
            steps['normalized'] = normalized
        # Step 5: Remove Stopwords
        no_stopwords = self.remove_stopwords(normalized)
        if show_steps:
            steps['no_stopwords'] = no_stopwords
        # Step 6: Stemming
        stemmed = self.stemming(no_stopwords)
        if show_steps:
            steps['stemmed'] = stemmed
        final_text = ' '.join(stemmed)
        if show_steps:
            steps['original'] = text
            steps['final'] = final_text
            return final_text, steps
        return final_text

    def load_and_preprocess_data(self, filepath):
        """Read a CSV with ``text`` and ``sentiment`` columns and preprocess it.

        The file is read as UTF-8 first and as latin-1 if decoding fails.
        Rows whose processed text is empty are dropped.

        Returns:
            A DataFrame with an added ``processed_text`` column and
            ``sentiment`` cast to int.
        """
        print(f"Loading dataset from {filepath}...")
        try:
            df = pd.read_csv(filepath, encoding='utf-8')
        except UnicodeDecodeError:
            print("⚠ UTF-8 gagal, mencoba encoding latin-1...")
            df = pd.read_csv(filepath, encoding='latin-1')
        print("Preprocessing texts...")
        df['processed_text'] = df['text'].apply(self.preprocess_text)
        # .copy() so the assignment below writes to an independent frame
        # instead of a slice of the original (chained assignment).
        df = df[df['processed_text'].str.len() > 0].copy()
        df['sentiment'] = df['sentiment'].astype(int)
        print("Preprocessing complete.")
        return df

    def print_confusion_matrix(self, y_test, y_pred, title="Confusion Matrix"):
        """Print the binary confusion matrix (labels 0 and 1) in a readable layout."""
        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()
        print(f"\n{title}")
        print("="*60)
        print(f"\nDetail Metrik dari Confusion Matrix:")
        print(f" * True Positive (TP) : {tp:<5} (Prediksi: Positif, Aktual: Positif)")
        print(f" * True Negative (TN) : {tn:<5} (Prediksi: Negatif, Aktual: Negatif)")
        print(f" * False Positive (FP) : {fp:<5} (Prediksi: Positif, Aktual: Negatif) -> Error Tipe I")
        print(f" * False Negative (FN) : {fn:<5} (Prediksi: Negatif, Aktual: Positif) -> Error Tipe II")
        print("\nMatriks Konfusi (Visual):")
        print(" Prediksi Negatif | Prediksi Positif")
        print("---------------------------------------------------------")
        print(f"Aktual Negatif (0) | {tn:<10} | {fp:<10}")
        print(f"Aktual Positif (1) | {fn:<10} | {tp:<10}")
        print("---------------------------------------------------------")

    def train_and_evaluate_model(self, df):
        """Fit the vectorizer and model on an 80/20 stratified split and report accuracy.

        Args:
            df: DataFrame with ``processed_text`` and ``sentiment`` columns.

        Returns:
            Accuracy on the held-out 20 %.
        """
        print("\n" + "="*60)
        print("TRAINING MODEL DENGAN DATA ORIGINAL")
        print("="*60)
        X = df['processed_text']
        y = df['sentiment']
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        # TF-IDF vectorization (fitted on the training part only)
        X_train_tfidf = self.vectorizer.fit_transform(X_train)
        X_test_tfidf = self.vectorizer.transform(X_test)
        # Train SVM
        print("Training SVM model...")
        self.model.fit(X_train_tfidf, y_train)
        print("✓ Training selesai!")
        # Evaluate
        y_pred = self.model.predict(X_test_tfidf)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"\nModel Accuracy: {accuracy:.4f}")
        self.print_confusion_matrix(y_test, y_pred, "Confusion Matrix")
        return accuracy

    def predict_sentiment(self, text):
        """Classify one text.

        Returns:
            A dict with ``sentiment`` ("Positif"/"Negatif", or
            "Tidak dapat menentukan" when nothing is left after
            preprocessing), ``confidence`` (largest class probability),
            ``probability_negative`` and ``probability_positive``.
        """
        processed_text = self.preprocess_text(text)
        if not processed_text.strip():
            return {
                'sentiment': 'Tidak dapat menentukan',
                'confidence': 0.0,
                'probability_negative': 0.5,
                'probability_positive': 0.5
            }
        text_tfidf = self.vectorizer.transform([processed_text])
        prediction = self.model.predict(text_tfidf)[0]
        probability = self.model.predict_proba(text_tfidf)[0]
        # Look the columns up through classes_ instead of assuming that
        # column 0 is the negative class and column 1 the positive one.
        proba_by_class = dict(zip(self.model.classes_, probability))
        sentiment_label = "Positif" if prediction == 1 else "Negatif"
        # NOTE(review): scikit-learn documents that SVC.predict_proba may be
        # inconsistent with SVC.predict, so the label and the largest
        # probability can occasionally disagree.
        return {
            'sentiment': sentiment_label,
            # Plain floats keep the result JSON-serializable.
            'confidence': float(max(probability)),
            'probability_negative': float(proba_by_class.get(0, 0.0)),
            'probability_positive': float(proba_by_class.get(1, 0.0))
        }

    def save_model(self, filepath='sentiment_model.pkl'):
        """Pickle the model, vectorizer, stemmer, stopwords and slang dictionary."""
        model_data = {
            'model': self.model,
            'vectorizer': self.vectorizer,
            'stemmer': self.stemmer,
            'stop_words': self.stop_words,
            'normalization_dict': self.normalization_dict
        }
        with open(filepath, 'wb') as f:
            pickle.dump(model_data, f)
        print(f"\n✓ Model saved to {filepath}")

    def load_model(self, filepath='sentiment_model.pkl'):
        """Restore the state written by ``save_model``.

        SECURITY: ``pickle.load`` can execute arbitrary code; only load
        files that come from a trusted source.
        """
        with open(filepath, 'rb') as f:
            model_data = pickle.load(f)
        self.model = model_data['model']
        self.vectorizer = model_data['vectorizer']
        self.stemmer = model_data['stemmer']
        self.stop_words = model_data['stop_words']
        self.normalization_dict = model_data['normalization_dict']
        print(f"✓ Model loaded from {filepath}")
def main():
    """Tune, cross-validate, retrain and persist the sentiment model.

    Steps:
      1. Load and preprocess ``data_mbg_labelled.csv``; write the result to
         ``mbg_processed.csv``.
      2. Grid-search C and gamma for the RBF SVM (3-fold CV on an 80 % split).
      3. Evaluate the chosen parameters with stratified k-fold CV and print a
         step-by-step derivation of every metric.
      4. Retrain on all data, report the SVM alpha statistics, write
         ``model_metrics.json`` and save ``sentiment_model.pkl``.

    NOTE(review): the hyper-parameters are tuned on data that is later reused
    by the k-fold evaluation, so the reported scores are somewhat optimistic;
    nested cross-validation would remove that bias.
    """
    import json
    from sklearn.pipeline import Pipeline

    N_FOLDS = 5
    # Single source of truth for the TF-IDF configuration used everywhere below.
    TFIDF_PARAMS = dict(
        max_features=5000, ngram_range=(1, 2),
        lowercase=True, strip_accents='unicode'
    )

    print("\n" + "="*60)
    print(f"SENTIMENT ANALYSIS - {N_FOLDS}-FOLD CROSS VALIDATION")
    print("="*60)
    analyzer = SentimentAnalyzer()
    # 1. Load data
    df = analyzer.load_and_preprocess_data('data_mbg_labelled.csv')
    # Save processed data for the dashboard
    print("\nSaving processed data to mbg_processed.csv...")
    df.to_csv('mbg_processed.csv', index=False, encoding='utf-8')
    print("✓ Processed data saved successfully!")

    X = np.array(df['processed_text'])
    y = np.array(df['sentiment'])

    # -- GRID SEARCH: find the best C & gamma (once, before k-fold) --
    print("\n" + "="*60)
    print(" GRID SEARCH - PENCARIAN C & GAMMA TERBAIK")
    print("="*60)
    # Temporary 80:20 split; only the 80 % part is used for the search.
    X_gs_train, _, y_gs_train, _ = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'gamma': [0.001, 0.01, 0.1, 1]
    }
    print("Parameter yang diuji:")
    print(f" C : {param_grid['C']}")
    print(f" gamma : {param_grid['gamma']}")
    print(" CV : 3-fold (untuk efisiensi)")
    print(f" Total kombinasi: {len(param_grid['C']) * len(param_grid['gamma'])} kombinasi × 3 fold")
    print("\nProses grid search sedang berjalan, harap tunggu...")
    # TF-IDF lives inside the pipeline so it is re-fitted on the training part
    # of every inner CV split; fitting it once up front would leak vocabulary
    # and IDF statistics from the validation part into training.
    # probability=True is not needed here (only predict() is scored) and would
    # add an internal 5-fold calibration to every fit.
    grid_search = GridSearchCV(
        Pipeline([
            ('tfidf', TfidfVectorizer(**TFIDF_PARAMS)),
            ('svc', SVC(kernel='rbf')),
        ]),
        {f'svc__{name}': values for name, values in param_grid.items()},
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )
    grid_search.fit(X_gs_train, y_gs_train)
    best_C = grid_search.best_params_['svc__C']
    best_gamma = grid_search.best_params_['svc__gamma']
    best_cv_score = grid_search.best_score_
    print("\n✓ Grid Search selesai!")
    print("-" * 45)
    print(" Hasil Grid Search:")
    print(f" Best C : {best_C}")
    print(f" Best gamma : {best_gamma}")
    print(f" Best CV Score : {best_cv_score*100:.2f}% (rata-rata 3-fold)")
    print("-" * 45)
    # Table of every combination
    print("\n Ringkasan seluruh kombinasi:")
    print(f" {'C':<8} | {'gamma':<8} | {'CV Accuracy':>12}")
    print(f" {'-'*8}-+-{'-'*8}-+-{'-'*12}")
    gs_results = grid_search.cv_results_
    for c_val, g_val, score in zip(
            gs_results['param_svc__C'], gs_results['param_svc__gamma'],
            gs_results['mean_test_score']):
        marker = " ◄ TERBAIK" if (c_val == best_C and g_val == best_gamma) else ""
        print(f" {str(c_val):<8} | {str(g_val):<8} | {score*100:>11.2f}%{marker}")
    print("-" * 45)
    # Apply the best parameters to the analyzer (both attributes are replaced
    # again by the final model further down).
    analyzer.model = SVC(kernel='rbf', C=best_C, gamma=best_gamma, probability=True)
    analyzer.vectorizer = grid_search.best_estimator_.named_steps['tfidf']
    print(f"\n✓ Model SVM diperbarui dengan C={best_C}, gamma={best_gamma}")

    # -- K-FOLD CROSS VALIDATION --
    print("\n\n" + "="*70)
    print(f" K-FOLD CROSS VALIDATION ({N_FOLDS} FOLD)")
    print("="*70)
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
    fold_results = []
    for fold_idx, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
        print("\n\n" + "#"*70)
        print(f" FOLD {fold_idx} / {N_FOLDS}")
        print("#"*70)
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        print(f"Jumlah Data Training : {len(X_train)}")
        print(f"Jumlah Data Testing : {len(X_test)}")
        # TF-IDF is fitted ONLY on this fold's training data
        fold_vectorizer = TfidfVectorizer(**TFIDF_PARAMS)
        X_train_tfidf = fold_vectorizer.fit_transform(X_train)
        X_test_tfidf = fold_vectorizer.transform(X_test)
        # Train the SVM (no probability calibration: only predict() is used)
        fold_model = SVC(kernel='rbf', C=best_C, gamma=best_gamma)
        fold_model.fit(X_train_tfidf, y_train)
        # Predict
        y_pred = fold_model.predict(X_test_tfidf)
        # Metrics; labels are pinned so the matrix is always 2x2 in 0/1 order
        acc = accuracy_score(y_test, y_pred)
        report = classification_report(
            y_test, y_pred,
            labels=[0, 1],
            target_names=['Negatif', 'Positif'],
            output_dict=True
        )
        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
        tn, fp, fn, tp = (int(v) for v in cm.ravel())
        total = tp + tn + fp + fn
        # Show the confusion matrix
        print(f"\nConfusion Matrix (Fold {fold_idx}):")
        print(f" {'':<22} | {'Pred Negatif':<14} | {'Pred Positif':<14}")
        print(" " + "-" * 57)
        print(f" {'Aktual Negatif (0)':<22} | {tn:<14} | {fp:<14}")
        print(f" {'Aktual Positif (1)':<22} | {fn:<14} | {tp:<14}")
        print(" " + "-" * 57)
        print(f" TP={tp}, TN={tn}, FP={fp}, FN={fn}")
        print(f" Total data uji = TP+TN+FP+FN = {tp}+{tn}+{fp}+{fn} = {total}")
        # Per-class values
        prec_neg = report['Negatif']['precision']
        rec_neg = report['Negatif']['recall']
        f1_neg = report['Negatif']['f1-score']
        sup_neg = int(report['Negatif']['support'])
        prec_pos = report['Positif']['precision']
        rec_pos = report['Positif']['recall']
        f1_pos = report['Positif']['f1-score']
        sup_pos = int(report['Positif']['support'])
        prec_w = report['weighted avg']['precision']
        rec_w = report['weighted avg']['recall']
        f1_w = report['weighted avg']['f1-score']
        # -- Step-by-step derivation of every metric --
        print("\n" + "-" * 60)
        print(f" PERHITUNGAN METRIK EVALUASI (Fold {fold_idx})")
        print("-" * 60)
        print("\n▶ PRECISION")
        print(" Rumus : TP / (TP + FP) [per kelas]")
        print(f" Negatif: TN / (TN + FN) = {tn} / ({tn}+{fn}) = {tn}/{tn+fn} = {prec_neg*100:.2f}%")
        print(f" Positif: TP / (TP + FP) = {tp} / ({tp}+{fp}) = {tp}/{tp+fp} = {prec_pos*100:.2f}%")
        print(f" Weighted Avg = ({prec_neg:.6f}×{sup_neg} + {prec_pos:.6f}×{sup_pos}) / {total}")
        print(f" = ({prec_neg*sup_neg:.4f} + {prec_pos*sup_pos:.4f}) / {total}")
        print(f" = {prec_w*100:.2f}%")
        print("\n▶ RECALL")
        print(" Rumus : TP / (TP + FN) [per kelas]")
        print(f" Negatif: TN / (TN + FP) = {tn} / ({tn}+{fp}) = {tn}/{tn+fp} = {rec_neg*100:.2f}%")
        print(f" Positif: TP / (TP + FN) = {tp} / ({tp}+{fn}) = {tp}/{tp+fn} = {rec_pos*100:.2f}%")
        print(f" Weighted Avg = ({rec_neg:.6f}×{sup_neg} + {rec_pos:.6f}×{sup_pos}) / {total}")
        print(f" = ({rec_neg*sup_neg:.4f} + {rec_pos*sup_pos:.4f}) / {total}")
        print(f" = {rec_w*100:.2f}%")
        print("\n▶ F1-SCORE")
        print(" Rumus : 2 × (Precision × Recall) / (Precision + Recall) [per kelas]")
        print(f" Negatif: 2×({prec_neg:.6f}×{rec_neg:.6f}) / ({prec_neg:.6f}+{rec_neg:.6f})")
        print(f" = 2×{prec_neg*rec_neg:.6f} / {prec_neg+rec_neg:.6f} = {f1_neg*100:.2f}%")
        print(f" Positif: 2×({prec_pos:.6f}×{rec_pos:.6f}) / ({prec_pos:.6f}+{rec_pos:.6f})")
        print(f" = 2×{prec_pos*rec_pos:.6f} / {prec_pos+rec_pos:.6f} = {f1_pos*100:.2f}%")
        print(f" Weighted Avg = ({f1_neg:.6f}×{sup_neg} + {f1_pos:.6f}×{sup_pos}) / {total}")
        print(f" = ({f1_neg*sup_neg:.4f} + {f1_pos*sup_pos:.4f}) / {total}")
        print(f" = {f1_w*100:.2f}%")
        print("\n▶ ACCURACY")
        print(" Rumus : (TP + TN) / (TP + TN + FP + FN)")
        print(f" = ({tp} + {tn}) / ({tp}+{tn}+{fp}+{fn})")
        print(f" = {tp+tn} / {total}")
        print(f" = {acc*100:.2f}%")
        print("\n" + "-" * 60)
        print(f" RINGKASAN HASIL AKHIR (Fold {fold_idx})")
        print("-" * 60)
        print(f" {'Kelas':<12} | {'Precision':>12} | {'Recall':>12} | {'F1-Score':>12} | {'Support':>8}")
        print(f" {'-'*12}-+-{'-'*12}-+-{'-'*12}-+-{'-'*12}-+-{'-'*8}")
        print(f" {'Negatif':<12} | {prec_neg*100:>11.2f}% | {rec_neg*100:>11.2f}% | {f1_neg*100:>11.2f}% | {sup_neg:>8}")
        print(f" {'Positif':<12} | {prec_pos*100:>11.2f}% | {rec_pos*100:>11.2f}% | {f1_pos*100:>11.2f}% | {sup_pos:>8}")
        print(f" {'-'*12}-+-{'-'*12}-+-{'-'*12}-+-{'-'*12}-+-{'-'*8}")
        print(f" {'Weighted Avg':<12} | {prec_w*100:>11.2f}% | {rec_w*100:>11.2f}% | {f1_w*100:>11.2f}% | {total:>8}")
        print(f"\n Accuracy: {acc*100:.2f}%")
        print("-" * 60)
        fold_results.append({
            'Fold': fold_idx,
            'Accuracy': acc,
            'Precision': prec_w,
            'Recall': rec_w,
            'F1-Score': f1_w,
            'TP': tp, 'TN': tn, 'FP': fp, 'FN': fn,
        })

    # -- SUMMARY OF ALL FOLDS --
    print("\n\n" + "="*70)
    print(" RINGKASAN PERFORMA SEMUA FOLD K-FOLD CROSS VALIDATION")
    print("="*70)
    print(f" {'Fold':<6} | {'Accuracy':>10} | {'Precision':>10} | {'Recall':>10} | {'F1-Score':>10}")
    print(f" {'-'*6}-+-{'-'*10}-+-{'-'*10}-+-{'-'*10}-+-{'-'*10}")
    for r in fold_results:
        print(f" {r['Fold']:<6} | {r['Accuracy']*100:>9.2f}% | {r['Precision']*100:>9.2f}% | {r['Recall']*100:>9.2f}% | {r['F1-Score']*100:>9.2f}%")
    print(f" {'-'*6}-+-{'-'*10}-+-{'-'*10}-+-{'-'*10}-+-{'-'*10}")
    acc_vals = [r['Accuracy'] for r in fold_results]
    prec_vals = [r['Precision'] for r in fold_results]
    rec_vals = [r['Recall'] for r in fold_results]
    f1_vals = [r['F1-Score'] for r in fold_results]
    avg_acc = np.mean(acc_vals)
    avg_prec = np.mean(prec_vals)
    avg_rec = np.mean(rec_vals)
    avg_f1 = np.mean(f1_vals)
    std_acc = np.std(acc_vals)
    print(f" {'Rata-rata':<6} | {avg_acc*100:>9.2f}% | {avg_prec*100:>9.2f}% | {avg_rec*100:>9.2f}% | {avg_f1*100:>9.2f}%")
    print(f" {'Std Dev':<6} | {std_acc*100:>9.2f}% | {'':>10} | {'':>10} | {'':>10}")
    print("="*70)

    # -- AGGREGATED CONFUSION MATRIX OVER ALL FOLDS --
    agg_tp = sum(r['TP'] for r in fold_results)
    agg_tn = sum(r['TN'] for r in fold_results)
    agg_fp = sum(r['FP'] for r in fold_results)
    agg_fn = sum(r['FN'] for r in fold_results)
    agg_total = agg_tp + agg_tn + agg_fp + agg_fn
    # Support per class (all actual samples of each class over every fold)
    agg_sup_neg = agg_tn + agg_fp  # all actual negatives
    agg_sup_pos = agg_tp + agg_fn  # all actual positives
    # Per-class metrics from the aggregated matrix
    agg_prec_neg = agg_tn / (agg_tn + agg_fn) if (agg_tn + agg_fn) > 0 else 0
    agg_prec_pos = agg_tp / (agg_tp + agg_fp) if (agg_tp + agg_fp) > 0 else 0
    agg_rec_neg = agg_tn / (agg_tn + agg_fp) if (agg_tn + agg_fp) > 0 else 0
    agg_rec_pos = agg_tp / (agg_tp + agg_fn) if (agg_tp + agg_fn) > 0 else 0
    agg_f1_neg = (2 * agg_prec_neg * agg_rec_neg / (agg_prec_neg + agg_rec_neg)
                  if (agg_prec_neg + agg_rec_neg) > 0 else 0)
    agg_f1_pos = (2 * agg_prec_pos * agg_rec_pos / (agg_prec_pos + agg_rec_pos)
                  if (agg_prec_pos + agg_rec_pos) > 0 else 0)
    # Weighted averages
    agg_prec_w = (agg_prec_neg * agg_sup_neg + agg_prec_pos * agg_sup_pos) / agg_total
    agg_rec_w = (agg_rec_neg * agg_sup_neg + agg_rec_pos * agg_sup_pos) / agg_total
    agg_f1_w = (agg_f1_neg * agg_sup_neg + agg_f1_pos * agg_sup_pos) / agg_total
    agg_acc = (agg_tp + agg_tn) / agg_total
    print("\n\n" + "="*70)
    print(" CONFUSION MATRIX GABUNGAN SEMUA FOLD (AGGREGATED)")
    print("="*70)
    print(f" (Merupakan jumlah TP, TN, FP, FN dari seluruh {N_FOLDS} fold)")
    print()
    print(f" {'':<22} | {'Pred Negatif':<14} | {'Pred Positif':<14}")
    print(" " + "-" * 57)
    print(f" {'Aktual Negatif (0)':<22} | {agg_tn:<14} | {agg_fp:<14}")
    print(f" {'Aktual Positif (1)':<22} | {agg_fn:<14} | {agg_tp:<14}")
    print(" " + "-" * 57)
    print(f" TP = {agg_tp}, TN = {agg_tn}, FP = {agg_fp}, FN = {agg_fn}")
    print(f" Total keseluruhan = TP+TN+FP+FN = {agg_tp}+{agg_tn}+{agg_fp}+{agg_fn} = {agg_total}")
    print()
    print(" Asal nilai:")
    for r in fold_results:
        print(f" Fold {r['Fold']}: TP={r['TP']}, TN={r['TN']}, FP={r['FP']}, FN={r['FN']}")
    print(" " + "-" * 50)
    print(f" Total : TP={agg_tp}, TN={agg_tn}, FP={agg_fp}, FN={agg_fn}")
    print("\n" + "-" * 70)
    print(" PERHITUNGAN METRIK DARI CONFUSION MATRIX GABUNGAN")
    print("-" * 70)
    print("\n▶ PRECISION (per kelas dari CM gabungan)")
    print(" Rumus : Prediksi benar kelas X / semua prediksi kelas X")
    print(f" Negatif: TN / (TN + FN) = {agg_tn} / ({agg_tn}+{agg_fn}) = {agg_tn}/{agg_tn+agg_fn} = {agg_prec_neg*100:.2f}%")
    print(f" Positif: TP / (TP + FP) = {agg_tp} / ({agg_tp}+{agg_fp}) = {agg_tp}/{agg_tp+agg_fp} = {agg_prec_pos*100:.2f}%")
    print(" Weighted Avg:")
    print(" = (Prec_Neg × Sup_Neg + Prec_Pos × Sup_Pos) / Total")
    print(f" = ({agg_prec_neg:.6f} × {agg_sup_neg} + {agg_prec_pos:.6f} × {agg_sup_pos}) / {agg_total}")
    print(f" = ({agg_prec_neg*agg_sup_neg:.4f} + {agg_prec_pos*agg_sup_pos:.4f}) / {agg_total}")
    print(f" = {agg_prec_neg*agg_sup_neg + agg_prec_pos*agg_sup_pos:.4f} / {agg_total}")
    print(f" = {agg_prec_w*100:.2f}%")
    print("\n▶ RECALL (per kelas dari CM gabungan)")
    print(" Rumus : Prediksi benar kelas X / semua data aktual kelas X")
    print(f" Negatif: TN / (TN + FP) = {agg_tn} / ({agg_tn}+{agg_fp}) = {agg_tn}/{agg_tn+agg_fp} = {agg_rec_neg*100:.2f}%")
    print(f" Positif: TP / (TP + FN) = {agg_tp} / ({agg_tp}+{agg_fn}) = {agg_tp}/{agg_tp+agg_fn} = {agg_rec_pos*100:.2f}%")
    print(" Weighted Avg:")
    print(" = (Rec_Neg × Sup_Neg + Rec_Pos × Sup_Pos) / Total")
    print(f" = ({agg_rec_neg:.6f} × {agg_sup_neg} + {agg_rec_pos:.6f} × {agg_sup_pos}) / {agg_total}")
    print(f" = ({agg_rec_neg*agg_sup_neg:.4f} + {agg_rec_pos*agg_sup_pos:.4f}) / {agg_total}")
    print(f" = {agg_rec_neg*agg_sup_neg + agg_rec_pos*agg_sup_pos:.4f} / {agg_total}")
    print(f" = {agg_rec_w*100:.2f}%")
    print("\n▶ F1-SCORE (per kelas dari CM gabungan)")
    print(" Rumus : 2 × (Precision × Recall) / (Precision + Recall) [per kelas]")
    print(" Negatif:")
    print(f" = 2 × ({agg_prec_neg:.6f} × {agg_rec_neg:.6f}) / ({agg_prec_neg:.6f} + {agg_rec_neg:.6f})")
    print(f" = 2 × {agg_prec_neg*agg_rec_neg:.6f} / {agg_prec_neg+agg_rec_neg:.6f}")
    print(f" = {2*agg_prec_neg*agg_rec_neg:.6f} / {agg_prec_neg+agg_rec_neg:.6f}")
    print(f" = {agg_f1_neg*100:.2f}%")
    print(" Positif:")
    print(f" = 2 × ({agg_prec_pos:.6f} × {agg_rec_pos:.6f}) / ({agg_prec_pos:.6f} + {agg_rec_pos:.6f})")
    print(f" = 2 × {agg_prec_pos*agg_rec_pos:.6f} / {agg_prec_pos+agg_rec_pos:.6f}")
    print(f" = {2*agg_prec_pos*agg_rec_pos:.6f} / {agg_prec_pos+agg_rec_pos:.6f}")
    print(f" = {agg_f1_pos*100:.2f}%")
    print(" Weighted Avg:")
    print(" = (F1_Neg × Sup_Neg + F1_Pos × Sup_Pos) / Total")
    print(f" = ({agg_f1_neg:.6f} × {agg_sup_neg} + {agg_f1_pos:.6f} × {agg_sup_pos}) / {agg_total}")
    print(f" = ({agg_f1_neg*agg_sup_neg:.4f} + {agg_f1_pos*agg_sup_pos:.4f}) / {agg_total}")
    print(f" = {agg_f1_neg*agg_sup_neg + agg_f1_pos*agg_sup_pos:.4f} / {agg_total}")
    print(f" = {agg_f1_w*100:.2f}%")
    print("\n▶ ACCURACY (dari CM gabungan)")
    print(" Rumus : (TP + TN) / (TP + TN + FP + FN)")
    print(f" = ({agg_tp} + {agg_tn}) / ({agg_tp}+{agg_tn}+{agg_fp}+{agg_fn})")
    print(f" = {agg_tp+agg_tn} / {agg_total}")
    print(f" = {agg_acc*100:.2f}%")
    print("\n" + "-" * 70)
    print(f" TABEL HASIL EVALUASI (CM Gabungan {N_FOLDS} Fold)")
    print("-" * 70)
    print(f" {'Kelas':<12} | {'Precision':>12} | {'Recall':>12} | {'F1-Score':>12} | {'Support':>8}")
    print(f" {'-'*12}-+-{'-'*12}-+-{'-'*12}-+-{'-'*12}-+-{'-'*8}")
    print(f" {'Negatif':<12} | {agg_prec_neg*100:>11.2f}% | {agg_rec_neg*100:>11.2f}% | {agg_f1_neg*100:>11.2f}% | {agg_sup_neg:>8}")
    print(f" {'Positif':<12} | {agg_prec_pos*100:>11.2f}% | {agg_rec_pos*100:>11.2f}% | {agg_f1_pos*100:>11.2f}% | {agg_sup_pos:>8}")
    print(f" {'-'*12}-+-{'-'*12}-+-{'-'*12}-+-{'-'*12}-+-{'-'*8}")
    print(f" {'Weighted Avg':<12} | {agg_prec_w*100:>11.2f}% | {agg_rec_w*100:>11.2f}% | {agg_f1_w*100:>11.2f}% | {agg_total:>8}")
    print(f"\n Accuracy (CM Gabungan): {agg_acc*100:.2f}%")
    print("=" * 70)

    # -- STEP-BY-STEP K-FOLD AVERAGES --
    print("\n" + "-" * 70)
    print(f" PERHITUNGAN RATA-RATA METRIK K-FOLD ({N_FOLDS} FOLD)")
    print("-" * 70)
    acc_terms = ' + '.join(f'{v*100:.2f}%' for v in acc_vals)
    prec_terms = ' + '.join(f'{v*100:.2f}%' for v in prec_vals)
    rec_terms = ' + '.join(f'{v*100:.2f}%' for v in rec_vals)
    f1_terms = ' + '.join(f'{v*100:.2f}%' for v in f1_vals)
    print("\n▶ RATA-RATA ACCURACY")
    print(f" Rumus : (Acc_F1 + Acc_F2 + ... + Acc_F{N_FOLDS}) / {N_FOLDS}")
    print(f" = ({acc_terms}) / {N_FOLDS}")
    print(f" = {sum(acc_vals)*100:.2f}% / {N_FOLDS}")
    print(f" = {avg_acc*100:.2f}%")
    print(f" Std Dev Accuracy = {std_acc*100:.2f}%")
    print("\n▶ RATA-RATA PRECISION (Weighted Avg)")
    print(f" Rumus : (Prec_F1 + Prec_F2 + ... + Prec_F{N_FOLDS}) / {N_FOLDS}")
    print(f" = ({prec_terms}) / {N_FOLDS}")
    print(f" = {sum(prec_vals)*100:.2f}% / {N_FOLDS}")
    print(f" = {avg_prec*100:.2f}%")
    print("\n▶ RATA-RATA RECALL (Weighted Avg)")
    print(f" Rumus : (Rec_F1 + Rec_F2 + ... + Rec_F{N_FOLDS}) / {N_FOLDS}")
    print(f" = ({rec_terms}) / {N_FOLDS}")
    print(f" = {sum(rec_vals)*100:.2f}% / {N_FOLDS}")
    print(f" = {avg_rec*100:.2f}%")
    print("\n▶ RATA-RATA F1-SCORE (Weighted Avg)")
    print(f" Rumus : (F1_F1 + F1_F2 + ... + F1_F{N_FOLDS}) / {N_FOLDS}")
    print(f" = ({f1_terms}) / {N_FOLDS}")
    print(f" = {sum(f1_vals)*100:.2f}% / {N_FOLDS}")
    print(f" = {avg_f1*100:.2f}%")
    print("\n" + "-" * 70)
    print(f" TABEL RATA-RATA AKHIR ({N_FOLDS}-Fold Cross Validation)")
    print("-" * 70)
    print(f" {'Metrik':<20} | {'Nilai Rata-rata':>16} | {'Std Dev':>10}")
    print(f" {'-'*20}-+-{'-'*16}-+-{'-'*10}")
    print(f" {'Accuracy':<20} | {avg_acc*100:>15.2f}% | {std_acc*100:>9.2f}%")
    print(f" {'Precision (W.Avg)':<20} | {avg_prec*100:>15.2f}% | {'':>10}")
    print(f" {'Recall (W.Avg)':<20} | {avg_rec*100:>15.2f}% | {'':>10}")
    print(f" {'F1-Score (W.Avg)':<20} | {avg_f1*100:>15.2f}% | {'':>10}")
    print("-" * 70)

    # -- BEST FOLD (first fold with the highest accuracy) --
    best_fold = max(fold_results, key=lambda r: r['Accuracy'])
    print(f"\n Fold terbaik (accuracy tertinggi): Fold {best_fold['Fold']}")
    print(f" Accuracy : {best_fold['Accuracy']*100:.2f}%")
    print(f" Precision : {best_fold['Precision']*100:.2f}%")
    print(f" Recall : {best_fold['Recall']*100:.2f}%")
    print(f" F1-Score : {best_fold['F1-Score']*100:.2f}%")

    # -- RETRAIN ON ALL DATA (FINAL MODEL) --
    print("\n" + "=" * 70)
    print(" TRAINING MODEL FINAL DENGAN SELURUH DATA")
    print("=" * 70)
    print(f" Parameter SVM: C={best_C}, gamma={best_gamma}")
    print(f" Jumlah total data : {len(X)}")
    final_vectorizer = TfidfVectorizer(**TFIDF_PARAMS)
    X_final_tfidf = final_vectorizer.fit_transform(X)
    # probability=True is kept here: predict_sentiment() calls predict_proba().
    final_model = SVC(kernel='rbf', C=best_C, gamma=best_gamma, probability=True)
    final_model.fit(X_final_tfidf, y)
    print("✓ Training model final selesai!")
    # Replace the analyzer's model & vectorizer with the final ones
    analyzer.model = final_model
    analyzer.vectorizer = final_vectorizer

    # -- ALPHA VALUES (Lagrange multipliers) OF THE SVM --
    dual_coefs = analyzer.model.dual_coef_
    # dual_coef_ may be a sparse matrix or a dense array; handle both.
    dense_coefs = dual_coefs.toarray() if hasattr(dual_coefs, 'toarray') else np.asarray(dual_coefs)
    alpha_values = np.abs(dense_coefs).ravel()
    n_sv = analyzer.model.support_vectors_.shape[0]
    avg_alpha = float(np.mean(alpha_values))
    min_alpha = float(np.min(alpha_values))
    max_alpha = float(np.max(alpha_values))
    print("\n" + "-" * 55)
    print(" NILAI ALPHA (Lagrange Multiplier) MODEL SVM FINAL")
    print("-" * 55)
    print(" Keterangan : α_i diperoleh dari |dual_coef_| model")
    print(f" Jumlah Support Vector : {n_sv}")
    print(f" Rata-rata α : {avg_alpha:.6f}")
    print(f" α Minimum : {min_alpha:.6f}")
    print(f" α Maksimum : {max_alpha:.6f}")
    print("-" * 55)
    print(f" ★ Gunakan nilai Rata-rata α = {avg_alpha:.6f}")
    print(" sebagai given value untuk hitungan manual SVM di skripsi.")
    print("-" * 55)

    # -- SAVE METRICS AS JSON --
    metrics = {
        'k_fold': N_FOLDS,
        'best_C': best_C,
        'best_gamma': best_gamma,
        'kfold_results': [
            {
                'fold': r['Fold'],
                'accuracy': float(r['Accuracy']),
                'precision': float(r['Precision']),
                'recall': float(r['Recall']),
                'f1_score': float(r['F1-Score']),
                'TP': int(r['TP']), 'TN': int(r['TN']),
                'FP': int(r['FP']), 'FN': int(r['FN']),
            }
            for r in fold_results
        ],
        'average': {
            'accuracy': float(avg_acc),
            'precision': float(avg_prec),
            'recall': float(avg_rec),
            'f1_score': float(avg_f1),
            'std_accuracy': float(std_acc),
        },
        'best_fold': int(best_fold['Fold']),
        'alpha': {
            'average': avg_alpha,
            'minimum': min_alpha,
            'maximum': max_alpha,
            'n_support_vectors': int(n_sv),
        }
    }
    with open('model_metrics.json', 'w', encoding='utf-8') as f:
        json.dump(metrics, f, indent=4)
    print("✓ Model metrics saved to model_metrics.json")
    # Save the final model
    analyzer.save_model('sentiment_model.pkl')
    print("✓ Model final berhasil disimpan ke sentiment_model.pkl!")
# Run the full tuning / evaluation / training workflow only when the file is
# executed as a script, not when it is imported.
if __name__ == "__main__":
    main()