analisis-sentimen-program-m.../dashboard.py

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter
from sentiment_model import SentimentAnalyzer
import warnings
from dateutil import parser as date_parser
from datetime import datetime
import re
import os
warnings.filterwarnings('ignore')

# Base directory: the folder that contains dashboard.py.
# Resolving data/model files against it keeps the paths correct both on
# Streamlit Cloud and when running locally.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Font used for matplotlib figures (chosen to render Indonesian text)
plt.rcParams['font.family'] = 'DejaVu Sans'

# Page configuration
st.set_page_config(
    page_title="Analisis Sentimen Bahasa Indonesia",
    page_icon="📊",
    layout="wide"
)

# Inject the custom CSS used for styling (header, metric cards, alerts)
st.markdown("""
<style>
.main-header {
    font-size: 3rem;
    color: #1f77b4;
    text-align: center;
    margin-bottom: 2rem;
}
.metric-card {
    background-color: #f0f2f6;
    padding: 1rem;
    border-radius: 0.5rem;
    margin: 0.5rem 0;
}
.stAlert > div {
    padding: 1rem;
}
</style>
""", unsafe_allow_html=True)

@st.cache_data
def load_data():
    """Read the preprocessed dataset (``mbg_processed.csv``) next to this file.

    Returns:
        The loaded DataFrame, or ``None`` after showing a Streamlit error
        when the file is missing or cannot be read.
    """
    csv_path = os.path.join(BASE_DIR, 'mbg_processed.csv')
    try:
        return pd.read_csv(csv_path)
    except FileNotFoundError:
        st.error("File mbg_processed.csv tidak ditemukan. Jalankan sentiment_model.py terlebih dahulu.")
    except Exception as exc:
        st.error(f"Error memuat data: {exc}")
    return None

@st.cache_resource
def load_model():
    """Return a ready-to-use ``SentimentAnalyzer``.

    The pickled model is tried first. If it is missing, fails to load, or
    its vectorizer has no ``idf_`` attribute, the model is retrained from
    ``data_mbg_labelled.csv`` and a fresh pickle is written on a
    best-effort basis.

    Returns:
        The analyzer, or ``None`` after showing a Streamlit error when
        neither loading nor retraining is possible.
    """
    analyzer = SentimentAnalyzer()
    model_path = os.path.join(BASE_DIR, 'sentiment_model.pkl')
    data_path = os.path.join(BASE_DIR, 'data_mbg_labelled.csv')

    # --- First choice: restore the pickled model ---
    if os.path.exists(model_path):
        try:
            analyzer.load_model(model_path)
            if hasattr(analyzer.vectorizer, 'idf_'):
                return analyzer
        except Exception:
            pass  # corrupt pickle / different version -> fall through to retraining

    # --- Fallback: retrain from the labelled CSV ---
    if not os.path.exists(data_path):
        st.error(
            f"❌ File model ({model_path}) tidak valid DAN "
            f"dataset ({data_path}) tidak ditemukan. "
            "Pastikan kedua file sudah di-commit ke GitHub."
        )
        return None

    try:
        training_df = analyzer.load_and_preprocess_data(data_path)
        analyzer.train_and_evaluate_model(training_df)
    except Exception as exc:
        st.error(f"❌ Gagal melatih model: {exc}")
        return None

    # Persist the new pickle so the next start can load it directly.
    try:
        analyzer.save_model(model_path)
    except Exception:
        pass  # saving may fail (e.g. read-only filesystem); ignore

    return analyzer

def create_pie_chart(df):
    """Build a donut chart of the sentiment class distribution.

    Args:
        df: DataFrame with a ``sentiment`` column; label 0 is shown as
            "Negatif" and label 1 as "Positif".

    Returns:
        A ``plotly.graph_objects.Figure``.
    """
    sentiment_counts = df['sentiment'].value_counts()
    labels = ['Negatif', 'Positif']
    # Use .get() so a dataset that contains only one class (or no rows)
    # produces a zero-sized slice instead of raising KeyError.
    values = [sentiment_counts.get(0, 0), sentiment_counts.get(1, 0)]
    colors = ['#ff7f7f', '#7fbf7f']

    fig = go.Figure(data=[go.Pie(
        labels=labels,
        values=values,
        hole=0.3,
        marker=dict(colors=colors),
        texttemplate='<b>%{label}</b><br>%{value}<br>(%{percent})'
    )])

    fig.update_traces(
        textposition='inside',
        textfont_size=12
    )

    fig.update_layout(
        title={
            'text': "Distribusi Sentimen Dataset",
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 16}
        },
        font=dict(size=12),
        showlegend=True,
        height=400
    )

    return fig

def create_wordcloud(text, title, colormap='viridis'):
    """Render a word cloud for a collection of texts as a matplotlib figure.

    Args:
        text: Sized collection of strings; the items are joined with single
            spaces before being handed to ``WordCloud``.
        title: Title drawn above the cloud.
        colormap: Colormap name passed through to ``WordCloud``.

    Returns:
        A matplotlib ``Figure``, or ``None`` when there is nothing to draw
        (empty input, whitespace-only text, or no usable words).
    """
    if len(text) == 0:
        return None

    # Merge all documents into one string
    combined_text = ' '.join(text)

    if not combined_text.strip():
        return None

    try:
        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            colormap=colormap,
            max_words=100,
            relative_scaling=0.5,
            min_font_size=10,
            prefer_horizontal=0.9,
            collocations=False
        ).generate(combined_text)
    except ValueError:
        # WordCloud raises ValueError when its tokenisation leaves no words
        # to plot (e.g. only very short tokens or stopwords). Callers already
        # treat None as "no cloud", so report it the same way as empty input.
        return None

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title, fontsize=16, fontweight='bold', pad=20)

    return fig

def get_top_words(texts, n=20):
    """Return the *n* most frequent words across *texts*.

    Args:
        texts: Iterable of documents. Entries that are not strings
            (``None``, NaN, numbers, ...) are skipped instead of raising.
        n: Maximum number of (word, count) pairs to return.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent. Words of two characters or fewer are ignored.
    """
    word_freq = Counter()
    for text in texts:
        # Only real strings can be tokenised; this also covers None/NaN.
        if not isinstance(text, str):
            continue
        # Drop very short tokens
        word_freq.update(word for word in text.split() if len(word) > 2)

    return word_freq.most_common(n)

def detect_date_column(df):
    """Guess which column of *df* holds dates.

    A column whose lower-cased, stripped name is one of the well-known
    date names wins immediately. Otherwise each object-dtype column is
    probed: up to ten non-null values are parsed with
    ``parse_date_robust`` and the first column where more than 70% of them
    parse is returned.

    Returns:
        The column label, or ``None`` when no candidate is found.
    """
    known_date_names = {
        'created_at', 'date', 'timestamp', 'time', 'datetime', 'created',
        'posted_at', 'published_at', 'tanggal', 'waktu', 'created_date',
        'post_date', 'tweet_created_at', 'creation_date', 'date_created'
    }

    # Pass 1: case-insensitive match on the column name
    for column in df.columns:
        if column.lower().strip() in known_date_names:
            return column

    # Pass 2: inspect the content of string-like columns
    for column in df.columns:
        is_object = df[column].dtype == 'object'
        if not is_object:
            continue

        sample = df[column].dropna().head(10)
        if len(sample) == 0:
            continue

        parsed_count = sum(
            1 for value in sample if parse_date_robust(str(value)) is not None
        )
        # More than 70% parseable -> treat the column as dates
        if parsed_count / len(sample) > 0.7:
            return column

    return None

def parse_date_robust(date_string):
    """Parse a date given in one of many possible formats.

    The value is first handed to ``dateutil``'s fuzzy parser; if that
    fails, a fixed list of ``strptime`` formats is tried in order.

    Args:
        date_string: Value to parse. It is converted with ``str()`` and
            stripped before parsing.

    Returns:
        A ``datetime`` on success, or ``None`` for missing/empty input and
        for text that none of the strategies can parse.
    """
    if pd.isna(date_string) or not date_string or str(date_string).strip() == '':
        return None

    date_string = str(date_string).strip()

    try:
        # dateutil's parser is very flexible, so try it first
        return date_parser.parse(date_string, fuzzy=True)
    except Exception:
        # Unparseable for dateutil: fall through to the explicit formats.
        # `except Exception` (not a bare `except`) so that
        # KeyboardInterrupt / SystemExit are no longer swallowed here.
        pass

    # Explicit formats as a fallback for anything dateutil did not handle
    date_formats = (
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%m/%d/%Y',
        '%d-%m-%Y',
        '%Y/%m/%d',
        '%d %b %Y',
        '%d %B %Y',
        '%b %d, %Y',
        '%B %d, %Y',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%d %H:%M:%S.%f',
    )

    for fmt in date_formats:
        try:
            return datetime.strptime(date_string, fmt)
        except ValueError:
            # strptime signals a format mismatch with ValueError
            continue

    return None

def get_date_range(df, date_column):
    """Return the earliest and latest parseable date in *date_column*.

    Args:
        df: DataFrame to inspect.
        date_column: Name of the column holding date values, or ``None``.

    Returns:
        ``(min_date, max_date)`` as the datetimes produced by
        ``parse_date_robust``, or ``(None, None)`` when the column is
        missing or no value can be parsed.
    """
    if date_column is None or date_column not in df.columns:
        return None, None

    # Parse every non-null value, keeping only the ones that parsed
    parsed_values = (parse_date_robust(val) for val in df[date_column].dropna())
    dates = [parsed for parsed in parsed_values if parsed is not None]

    if not dates:
        return None, None

    def _comparable(dt):
        # Naive and timezone-aware datetimes cannot be compared directly
        # (TypeError). Aware values are shifted to UTC and stripped of
        # tzinfo for comparison only; the original objects are returned.
        offset = dt.utcoffset()
        if offset is None:
            return dt
        return dt.replace(tzinfo=None) - offset

    min_date = min(dates, key=_comparable)
    max_date = max(dates, key=_comparable)

    return min_date, max_date

def get_temporal_statistics(df, date_column):
    """Count the parseable dates in *date_column* per year and per month.

    Args:
        df: DataFrame to inspect.
        date_column: Name of the column holding date values, or ``None``.

    Returns:
        ``None`` when the column is missing or nothing can be parsed,
        otherwise a dict with:

        - ``yearly_counts``: Series of counts indexed by year, ascending.
        - ``monthly_counts``: Series of counts indexed by ``"YYYY-MM"``
          strings, ascending.
        - ``monthly_labels``: readable labels (e.g. ``"Mei 2024"``) in the
          same order as ``monthly_counts``.
        - ``total_dates``: number of values that were parsed.
    """
    if date_column is None or date_column not in df.columns:
        return None

    # Parse every non-null value, keeping only the ones that parsed
    parsed_values = (parse_date_robust(val) for val in df[date_column].dropna())
    dates = [parsed for parsed in parsed_values if parsed is not None]

    if not dates:
        return None

    # Read year/month straight from the Python datetimes. Going through a
    # pandas datetime column and the `.dt` accessor fails when the parsed
    # values mix naive and timezone-aware datetimes or fall outside the
    # datetime64[ns] range, because the column then stays object-dtype.
    years = pd.Series([d.year for d in dates], name='year')
    year_months = pd.Series(
        [f"{d.year:04d}-{d.month:02d}" for d in dates],
        name='year_month'
    )

    # Counts per year
    yearly_counts = years.value_counts().sort_index()

    # Counts per month (zero-padded keys sort chronologically)
    monthly_counts = year_months.value_counts().sort_index()

    # Indonesian month abbreviations for more readable labels
    month_names_id = {
        1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'Mei', 6: 'Jun',
        7: 'Jul', 8: 'Agt', 9: 'Sep', 10: 'Okt', 11: 'Nov', 12: 'Des'
    }

    monthly_labels = []
    for ym in monthly_counts.index:
        year, month = ym.split('-')
        month_name = month_names_id[int(month)]
        monthly_labels.append(f"{month_name} {year}")

    return {
        'yearly_counts': yearly_counts,
        'monthly_counts': monthly_counts,
        'monthly_labels': monthly_labels,
        'total_dates': len(dates)
    }

def create_temporal_charts(temporal_stats):
    """Build the per-year and per-month bar charts.

    Args:
        temporal_stats: Dict with ``yearly_counts``, ``monthly_counts`` and
            ``monthly_labels`` entries, or ``None``.

    Returns:
        ``(fig_yearly, fig_monthly)`` plotly figures, or ``(None, None)``
        when *temporal_stats* is ``None``.
    """
    if temporal_stats is None:
        return None, None

    def _centered_title(text):
        # Shared title layout for both charts
        return {
            'text': text,
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 16}
        }

    # --- Yearly chart: keep only integer years within 1900-2100 ---
    kept = []
    for raw_year, total in temporal_stats['yearly_counts'].items():
        try:
            as_int = int(raw_year)
        except (ValueError, TypeError):
            # Not convertible to an integer year: skip it
            continue
        if as_int < 1900 or as_int > 2100:
            continue
        kept.append((str(as_int), total))

    year_labels = [label for label, _ in kept]
    year_totals = [total for _, total in kept]

    yearly_bar = go.Bar(
        x=year_labels,
        y=year_totals,
        marker_color='lightblue',
        text=year_totals,
        textposition='auto',
        hovertemplate='<b>Tahun %{x}</b><br>Jumlah: %{y}<extra></extra>'
    )
    fig_yearly = go.Figure([yearly_bar])
    fig_yearly.update_layout(
        title=_centered_title("Distribusi Data per Tahun"),
        xaxis_title="Tahun",
        yaxis_title="Jumlah Data",
        xaxis={'type': 'category'},  # categorical axis prevents fractional year ticks
        height=400,
        showlegend=False,
        hovermode='x'
    )

    # --- Monthly chart ---
    month_totals = temporal_stats['monthly_counts'].values
    monthly_bar = go.Bar(
        x=temporal_stats['monthly_labels'],
        y=month_totals,
        marker_color='lightcoral',
        text=month_totals,
        textposition='auto',
        hovertemplate='<b>%{x}</b><br>Jumlah: %{y}<extra></extra>'
    )
    fig_monthly = go.Figure([monthly_bar])
    fig_monthly.update_layout(
        title=_centered_title("Distribusi Data per Bulan"),
        xaxis_title="Bulan",
        yaxis_title="Jumlah Data",
        height=400,
        showlegend=False,
        hovermode='x',
        xaxis={'tickangle': -45}
    )

    return fig_yearly, fig_monthly

def show_preprocessing_steps(analyzer, text):
    """Render a table with the result of every preprocessing stage.

    Args:
        analyzer: Object whose ``preprocess_text(text, show_steps=True)``
            returns ``(processed_text, steps)``; ``steps`` is read with the
            keys ``original``, ``cleaned``, ``casefolded``, ``tokenized``,
            ``normalized``, ``no_stopwords``, ``stemmed`` and ``final``.
        text: Raw user input.

    Returns:
        The processed text, or ``None`` when *text* is blank.
    """
    if text.strip():
        processed_text, steps = analyzer.preprocess_text(text, show_steps=True)

        st.subheader("📋 Tahapan Preprocessing")

        # One row per preprocessing stage
        steps_data = [
            ["🔤 Teks Asli", steps['original']],
            ["🧹 Pembersihan", steps['cleaned']],
            ["📝 Case Folding", steps['casefolded']],
            ["✂️ Tokenisasi", str(steps['tokenized'])],
            ["🔄 Normalisasi", str(steps['normalized'])],
            ["🚫 Hapus Stopwords", str(steps['no_stopwords'])],
            ["🌱 Stemming", str(steps['stemmed'])],
            ["✅ Hasil Akhir", steps['final']]
        ]

        steps_df = pd.DataFrame(steps_data, columns=["Tahap", "Hasil"])

        # Row colouring: first row (original) blue, last row (final) green
        def highlight_rows(row):
            if row.name == 0:  # Original text
                return ['background-color: #e8f4fd; color: black'] * len(row)
            elif row.name == len(steps_df) - 1:  # Final result
                return ['background-color: #e8f5e8; color: black'] * len(row)
            else:
                return ['background-color: #f9f9f9; color: black'] * len(row)

        # The table is injected with unsafe_allow_html, and it contains the
        # raw user text. Escape cell contents so that characters such as
        # "<" or "&" are displayed literally instead of being interpreted as
        # markup. (Styler escapes via format(); an `escape` keyword passed to
        # Styler.to_html is only forwarded to the template and has no effect.)
        styled_df = steps_df.style.format(escape="html").apply(highlight_rows, axis=1)

        # Render as HTML inside a wrapper so the table can scroll horizontally
        html_table = styled_df.to_html()
        st.markdown(
            f"""
            <div style="overflow-x: auto; max-width: 100%;">
                {html_table}
            </div>
            """,
            unsafe_allow_html=True
        )

        return processed_text
    return None


def show_examples():
    """Show one button per sample sentence and return the clicked sample.

    Returns:
        The text of the first example whose button reports a click in this
        run, or ``None`` when no button was clicked.
    """
    st.subheader("💡 Contoh Teks untuk Dicoba")

    samples = (
        "MBG program yang bermanfaat untuk generasi emas!",
        "Makan Bergizi Gratis merupakan bentuk komiitmen pemerintah dalam perbaikan gizi",
        "makan bergizi gratis ini bagus banget",
        "MBG program buang buang anggaran",
        "Makan Bergizi Gratis menjadi Makan Beracun Gratis",
        "MBG menjadi ladang korupsi",
        "Stop mbg sebelum jatuh lebih banyak korban keracunan",
    )

    for number, sample in enumerate(samples, start=1):
        # The button label shows at most the first 50 characters
        clicked = st.button(f"Contoh {number}: {sample[:50]}...", key=f"example_{number}")
        if clicked:
            return sample

    return None

def main():
    """Entry point of the dashboard: header, resource loading and routing."""

    # Header
    st.markdown('<h1 class="main-header">📊 Analisis Sentimen Program MBG Bahasa Indonesia</h1>', unsafe_allow_html=True)

    # Load the dataset and the model; halt the run if either is unavailable
    df = load_data()
    analyzer = load_model()

    if df is None or analyzer is None:
        st.stop()

    # Page label -> renderer. Lambdas defer the lookup of each page function
    # until that page is actually selected.
    routes = {
        "📈 Dashboard Utama": lambda: show_main_dashboard(df),
        "🔮 Prediksi Sentimen": lambda: show_prediction_page(analyzer),
        "📋 Demo Preprocessing": lambda: show_preprocessing_demo(analyzer),
        "📁 Analisis CSV": lambda: show_csv_analysis_page(analyzer),
    }

    # Sidebar navigation
    st.sidebar.header("🧭 Navigasi")
    page = st.sidebar.selectbox("Pilih Halaman:", list(routes))

    render_page = routes.get(page)
    if render_page is not None:
        render_page()

def _render_sentiment_overview(df):
    """Render headline metrics, distribution charts, word clouds and top-word charts.

    Args:
        df: DataFrame with a ``sentiment`` column (0 = negative,
            1 = positive) and a ``processed_text`` column.
    """
    # Metrics
    col1, col2, col3, col4 = st.columns(4)

    total_data = len(df)
    positive_count = len(df[df['sentiment'] == 1])
    negative_count = len(df[df['sentiment'] == 0])
    # An empty dataset must not crash the page with ZeroDivisionError.
    positive_ratio = (positive_count / total_data) * 100 if total_data else 0.0

    with col1:
        st.metric("📄 Total Data", f"{total_data:,}")
    with col2:
        st.metric("😊 Sentimen Positif", f"{positive_count:,}")
    with col3:
        st.metric("😞 Sentimen Negatif", f"{negative_count:,}")
    with col4:
        st.metric("📊 Rasio Positif", f"{positive_ratio:.1f}%")

    # Charts
    col1, col2 = st.columns(2)

    with col1:
        # Pie chart (skipped when there are no rows to chart)
        if total_data:
            fig_pie = create_pie_chart(df)
            st.plotly_chart(fig_pie, use_container_width=True)
        else:
            st.info("Tidak ada data sentimen untuk ditampilkan")

    with col2:
        # Bar chart of the most frequent words overall
        all_texts = df['processed_text'].dropna().tolist()
        top_words = get_top_words(all_texts, 15)

        if top_words:
            words, counts = zip(*top_words)
            fig_bar = go.Figure([go.Bar(
                x=list(counts),
                y=list(words),
                orientation='h',
                marker_color='lightblue'
            )])
            fig_bar.update_layout(
                title="15 Kata Teratas (Keseluruhan)",
                xaxis_title="Frekuensi",
                yaxis_title="Kata",
                height=400
            )
            st.plotly_chart(fig_bar, use_container_width=True)

    # Word Clouds
    st.header("☁️ Word Clouds")

    col1, col2 = st.columns(2)

    with col1:
        st.subheader("😊 Sentimen Positif")
        positive_texts = df[df['sentiment'] == 1]['processed_text'].dropna().tolist()
        if positive_texts:
            fig_wc_pos = create_wordcloud(positive_texts, "Word Cloud Sentimen Positif", 'Greens')
            if fig_wc_pos:
                st.pyplot(fig_wc_pos, clear_figure=True)
                # Close the figure so pyplot does not keep one more open
                # figure alive on every rerun of the script.
                plt.close(fig_wc_pos)
        else:
            st.info("Tidak ada data sentimen positif")

    with col2:
        st.subheader("😞 Sentimen Negatif")
        negative_texts = df[df['sentiment'] == 0]['processed_text'].dropna().tolist()
        if negative_texts:
            fig_wc_neg = create_wordcloud(negative_texts, "Word Cloud Sentimen Negatif", 'Reds')
            if fig_wc_neg:
                st.pyplot(fig_wc_neg, clear_figure=True)
                plt.close(fig_wc_neg)
        else:
            st.info("Tidak ada data sentimen negatif")

    # Top words by sentiment
    st.header("📈 Kata-kata Teratas per Sentimen")

    col1, col2 = st.columns(2)

    with col1:
        st.subheader("😊 Top 15 Kata Sentimen Positif")
        if positive_texts:
            top_words_pos = get_top_words(positive_texts, 15)
            if top_words_pos:
                words_pos, counts_pos = zip(*top_words_pos)

                fig_pos = go.Figure([go.Bar(
                    x=list(counts_pos),
                    y=list(words_pos),
                    orientation='h',
                    marker_color='lightgreen'
                )])
                fig_pos.update_layout(
                    xaxis_title="Frekuensi",
                    yaxis_title="Kata",
                    height=400
                )
                st.plotly_chart(fig_pos, use_container_width=True)

    with col2:
        st.subheader("😞 Top 15 Kata Sentimen Negatif")
        if negative_texts:
            top_words_neg = get_top_words(negative_texts, 15)
            if top_words_neg:
                words_neg, counts_neg = zip(*top_words_neg)

                fig_neg = go.Figure([go.Bar(
                    x=list(counts_neg),
                    y=list(words_neg),
                    orientation='h',
                    marker_color='lightcoral'
                )])
                fig_neg.update_layout(
                    xaxis_title="Frekuensi",
                    yaxis_title="Kata",
                    height=400
                )
                st.plotly_chart(fig_neg, use_container_width=True)


def _render_training_dataset():
    """Show the labelled training CSV (date, text, sentiment) as a table."""
    dataset_path = os.path.join(BASE_DIR, 'data_mbg_labelled.csv')

    try:
        # Load dataset
        df_dataset = pd.read_csv(dataset_path)

        # Only the three columns needed for display
        if all(col in df_dataset.columns for col in ['created_at', 'text', 'sentiment']):
            display_df = df_dataset[['created_at', 'text', 'sentiment']].copy()

            # Map the numeric label to a readable one
            display_df['sentiment'] = display_df['sentiment'].map({0: 'Negatif', 1: 'Positif'})

            # Friendlier column headers for display
            display_df = display_df.rename(columns={
                'created_at': 'Tanggal Dibuat',
                'text': 'Teks',
                'sentiment': 'Sentimen'
            })

            # Show the table at full container width
            st.dataframe(
                display_df,
                use_container_width=True,
                height=400
            )
        else:
            st.warning("⚠️ Kolom yang diperlukan (created_at, text, sentiment) tidak ditemukan dalam dataset.")
    except FileNotFoundError:
        st.error(f"❌ File dataset '{dataset_path}' tidak ditemukan.")
    except Exception as e:
        st.error(f"❌ Error saat memuat dataset: {str(e)}")


def _render_model_performance():
    """Show the cross-validation metrics stored in ``model_metrics.json``.

    Reads the ``average`` and ``kfold_results`` entries (plus the optional
    ``k_fold``, ``best_C``, ``best_gamma`` and ``best_fold``) and renders
    average metric cards, a per-fold table, the confusion matrix summed
    over all folds with a derived classification report, and a per-fold
    accuracy line chart. Any failure is reported as a Streamlit
    warning/error instead of propagating.
    """
    try:
        import json
        metrics_path = os.path.join(BASE_DIR, 'model_metrics.json')
        with open(metrics_path, 'r') as f:
            metrics = json.load(f)

        avg    = metrics['average']
        k_fold = metrics.get('k_fold', 5)
        best_C     = metrics.get('best_C', '-')
        best_gamma = metrics.get('best_gamma', '-')

        # -- Model info --
        st.info(
            f"ℹ️ Model SVM (RBF kernel) dievaluasi dengan **{k_fold}-Fold Stratified Cross Validation**. "
            f"Parameter terbaik dari Grid Search: **C={best_C}**, **gamma={best_gamma}**."
        )

        # -- Average metric cards --
        st.subheader("📈 Rata-Rata Metrik Semua Fold")
        col1, col2, col3, col4, col5 = st.columns(5)
        with col1:
            st.metric("🎯 Accuracy (Avg)", f"{avg['accuracy']:.2%}")
        with col2:
            st.metric("📈 Precision (Avg)", f"{avg['precision']:.2%}")
        with col3:
            st.metric("📊 Recall (Avg)", f"{avg['recall']:.2%}")
        with col4:
            st.metric("⚖️ F1-Score (Avg)", f"{avg['f1_score']:.2%}")
        with col5:
            st.metric("📉 Std Dev Accuracy", f"{avg['std_accuracy']:.2%}")

        st.markdown("---")

        # -- Per-fold performance table --
        st.subheader(f"📋 Hasil Per Fold ({k_fold}-Fold)")
        fold_rows = []
        for r in metrics['kfold_results']:
            fold_rows.append({
                'Fold': f"Fold {r['fold']}",
                'Accuracy':  f"{r['accuracy']:.4f}  ({r['accuracy']:.2%})",
                'Precision': f"{r['precision']:.4f}  ({r['precision']:.2%})",
                'Recall':    f"{r['recall']:.4f}  ({r['recall']:.2%})",
                'F1-Score':  f"{r['f1_score']:.4f}  ({r['f1_score']:.2%})",
                'TP': r['TP'], 'TN': r['TN'], 'FP': r['FP'], 'FN': r['FN'],
            })
        fold_df = pd.DataFrame(fold_rows)
        st.dataframe(fold_df, use_container_width=True, hide_index=True)

        # -- Best fold --
        best_fold_num = metrics.get('best_fold', '-')
        st.success(f"✅ **Fold terbaik berdasarkan Accuracy: Fold {best_fold_num}**")

        st.markdown("---")

        # -- Aggregated confusion matrix & classification report --
        col_left, col_right = st.columns(2)

        # Sum the confusion-matrix cells over all folds
        agg_tp = sum(r['TP'] for r in metrics['kfold_results'])
        agg_tn = sum(r['TN'] for r in metrics['kfold_results'])
        agg_fp = sum(r['FP'] for r in metrics['kfold_results'])
        agg_fn = sum(r['FN'] for r in metrics['kfold_results'])
        agg_total = agg_tp + agg_tn + agg_fp + agg_fn
        agg_sup_neg = agg_tn + agg_fp  # actual negatives
        agg_sup_pos = agg_tp + agg_fn  # actual positives

        # Per-class precision / recall / F1 (0 when the denominator is 0)
        agg_prec_neg = agg_tn / (agg_tn + agg_fn) if (agg_tn + agg_fn) > 0 else 0
        agg_prec_pos = agg_tp / (agg_tp + agg_fp) if (agg_tp + agg_fp) > 0 else 0
        agg_rec_neg  = agg_tn / (agg_tn + agg_fp) if (agg_tn + agg_fp) > 0 else 0
        agg_rec_pos  = agg_tp / (agg_tp + agg_fn) if (agg_tp + agg_fn) > 0 else 0
        agg_f1_neg   = (2 * agg_prec_neg * agg_rec_neg / (agg_prec_neg + agg_rec_neg)
                        if (agg_prec_neg + agg_rec_neg) > 0 else 0)
        agg_f1_pos   = (2 * agg_prec_pos * agg_rec_pos / (agg_prec_pos + agg_rec_pos)
                        if (agg_prec_pos + agg_rec_pos) > 0 else 0)
        # Support-weighted averages
        agg_prec_w   = (agg_prec_neg * agg_sup_neg + agg_prec_pos * agg_sup_pos) / agg_total
        agg_rec_w    = (agg_rec_neg  * agg_sup_neg + agg_rec_pos  * agg_sup_pos) / agg_total
        agg_f1_w     = (agg_f1_neg   * agg_sup_neg + agg_f1_pos   * agg_sup_pos) / agg_total
        agg_acc      = (agg_tp + agg_tn) / agg_total

        with col_left:
            st.subheader(f"🔢 Confusion Matrix Gabungan ({k_fold} Fold)")
            st.caption("Merupakan jumlah TP, TN, FP, FN dari seluruh fold.")

            import plotly.figure_factory as ff
            z      = [[agg_tn, agg_fp], [agg_fn, agg_tp]]
            x_labs = ['Prediksi Negatif', 'Prediksi Positif']
            y_labs = ['Aktual Negatif',   'Aktual Positif']
            z_text = [
                [f"TN: {agg_tn}", f"FP: {agg_fp}"],
                [f"FN: {agg_fn}", f"TP: {agg_tp}"],
            ]
            fig_cm = ff.create_annotated_heatmap(
                z, x=x_labs, y=y_labs,
                annotation_text=z_text,
                colorscale='Blues', showscale=True
            )
            fig_cm.update_layout(
                title=f"Confusion Matrix Gabungan ({k_fold}-Fold)",
                xaxis_title="Prediksi",
                yaxis_title="Aktual",
                height=400
            )
            st.plotly_chart(fig_cm, use_container_width=True)

            st.markdown(f"""
**Keterangan nilai:**
- **TN** = {agg_tn} &emsp; **FP** = {agg_fp}
- **FN** = {agg_fn} &emsp; **TP** = {agg_tp}
- **Total data** = {agg_total}
- **Accuracy (CM Gabungan)** = (TP+TN)/Total = ({agg_tp}+{agg_tn})/{agg_total} = **{agg_acc:.2%}**

**Penjelasan:**
- **TN (True Negative)**: Prediksi Negatif, Aktual Negatif ✅
- **TP (True Positive)**: Prediksi Positif, Aktual Positif ✅
- **FP (False Positive)**: Prediksi Positif, Aktual Negatif ❌ (Error Tipe I)
- **FN (False Negative)**: Prediksi Negatif, Aktual Positif ❌ (Error Tipe II)
""")

        with col_right:
            st.subheader("📋 Classification Report (CM Gabungan)")

            report_data = [
                {
                    'Kelas'    : 'Negatif',
                    'Precision': f"{agg_prec_neg:.4f}  ({agg_prec_neg:.2%})",
                    'Recall'   : f"{agg_rec_neg:.4f}  ({agg_rec_neg:.2%})",
                    'F1-Score' : f"{agg_f1_neg:.4f}  ({agg_f1_neg:.2%})",
                    'Support'  : agg_sup_neg,
                },
                {
                    'Kelas'    : 'Positif',
                    'Precision': f"{agg_prec_pos:.4f}  ({agg_prec_pos:.2%})",
                    'Recall'   : f"{agg_rec_pos:.4f}  ({agg_rec_pos:.2%})",
                    'F1-Score' : f"{agg_f1_pos:.4f}  ({agg_f1_pos:.2%})",
                    'Support'  : agg_sup_pos,
                },
                {
                    'Kelas'    : 'Weighted Avg',
                    'Precision': f"{agg_prec_w:.4f}  ({agg_prec_w:.2%})",
                    'Recall'   : f"{agg_rec_w:.4f}  ({agg_rec_w:.2%})",
                    'F1-Score' : f"{agg_f1_w:.4f}  ({agg_f1_w:.2%})",
                    'Support'  : agg_total,
                },
            ]
            report_df = pd.DataFrame(report_data)
            st.dataframe(report_df, use_container_width=True, hide_index=True)

            st.markdown("""
**Penjelasan Metrik:**
- **Precision**: Dari semua prediksi kelas X, berapa persen yang benar?
- **Recall**: Dari semua data aktual kelas X, berapa persen yang berhasil diprediksi?
- **F1-Score**: Harmonic mean dari Precision dan Recall
- **Weighted Avg**: Rata-rata tertimbang berdasarkan support (jumlah data aktual per kelas)
""")

            # Grouped bar chart comparing the metrics of both classes
            fig_bar = go.Figure()
            for metric_label, neg_val, pos_val in [
                ('Precision', agg_prec_neg, agg_prec_pos),
                ('Recall',    agg_rec_neg,  agg_rec_pos),
                ('F1-Score',  agg_f1_neg,   agg_f1_pos),
            ]:
                fig_bar.add_trace(go.Bar(
                    name=metric_label,
                    x=['Negatif', 'Positif'],
                    y=[neg_val, pos_val],
                    text=[f"{neg_val:.2%}", f"{pos_val:.2%}"],
                    textposition='auto'
                ))
            # Zoom the y-axis around the actual values so that the
            # differences between bars stay visible
            all_metric_vals = [agg_prec_neg, agg_prec_pos,
                               agg_rec_neg,  agg_rec_pos,
                               agg_f1_neg,   agg_f1_pos]
            bar_ymin = max(0, min(all_metric_vals) - 0.05)  # 5pp below the smallest value
            bar_ymax = min(1, max(all_metric_vals) + 0.03)  # 3pp above the largest value
            fig_bar.update_layout(
                title="Perbandingan Metrik per Kelas (CM Gabungan)",
                xaxis_title="Kelas",
                yaxis_title="Score",
                barmode='group',
                height=350,
                yaxis=dict(range=[bar_ymin, bar_ymax])
            )
            st.plotly_chart(fig_bar, use_container_width=True)

        st.markdown("---")

        # -- Line chart of accuracy per fold --
        st.subheader("📉 Akurasi per Fold")
        fold_nums  = [r['fold']     for r in metrics['kfold_results']]
        fold_accs  = [r['accuracy'] for r in metrics['kfold_results']]
        fig_line = go.Figure()
        # Zoom the y-axis around the actual values; leave headroom so the
        # text labels are not clipped
        line_ymin = max(0, min(fold_accs) - 0.05)
        line_ymax = min(1, max(fold_accs) + 0.06)  # extra room for text labels
        fig_line.add_trace(go.Scatter(
            x=[f"Fold {n}" for n in fold_nums],
            y=fold_accs,
            mode='lines+markers+text',
            text=[f"{v:.2%}" for v in fold_accs],
            textposition='bottom center',  # below the marker so the top edge does not clip it
            marker=dict(size=10, color='steelblue'),
            line=dict(width=2),
            name='Accuracy'
        ))
        fig_line.add_hline(
            y=avg['accuracy'],
            line_dash='dash',
            line_color='red',
            annotation_text=f"Rata-rata: {avg['accuracy']:.2%}",
            annotation_position='bottom right'
        )
        fig_line.update_layout(
            title="Akurasi Setiap Fold",
            xaxis_title="Fold",
            yaxis_title="Accuracy",
            height=400,
            yaxis=dict(range=[line_ymin, line_ymax]),
            margin=dict(t=50, b=60)  # room above and below
        )
        st.plotly_chart(fig_line, use_container_width=True)

    except FileNotFoundError:
        st.warning("⚠️ File model_metrics.json tidak ditemukan. Jalankan sentiment_model.py terlebih dahulu.")
    except Exception as e:
        st.error(f"❌ Error saat memuat metrik model: {str(e)}")


def show_main_dashboard(df):
    """Render the main dashboard page.

    The page has three sections: the sentiment overview of *df*, the
    labelled training dataset, and the stored model performance metrics.

    Args:
        df: Processed dataset with ``sentiment`` (0/1) and
            ``processed_text`` columns.
    """
    st.header("📊 Dashboard Analisis Sentimen")
    _render_sentiment_overview(df)

    # Dataset display section
    st.markdown("---")
    st.header("📋 Dataset yang Digunakan untuk Training Model")
    _render_training_dataset()

    # Model performance section
    st.markdown("---")
    st.header("📊 Performa Model (5-Fold Cross Validation)")
    _render_model_performance()


def show_prediction_page(analyzer):
    """Render the single-text sentiment prediction page.

    A text area (pre-filled with a sample sentence) collects the input. When
    the analysis button is pressed the input is validated in two steps
    (blank, then symbols-only), passed to ``analyzer.predict_sentiment`` and
    the outcome is drawn as a coloured result card next to a probability bar
    chart, followed by the preprocessing walkthrough for the same text.

    Args:
        analyzer: Object exposing ``predict_sentiment(text)``; the returned
            mapping is read through the keys ``sentiment``, ``confidence``,
            ``probability_negative`` and ``probability_positive``.
    """
    st.header("🔮 Prediksi Sentimen")
    st.write("Masukkan teks bahasa Indonesia untuk memprediksi sentimennya:")

    # Text input, pre-filled with a sample sentence.
    user_input = st.text_area(
        "Teks untuk dianalisis:",
        height=100,
        value="MBG bermanfaat bagi anak-anak dan orangtua",
        placeholder="Contoh: Pelayanannya sangat memuaskan dan staffnya ramah sekali!"
    )

    if st.button("🚀 Analisis Sentimen", type="primary"):
        if not user_input.strip():
            # Blank or whitespace-only input.
            st.warning("⚠️ Silakan masukkan teks terlebih dahulu.")
        else:
            # Remove everything that is neither a word character nor
            # whitespace; if nothing is left the input was symbols only.
            word_content = re.sub(r'[^\w\s]', '', user_input).strip()
            if not word_content:
                st.error("⚠️ Input hanya berisi simbol atau tanda baca. Tidak dapat melakukan analisis sentimen karena tidak ada teks bermakna.")
            else:
                with st.spinner("Menganalisis sentimen..."):
                    outcome = analyzer.predict_sentiment(user_input)
                    label = outcome['sentiment']
                    undetermined = label == 'Tidak dapat menentukan'

                    # The warning is informational only: the card and chart
                    # below are still rendered for an undetermined result.
                    if undetermined:
                        st.warning("⚠️ Tidak dapat menentukan sentimen. Teks mungkin terlalu pendek atau tidak mengandung kata bermakna setelah preprocessing.")

                    card_col, chart_col = st.columns(2)

                    with card_col:
                        # Accent colour, emoji and RGB triplet of the card:
                        # gray for undetermined, green for positive, red for
                        # every other label.
                        if undetermined:
                            accent, face, rgb = "gray", "😐", '128,128,128'
                        elif label == "Positif":
                            accent, face, rgb = "green", "😊", '0,255,0'
                        else:
                            accent, face, rgb = "red", "😞", '255,0,0'

                        st.markdown(f"""
                        <div style="padding: 2rem; border: 3px solid {accent}; border-radius: 1rem; text-align: center; background-color: rgba({rgb}, 0.1);">
                            <h2 style="color: {accent}; margin: 0;">{face} {label}</h2>
                            <h3 style="margin: 0.5rem 0;">Confidence: {outcome['confidence']:.1%}</h3>
                        </div>
                        """, unsafe_allow_html=True)

                    with chart_col:
                        # Two-bar chart of the class probabilities.
                        p_negative = outcome['probability_negative']
                        p_positive = outcome['probability_positive']
                        probability_bars = go.Bar(
                            x=['😞 Negatif', '😊 Positif'],
                            y=[p_negative, p_positive],
                            marker_color=['lightcoral', 'lightgreen'],
                            text=[f"{p_negative:.1%}", f"{p_positive:.1%}"],
                            textposition='auto'
                        )
                        fig = go.Figure([probability_bars])
                        fig.update_layout(
                            title="Probabilitas Sentimen",
                            yaxis_title="Probabilitas",
                            height=300,
                            showlegend=False
                        )
                        st.plotly_chart(fig, use_container_width=True)

                    # Preprocessing walkthrough for the analysed text.
                    st.markdown("---")
                    show_preprocessing_steps(analyzer, user_input)


def show_preprocessing_demo(analyzer):
    """Render the preprocessing demo page.

    The user enters a text and presses the button; the page then shows the
    preprocessing walkthrough for that text (via
    ``show_preprocessing_steps``), a short explanation of every stage and a
    few tips. Until the button is pressed only a hint is displayed.

    Args:
        analyzer: Passed through unchanged to ``show_preprocessing_steps``.
    """
    st.header("📋 Demo Preprocessing")
    st.write("Lihat bagaimana teks bahasa Indonesia diproses melalui setiap tahap preprocessing:")

    # Text input, pre-filled with the same sentence used as placeholder.
    demo_text = st.text_area(
        "Masukkan teks untuk melihat proses preprocessing:",
        height=100,
        placeholder="Contoh: MBG sangat bermanfaat bagi anak-anak dan orangtua!😊",
        value="MBG sangat bermanfaat bagi anak-anak dan orangtua!😊"
    )

    if st.button("🔍 Jalankan Preprocessing", type="primary"):
        if demo_text.strip():
            # Symbols-only input only triggers a warning here; the
            # walkthrough is still shown.
            only_symbols = not re.sub(r'[^\w\s]', '', demo_text).strip()
            if only_symbols:
                st.warning("⚠️ Input hanya berisi simbol atau tanda baca. Sistem akan tetap memproses tetapi hasil mungkin kosong setelah pembersihan.")

            show_preprocessing_steps(analyzer, demo_text)
        else:
            # Blank or whitespace-only input.
            st.warning("⚠️ Silakan masukkan teks terlebih dahulu.")

        # Explanation of every stage, listed in pipeline order.
        st.subheader("📚 Penjelasan Tahapan:")

        stage_notes = (
            ("🔤 Teks Asli", "Teks input yang belum diproses"),
            ("🧹 Pembersihan", "Menghapus URL, mention, hashtag, angka, emoji, dan karakter khusus"),
            ("📝 Case Folding", "Mengubah semua huruf menjadi huruf kecil untuk konsistensi"),
            ("✂️ Tokenisasi", "Memecah teks menjadi token/kata individual"),
            ("🔄 Normalisasi", "Mengubah singkatan dan slang menjadi bentuk baku (contoh: 'bgt' → 'sangat')"),
            ("🚫 Hapus Stopwords", "Menghapus kata-kata umum bahasa Indonesia yang tidak bermakna"),
            ("🌱 Stemming", "Mengubah kata ke bentuk dasarnya menggunakan algoritma Sastrawi"),
            ("✅ Hasil Akhir", "Teks yang sudah siap untuk dianalisis oleh model machine learning"),
        )
        for stage_name, note in stage_notes:
            st.write(f"**{stage_name}**: {note}")

        # General preprocessing tips.
        st.subheader("💡 Tips Preprocessing Bahasa Indonesia:")
        st.info("""
        - **Normalisasi** sangat penting untuk bahasa Indonesia karena banyaknya singkatan dan slang
        - **Stemming** menggunakan algoritma Sastrawi yang dirancang khusus untuk bahasa Indonesia
        - **Stopwords** disesuaikan dengan kata-kata umum bahasa Indonesia
        - **Cleaning** menghapus noise seperti emoji dan karakter khusus yang sering muncul di media sosial
        """)
    else:
        st.info("ℹ️ Masukkan teks untuk melihat tahapan preprocessing.")


def process_csv_predictions(analyzer, df_csv, text_column, confidence_threshold=0.5):
    """Predict the sentiment of every usable row and split the results by confidence.

    Rows whose text cell is missing (NaN/None) or blank after stripping are
    skipped and appear in neither result frame.

    Args:
        analyzer: Object exposing ``predict_sentiment(text)`` that returns a
            mapping with the keys ``sentiment``, ``confidence``,
            ``probability_negative`` and ``probability_positive``.
        df_csv: DataFrame holding the uploaded data.
        text_column: Name of the column in ``df_csv`` that contains the text.
        confidence_threshold: Minimum confidence for a prediction to count
            as confident.

    Returns:
        Tuple ``(df_confident, df_neutral)``: predictions with
        ``confidence >= confidence_threshold`` and predictions below it.
        Both frames always carry the columns ``text``, ``sentiment``,
        ``confidence``, ``probability_negative`` and
        ``probability_positive``, even when no row could be predicted.
    """
    result_columns = [
        'text',
        'sentiment',
        'confidence',
        'probability_negative',
        'probability_positive',
    ]
    results = []

    for raw_value in df_csv[text_column]:
        # Missing cells must be detected BEFORE str(): str(NaN) is "nan" and
        # str(None) is "None", which would otherwise be scored as real text.
        if pd.isna(raw_value):
            continue

        text = str(raw_value)
        if not text.strip():
            continue

        prediction = analyzer.predict_sentiment(text)
        results.append({
            'text': text,
            'sentiment': prediction['sentiment'],
            'confidence': prediction['confidence'],
            'probability_negative': prediction['probability_negative'],
            'probability_positive': prediction['probability_positive']
        })

    # Explicit columns keep the frame well-formed when ``results`` is empty;
    # without them the confidence filter below raises KeyError.
    df_results = pd.DataFrame(results, columns=result_columns)

    # Split on the confidence threshold.
    df_confident = df_results[df_results['confidence'] >= confidence_threshold].copy()
    df_neutral = df_results[df_results['confidence'] < confidence_threshold].copy()

    return df_confident, df_neutral

def create_csv_pie_chart(df_results):
    """Build a donut chart of the sentiment distribution in ``df_results``.

    Only the labels 'Negatif' and 'Positif' are charted, in that order and
    with fixed colours; any other value in the ``sentiment`` column is
    ignored.

    Args:
        df_results: DataFrame with a ``sentiment`` column.

    Returns:
        A plotly ``Figure``, or ``None`` when the frame is empty or contains
        neither of the two charted labels.
    """
    if df_results.empty:
        return None

    counts_by_label = df_results['sentiment'].value_counts()

    # Fixed label order and colours: negative first, then positive.
    palette = (('Negatif', '#ff7f7f'), ('Positif', '#7fbf7f'))
    slices = [
        (name, counts_by_label[name], shade)
        for name, shade in palette
        if name in counts_by_label.index
    ]
    if not slices:
        return None

    labels, values, colors = (list(column) for column in zip(*slices))

    donut = go.Pie(
        labels=labels,
        values=values,
        hole=0.3,
        marker=dict(colors=colors),
        texttemplate='<b>%{label}</b><br>%{value}<br>(%{percent})'
    )
    fig = go.Figure(data=[donut])

    # Slice text goes inside the slices.
    fig.update_traces(textposition='inside', textfont_size=12)

    centered_title = {
        'text': "Distribusi Sentimen CSV",
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 16}
    }
    fig.update_layout(
        title=centered_title,
        font=dict(size=12),
        showlegend=True,
        height=400
    )

    return fig

def show_csv_analysis_page(analyzer):
    """Render the batch CSV sentiment-analysis page.

    The page has two phases:

    1. Upload: read the CSV, show temporal information when a date column is
       detected, pick the text column (``text`` preferred, then
       ``full_text``), let the user choose a confidence threshold and run
       the batch prediction. The results are stored in ``st.session_state``.
    2. Results: whenever processed results exist in the session state, show
       metrics, charts, word clouds, top words, result tables with download
       buttons and a generated summary.

    The threshold used for a run is stored next to the results under the
    ``csv_threshold`` key. The result section reads it from there instead of
    from the slider variable, which is only bound while a file is uploaded;
    this way the section still renders without an uploaded file and its
    labels always describe the threshold that produced the stored split.

    Args:
        analyzer: Object providing ``predict_sentiment(text)`` (used through
            ``process_csv_predictions``) and ``preprocess_text(text)``.
    """
    st.header("📁 Analisis Sentimen dari CSV")

    st.write("""
    Upload file CSV untuk melakukan prediksi sentimen secara batch.
    File CSV harus memiliki kolom **'text'** atau **'full_text'** yang berisi teks untuk dianalisis.
    """)

    # CSV upload widget.
    uploaded_file = st.file_uploader(
        "Upload file CSV",
        type=['csv'],
        help="File CSV harus memiliki kolom 'text' atau 'full_text'"
    )

    if uploaded_file is not None:
        try:
            df_csv = pd.read_csv(uploaded_file)

            # Reject a CSV without data rows.
            if len(df_csv) == 0:
                st.warning("⚠️ File CSV tidak memiliki data. Silakan upload file dengan data yang valid.")
                return

            st.success(f"✅ File berhasil diupload! Total baris: {len(df_csv):,}")

            # Date column detection and temporal overview.
            date_column = detect_date_column(df_csv)
            if date_column:
                min_date, max_date = get_date_range(df_csv, date_column)
                temporal_stats = get_temporal_statistics(df_csv, date_column)

                if min_date and max_date and temporal_stats:
                    # Time range information.
                    st.markdown("---")
                    st.subheader("📅 Informasi Temporal Data")

                    col1, col2 = st.columns([2, 1])
                    with col1:
                        st.info(f"**Rentang Waktu:** {min_date.strftime('%d %B %Y, %H:%M:%S')} sampai {max_date.strftime('%d %B %Y, %H:%M:%S')}")
                    with col2:
                        st.metric("📊 Total Data Bertanggal", f"{temporal_stats['total_dates']:,}")

                    st.caption(f"Kolom tanggal yang terdeteksi: **{date_column}**")

                    # Temporal charts, shown only when both are available.
                    fig_yearly, fig_monthly = create_temporal_charts(temporal_stats)

                    if fig_yearly and fig_monthly:
                        col1, col2 = st.columns(2)

                        with col1:
                            st.plotly_chart(fig_yearly, use_container_width=True)

                        with col2:
                            st.plotly_chart(fig_monthly, use_container_width=True)

                    st.markdown("---")
                else:
                    st.warning(f"⚠️ Kolom tanggal terdeteksi ({date_column}) tetapi tidak dapat mem-parsing tanggal")
            else:
                st.info("ℹ️ Tidak ada kolom tanggal yang terdeteksi dalam CSV")

            # Text column selection: 'text' wins over 'full_text'.
            text_column = None
            if 'text' in df_csv.columns:
                text_column = 'text'
            elif 'full_text' in df_csv.columns:
                text_column = 'full_text'
            else:
                st.error("❌ File CSV harus memiliki kolom 'text' atau 'full_text'")
                st.info(f"Kolom yang tersedia: {', '.join(df_csv.columns)}")
                return

            st.info(f"📝 Menggunakan kolom teks: **{text_column}**")

            # Data preview.
            with st.expander("👀 Preview Data CSV"):
                st.dataframe(df_csv.head(10), use_container_width=True)

            # Confidence threshold for the next prediction run.
            confidence_threshold = st.slider(
                "Confidence Threshold ",
                min_value=0.0,
                max_value=1.0,
                value=0.5,
                step=0.05,
                help="Prediksi dengan confidence di bawah threshold ini tidak dimasukkan dalam analisis"
            )

            # Run the batch prediction.
            if st.button("🚀 Proses Prediksi Sentimen", type="primary"):
                with st.spinner("Memproses prediksi sentimen..."):
                    df_confident, df_neutral = process_csv_predictions(
                        analyzer, df_csv, text_column, confidence_threshold
                    )

                    # Persist the results together with the threshold that
                    # produced them, so the result section does not depend
                    # on the slider variable.
                    st.session_state['csv_results'] = df_confident
                    st.session_state['csv_neutral'] = df_neutral
                    st.session_state['csv_threshold'] = confidence_threshold
                    st.session_state['csv_processed'] = True

                st.success("✅ Prediksi selesai!")
                st.rerun()

        except Exception as e:
            st.error(f"❌ Error membaca file CSV: {str(e)}")
            return

    # Result section: shown whenever processed results are in the session.
    if st.session_state.get('csv_processed', False):
        df_confident = st.session_state.get('csv_results')
        df_neutral = st.session_state.get('csv_neutral')
        # Threshold of the stored run; 0.5 (the slider default) is only a
        # fallback for results stored without a threshold.
        used_threshold = st.session_state.get('csv_threshold', 0.5)

        if df_confident is not None and df_neutral is not None:
            st.markdown("---")
            st.header("📊 Hasil Analisis")

            # Headline metrics.
            col1, col2, col3, col4 = st.columns(4)

            total_data = len(df_confident) + len(df_neutral)
            positive_count = len(df_confident[df_confident['sentiment'] == 'Positif'])
            negative_count = len(df_confident[df_confident['sentiment'] == 'Negatif'])
            neutral_count = len(df_neutral)

            with col1:
                st.metric("📄 Total Data", f"{total_data:,}")
            with col2:
                st.metric("😊 Sentimen Positif", f"{positive_count:,}")
            with col3:
                st.metric("😞 Sentimen Negatif", f"{negative_count:,}")
            with col4:
                st.metric("😐 Confidence Level < Threshold", f"{neutral_count:,}")

            # Visualisations (only when there are confident predictions).
            if not df_confident.empty:
                # Preprocess each text list once; the word clouds and the
                # top-word charts below share the result.
                positive_texts = df_confident[df_confident['sentiment'] == 'Positif']['text'].tolist()
                negative_texts = df_confident[df_confident['sentiment'] == 'Negatif']['text'].tolist()
                processed_positive = [analyzer.preprocess_text(text) for text in positive_texts]
                processed_negative = [analyzer.preprocess_text(text) for text in negative_texts]

                st.markdown("---")
                st.subheader("📈 Visualisasi Sentimen")

                col1, col2 = st.columns(2)

                with col1:
                    # Sentiment distribution.
                    fig_pie = create_csv_pie_chart(df_confident)
                    if fig_pie:
                        st.plotly_chart(fig_pie, use_container_width=True)

                with col2:
                    # Histogram of the confidence scores.
                    fig_conf = go.Figure([go.Histogram(
                        x=df_confident['confidence'],
                        nbinsx=20,
                        marker_color='lightblue'
                    )])
                    fig_conf.update_layout(
                        title="Distribusi Confidence Score",
                        xaxis_title="Confidence",
                        yaxis_title="Frekuensi",
                        height=400
                    )
                    st.plotly_chart(fig_conf, use_container_width=True)

                # Word clouds per sentiment.
                st.markdown("---")
                st.subheader("☁️ Word Clouds")

                col1, col2 = st.columns(2)

                with col1:
                    st.markdown("**😊 Sentimen Positif**")
                    if positive_texts:
                        fig_wc_pos = create_wordcloud(processed_positive, "Word Cloud Sentimen Positif", 'Greens')
                        if fig_wc_pos:
                            st.pyplot(fig_wc_pos, clear_figure=True)
                    else:
                        st.info("Tidak ada data sentimen positif")

                with col2:
                    st.markdown("**😞 Sentimen Negatif**")
                    if negative_texts:
                        fig_wc_neg = create_wordcloud(processed_negative, "Word Cloud Sentimen Negatif", 'Reds')
                        if fig_wc_neg:
                            st.pyplot(fig_wc_neg, clear_figure=True)
                    else:
                        st.info("Tidak ada data sentimen negatif")

                # Most frequent words per sentiment.
                st.markdown("---")
                st.subheader("📈 Kata-kata Teratas per Sentimen")

                col1, col2 = st.columns(2)

                with col1:
                    st.markdown("**😊 Top 15 Kata Sentimen Positif**")
                    if positive_texts:
                        top_words_pos = get_top_words(processed_positive, 15)
                        if top_words_pos:
                            words_pos, counts_pos = zip(*top_words_pos)

                            fig_pos = go.Figure([go.Bar(
                                x=list(counts_pos),
                                y=list(words_pos),
                                orientation='h',
                                marker_color='lightgreen'
                            )])
                            fig_pos.update_layout(
                                xaxis_title="Frekuensi",
                                yaxis_title="Kata",
                                height=400
                            )
                            st.plotly_chart(fig_pos, use_container_width=True)

                with col2:
                    st.markdown("**😞 Top 15 Kata Sentimen Negatif**")
                    if negative_texts:
                        top_words_neg = get_top_words(processed_negative, 15)
                        if top_words_neg:
                            words_neg, counts_neg = zip(*top_words_neg)

                            fig_neg = go.Figure([go.Bar(
                                x=list(counts_neg),
                                y=list(words_neg),
                                orientation='h',
                                marker_color='lightcoral'
                            )])
                            fig_neg.update_layout(
                                xaxis_title="Frekuensi",
                                yaxis_title="Kata",
                                height=400
                            )
                            st.plotly_chart(fig_neg, use_container_width=True)

            # Result tables.
            st.markdown("---")
            st.subheader("📋 Tabel Hasil Prediksi")

            if not df_confident.empty:
                st.subheader("📊 Hasil Prediksi (Confident)")
                st.write(f"Total data dengan confidence ≥ {used_threshold}: **{len(df_confident):,}**")

                # Display copy with user-facing column names.
                df_display = df_confident[['text', 'sentiment', 'confidence']].copy()
                df_display.columns = ['Teks', 'Sentimen', 'Confidence']

                st.dataframe(
                    df_display,
                    use_container_width=True,
                    height=400
                )

                # Download of the confident predictions.
                csv_download = df_confident.to_csv(index=False).encode('utf-8')
                st.download_button(
                    label="📥 Download Hasil Prediksi (CSV)",
                    data=csv_download,
                    file_name="hasil_prediksi_sentimen.csv",
                    mime="text/csv"
                )

                # ===== Summary / conclusion section =====
                st.markdown("---")
                st.subheader("📝 Kesimpulan Analisis Sentimen")

                # Percentages are relative to ALL predicted rows, including
                # the ones below the threshold.
                total_analyzed = total_data
                pct_positive = (positive_count / total_analyzed * 100) if total_analyzed > 0 else 0
                pct_negative = (negative_count / total_analyzed * 100) if total_analyzed > 0 else 0
                pct_neutral = (neutral_count / total_analyzed * 100) if total_analyzed > 0 else 0

                avg_confidence = df_confident['confidence'].mean() if not df_confident.empty else 0

                # Dominant sentiment among the confident predictions.
                if positive_count > negative_count:
                    dominant_sentiment = "Positif"
                    dominant_icon = "😊"
                    dominant_color = "green"
                elif negative_count > positive_count:
                    dominant_sentiment = "Negatif"
                    dominant_icon = "😞"
                    dominant_color = "red"
                else:
                    dominant_sentiment = "Seimbang"
                    dominant_icon = "😐"
                    dominant_color = "gray"

                interpretasi_list = []

                # Interpretation of the sentiment distribution.
                if pct_positive > 70:
                    interpretasi_list.append("Data menunjukkan **sentimen sangat positif** dengan lebih dari 70% respons positif.")
                    interpretasi_list.append("Ini mengindikasikan tingkat kepuasan atau penerimaan yang sangat baik.")
                elif pct_positive > 50:
                    interpretasi_list.append("Data menunjukkan **sentimen cenderung positif** dengan mayoritas respons positif.")
                    interpretasi_list.append("Secara umum, terdapat penerimaan yang baik meskipun masih ada ruang untuk perbaikan.")
                elif pct_negative > 70:
                    interpretasi_list.append("Data menunjukkan **sentimen sangat negatif** dengan lebih dari 70% respons negatif.")
                    interpretasi_list.append("Ini mengindikasikan adanya masalah serius yang perlu segera ditangani.")
                elif pct_negative > 50:
                    interpretasi_list.append("Data menunjukkan **sentimen cenderung negatif** dengan mayoritas respons negatif.")
                    interpretasi_list.append("Diperlukan perhatian khusus untuk meningkatkan kualitas atau layanan.")
                else:
                    interpretasi_list.append("Data menunjukkan **sentimen yang seimbang** antara positif dan negatif.")
                    interpretasi_list.append("Terdapat opini yang beragam, menunjukkan pengalaman yang bervariasi.")

                # Interpretation of the average confidence.
                if avg_confidence > 0.8:
                    interpretasi_list.append(f"Confidence score yang tinggi ({avg_confidence:.1%}) menunjukkan model sangat yakin dengan prediksinya.")
                    interpretasi_list.append("Hasil analisis dapat diandalkan untuk pengambilan keputusan.")
                elif avg_confidence > 0.6:
                    interpretasi_list.append(f"Confidence score yang cukup baik ({avg_confidence:.1%}) menunjukkan prediksi yang dapat diandalkan.")
                    interpretasi_list.append("Sebagian besar hasil analisis dapat dipercaya.")
                else:
                    interpretasi_list.append(f"Confidence score yang moderat ({avg_confidence:.1%}) menunjukkan beberapa prediksi mungkin ambigu.")
                    interpretasi_list.append("Disarankan untuk melakukan validasi manual pada data dengan confidence rendah.")

                # Interpretation of the below-threshold share.
                if pct_neutral > 30:
                    interpretasi_list.append(f"Terdapat **{pct_neutral:.1f}%** data dengan confidence level < threshold.")
                    interpretasi_list.append("Data dengan confidence level < threshold yang tinggi mungkin mengindikasikan teks yang ambigu atau memerlukan konteks tambahan.")

                # Recommendations.
                rekomendasi_list = []

                if pct_negative > 40:
                    rekomendasi_list.append("Identifikasi tema atau topik utama dari sentimen negatif untuk perbaikan.")
                    rekomendasi_list.append("Lakukan analisis lebih lanjut pada kata-kata yang sering muncul di sentimen negatif.")

                if pct_neutral > 20:
                    rekomendasi_list.append("Review data dengan confidence level < threshold secara manual untuk memahami konteks yang lebih baik.")
                    rekomendasi_list.append("Pertimbangkan untuk meningkatkan threshold confidence jika diperlukan hasil yang lebih pasti.")

                rekomendasi_list.append("Gunakan word cloud dan top words untuk memahami topik yang paling sering dibahas.")
                rekomendasi_list.append("Pantau tren sentimen secara berkala untuk melihat perubahan dari waktu ke waktu.")

                # Summary card followed by the bullet lists.
                with st.container():
                    st.markdown(f"""
                    <div style="background-color: #f0f2f6; padding: 1.5rem; border-radius: 0.5rem; border-left: 5px solid {dominant_color};">
                        <h4 style="margin-top: 0; color: {dominant_color};">{dominant_icon} Sentimen Dominan: {dominant_sentiment}</h4>
                    </div>
                    """, unsafe_allow_html=True)

                    st.markdown("**Ringkasan Hasil Analisis:**")
                    st.markdown(f"- Dari **{total_analyzed:,}** data yang dianalisis, terdapat:")
                    st.markdown(f"  - **{positive_count:,}** sentimen positif ({pct_positive:.1f}%)")
                    st.markdown(f"  - **{negative_count:,}** sentimen negatif ({pct_negative:.1f}%)")
                    st.markdown(f"  - **{neutral_count:,}** data dengan confidence level < threshold ({pct_neutral:.1f}%)")
                    st.markdown(f"- Rata-rata confidence score: **{avg_confidence:.1%}**")

                    st.markdown("**Interpretasi:**")
                    for item in interpretasi_list:
                        st.markdown(f"- {item}")

                    st.markdown("**Rekomendasi:**")
                    for item in rekomendasi_list:
                        st.markdown(f"- {item}")

            else:
                st.warning("⚠️ Tidak ada data dengan confidence di atas threshold")

            # Table of the below-threshold predictions.
            if not df_neutral.empty:
                st.markdown("---")
                st.subheader(f"😐 Data (Confidence < threshold) - Total: {len(df_neutral):,}")

                with st.expander(f"📋 Data dengan Confidence Level < Threshold ({used_threshold})"):
                    st.write(f"Total data dengan confidence level < threshold: **{len(df_neutral):,}**")
                    st.caption("Data ini memiliki confidence di bawah threshold dan dianggap tidak pasti")

                    # Display copy with user-facing column names.
                    df_neutral_display = df_neutral[['text', 'sentiment', 'confidence']].copy()
                    df_neutral_display.columns = ['Teks', 'Sentimen', 'Confidence']

                    st.dataframe(
                        df_neutral_display,
                        use_container_width=True,
                        height=300
                    )

                    # Download of the below-threshold predictions.
                    csv_neutral_download = df_neutral.to_csv(index=False).encode('utf-8')
                    st.download_button(
                        label="📥 Download Data dengan Confidence Score dibawah Threshold (CSV)",
                        data=csv_neutral_download,
                        file_name="data_confidence_level_dibawah_threshold.csv",
                        mime="text/csv"
                    )

# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()