TIFNJK_E41221742_Renaldi-En.../views/visualisasi.py

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud
import os
import math

def render_visualisasi():
    st.title("📈 Dashboard Visualisasi Data")
    st.markdown("Analisis visual interaktif terhadap data opini publik terkait kebijakan anggaran pendidikan.")

    # ==============================================================================
    # 1. LOAD DATA UTAMA
    # ==============================================================================
    file_path = 'data/Data_Lengkap_Tokenisasi.csv'

    if not os.path.exists(file_path):
        st.error(f"❌ File dataset tidak ditemukan di: {file_path}")
        return

    # Load Data
    df = pd.read_csv(file_path)

    if 'Label' in df.columns:
        df['Label_Clean'] = df['Label'].astype(str).str.lower().str.strip()
    else:
        st.error("❌ Kolom 'Label' tidak ditemukan dalam CSV.")
        return

    if 'created_at' in df.columns:
        df['Tanggal'] = pd.to_datetime(df['created_at']).dt.date
    elif 'Tanggal' in df.columns:
        df['Tanggal'] = pd.to_datetime(df['Tanggal']).dt.date
    else:
        st.warning("⚠️ Kolom tanggal tidak ditemukan. Grafik tren waktu mungkin tidak muncul.")

    # ==============================================================================
    # 2. VISUALISASI DISTRIBUSI SENTIMEN (PIE & BAR)
    # ==============================================================================
    st.subheader("📊 Distribusi & Polaritas Sentimen")

    col_pie, col_bar = st.columns([1, 1.5])

    # --- A. PIE CHART ---
    with col_pie:
        df_pie = df['Label_Clean'].value_counts().reset_index()
        df_pie.columns = ['Sentimen', 'Jumlah']

        fig_pie = px.pie(
            df_pie,
            names='Sentimen',
            values='Jumlah',
            hole=0.4,
            color='Sentimen',
            color_discrete_map={'negatif':'#FF4B4B', 'netral':'#808495', 'positif':'#00CC96'},
            title="Persentase Sentimen"
        )
        fig_pie.update_layout(showlegend=True, legend=dict(orientation="h", yanchor="bottom", y=-0.3, xanchor="center", x=0.5))
        st.plotly_chart(fig_pie, use_container_width=True)

    # --- B. TREN WAKTU ---
    with col_bar:
        if 'Tanggal' in df.columns:
            start_date = pd.to_datetime("2025-02-01").date()
            end_date = pd.to_datetime("2025-03-31").date()

            df_filtered = df[
                (df['Tanggal'] >= start_date) &
                (df['Tanggal'] <= end_date)
            ]

            kolom_label = 'Label' if 'Label' in df_filtered.columns else 'Label_Clean'

            df_trend = df_filtered.groupby(['Tanggal', kolom_label]).size().reset_index(name='Jumlah')

            fig_trend = px.line(
                df_trend,
                x='Tanggal',
                y='Jumlah',
                color=kolom_label,
                markers=True,
                color_discrete_map={
                    'negatif':'#FF4B4B', 'netral':'#808495', 'positif':'#00CC96',
                    'Negatif':'#FF4B4B', 'Netral':'#808495', 'Positif':'#00CC96',
                    'negative':'#FF4B4B', 'neutral':'#808495', 'positive':'#00CC96'
                },
                title="Tren Sentimen Harian (Feb - Mar 2025)"
            )

            fig_trend.update_xaxes(range=[start_date, end_date])
            fig_trend.update_layout(xaxis_title="Tanggal", yaxis_title="Jumlah Tweet", hovermode="x unified", legend=dict(orientation="h", y=1.1))

            st.plotly_chart(fig_trend, use_container_width=True)
        else:
            st.info("Data Tanggal tidak tersedia untuk menampilkan tren.")

    # ==============================================================================
    # 3. WORDCLOUD
    # ==============================================================================
    st.subheader("☁️ WordCloud: Representasi Visual Teks")
    st.write("Kata-kata yang paling sering muncul dalam setiap kategori.")

    # 1. Fungsi Asli untuk generate dari Teks (Data Mentah & Bersih)
    def generate_wc(text, colormap):
        if not isinstance(text, str) or not text.strip():
            st.warning("⚠️ Tidak ada data teks yang cukup.")
            return

        with st.spinner("Sedang menggambar WordCloud..."):
            try:
                wc = WordCloud(width=800, height=400, background_color='white', colormap=colormap, max_words=100).generate(text)
                fig, ax = plt.subplots(figsize=(10, 5))
                ax.imshow(wc, interpolation='bilinear')
                ax.axis("off")
                st.pyplot(fig)
            except Exception as e:
                st.error(f"Error WordCloud: {e}")

    # 2. FUNGSI BARU: Generate WordCloud langsung dari CSV Frekuensi agar instan
    def generate_wc_from_freq(file_path, colormap):
        if os.path.exists(file_path):
            try:
                df_freq = pd.read_csv(file_path)
                # Mengubah format DataFrame menjadi Dictionary (Syarat mutlak WordCloud)
                freq_dict = dict(zip(df_freq['Word'], df_freq['Frequency']))

                with st.spinner("Merender WordCloud instan dari CSV..."):
                    wc = WordCloud(width=800, height=400, background_color='white', colormap=colormap, max_words=100)
                    wc.generate_from_frequencies(freq_dict)

                    fig, ax = plt.subplots(figsize=(10, 5))
                    ax.imshow(wc, interpolation='bilinear')
                    ax.axis("off")
                    st.pyplot(fig)
            except Exception as e:
                st.error(f"Error memproses file CSV WordFreq: {e}")
        else:
            st.warning(f"⚠️ File frekuensi belum tersedia: {file_path}")

    # Tabs Navigasi WordCloud
    tab_mentah, tab_bersih, tab_neg, tab_net, tab_pos = st.tabs([
        "Data Mentah", "Data Bersih", "Negatif", "Netral", "Positif"
    ])

    with tab_mentah:
        st.caption("Data dari kolom 'Teks Tweet' (Original)")
        generate_wc(" ".join(df['Teks Tweet'].dropna().astype(str)), 'cividis')

    with tab_bersih:
        st.caption("Data dari kolom 'Tweet_Final' (Preprocessed)")
        if 'Tweet_Final' in df.columns:
            generate_wc(" ".join(df['Tweet_Final'].dropna().astype(str)), 'viridis')
        else:
            st.warning("Kolom Tweet_Final tidak ada.")

    # MENGGUNAKAN FILE CSV WORDFREQ DI SINI
    with tab_neg:
        st.caption("Kata dominan sentimen NEGATIF (Sumber: WordFreq_Negatif.csv)")
        generate_wc_from_freq('model/WordFreq_Negatif.csv', 'Reds')

    with tab_net:
        st.caption("Kata dominan sentimen NETRAL (Sumber: WordFreq_Netral.csv)")
        generate_wc_from_freq('model/WordFreq_Netral.csv', 'Greys')

    with tab_pos:
        st.caption("Kata dominan sentimen POSITIF (Sumber: WordFreq_Positif.csv)")
        generate_wc_from_freq('model/WordFreq_Positif.csv', 'Greens')

    st.markdown("---")

    # ==============================================================================
    # 4. TOPIC MODELING
    # ==============================================================================
    st.subheader("📌 4. Topic Modeling (LDA) & Kata Kunci")
    st.write("Ekstraksi topik dominan dari hasil algoritma Latent Dirichlet Allocation (LDA).")

    path_lda = 'model/Hasil_Analisis_Topik_LDA.csv'
    if not os.path.exists(path_lda): path_lda = 'Hasil_Analisis_Topik_LDA.csv'

    if os.path.exists(path_lda):
        try:
            df_lda = pd.read_csv(path_lda)

            def parse_lda_string(text_data):
                data_items = []
                for word in str(text_data).split(','):
                    word = word.strip()
                    if word:
                        data_items.append({'Kata': word})

                df_res = pd.DataFrame(data_items)
                if not df_res.empty:
                    df_res['Bobot'] = range(len(df_res), 0, -1)
                    df_res = df_res.sort_values(by='Bobot', ascending=True)
                return df_res

            t_neg, t_net, t_pos = st.tabs(["🔴 Topik Negatif", "⚪ Topik Netral", "🟢 Topik Positif"])
            mapping = {'negatif': t_neg, 'netral': t_net, 'positif': t_pos}

            for sentimen, tab in mapping.items():
                with tab:
                    df_subset = df_lda[df_lda['Sentimen'].str.lower() == sentimen]

                    if df_subset.empty:
                        st.warning(f"Belum ada data topik untuk {sentimen}.")
                    else:
                        for idx, row in df_subset.iterrows():
                            topik_ke = row['Topik Ke']
                            df_chart = parse_lda_string(row['Kata Kunci'])

                            if not df_chart.empty:
                                fig = px.bar(
                                    df_chart, x='Bobot', y='Kata', orientation='h',
                                    title=f"<b>Topik {topik_ke}</b>",
                                    color='Bobot',
                                    color_continuous_scale='Reds' if sentimen == 'negatif' else 'Greys' if sentimen == 'netral' else 'Greens'
                                )
                                fig.update_layout(height=300, showlegend=False, xaxis_title=None, xaxis_visible=False)
                                st.plotly_chart(fig, use_container_width=True)
                                st.divider()
        except Exception as e:
            st.error(f"Gagal memproses data LDA: {e}")
    else:
        st.warning("⚠️ File 'Hasil_Analisis_Topik_LDA.csv' belum tersedia di folder model.")

    # ==============================================================================
    # 5. DATA EXPLORER & EVALUASI MODEL
    # ==============================================================================
    st.subheader("🔍 Data Explorer & Evaluasi Model")

    tab_data, tab_eval = st.tabs(["Data Explorer", "Tabel Performa (Evaluasi)"])

    # --- TAB 1: DATA EXPLORER ---
    with tab_data:
        col_f1, col_f2 = st.columns([1, 2])
        with col_f1: filter_label = st.selectbox("Filter Sentimen:", ['Semua', 'negatif', 'netral', 'positif'])
        with col_f2: search_keyword = st.text_input("Cari Tweet:", "")

        cols_available = [c for c in ['created_at', 'username', 'Teks Tweet', 'Label_Clean'] if c in df.columns]
        df_show = df[cols_available].copy()

        rename_map = {'created_at': 'Tanggal', 'username': 'Username', 'Label_Clean': 'Label'}
        df_show = df_show.rename(columns=rename_map)

        if filter_label != 'Semua' and 'Label' in df_show.columns:
            df_show = df_show[df_show['Label'] == filter_label]

        if search_keyword and 'Teks Tweet' in df_show.columns:
            df_show = df_show[df_show['Teks Tweet'].str.contains(search_keyword, case=False, na=False)]

        df_show.index = range(1, len(df_show) + 1)

        baris_per_halaman = 20
        total_data = len(df_show)
        total_halaman = math.ceil(total_data / baris_per_halaman)

        if total_data > 0:
            c_nav, c_stat = st.columns([1, 3])
            with c_nav:
                halaman = st.number_input("Halaman", min_value=1, max_value=max(1, total_halaman), step=1)
            with c_stat:
                st.write("")
                st.caption(f"Menampilkan **{total_data}** Data (Halaman {halaman} dari {total_halaman})")

            start_idx = (halaman - 1) * baris_per_halaman
            end_idx = start_idx + baris_per_halaman
            df_page = df_show.iloc[start_idx:end_idx]

            st.dataframe(df_page, use_container_width=True)
        else:
            st.warning("Data tidak ditemukan.")

    # --- TAB 2: TABEL EVALUASI & CONFUSION MATRIX ---
    with tab_eval:
        st.subheader("1. Tabel Performa (Classification Report)")
        st.markdown("""
        Metrik evaluasi model berdasarkan data testing (20%):
        * **Precision**: Ketepatan tebakan.
        * **Recall**: Kemampuan menemukan data yang relevan.
        * **F1-Score**: Rata-rata harmonis (Paling penting untuk data tidak seimbang).
        """)

        path_perf = 'model/Tabel_Performa_LSTM.csv'
        if not os.path.exists(path_perf): path_perf = 'Tabel_Performa_LSTM.csv'

        if os.path.exists(path_perf):
            try:
                df_perf = pd.read_csv(path_perf, index_col=0)
                st.table(
                    df_perf.style.highlight_max(axis=0, props='background-color: #FFEB3B; color: black; font-weight: bold')
                )
            except Exception as e:
                st.error(f"Gagal memuat tabel evaluasi: {e}")
        else:
            st.warning("⚠️ File 'Tabel_Performa_LSTM.csv' belum tersedia.")

        st.divider()

        st.subheader("2. Confusion Matrix")
        st.markdown("Visualisasi ini menunjukkan **detail kesalahan prediksi**. Sumbu Y adalah Label Asli, Sumbu X adalah Prediksi Model.")

        path_cm = 'model/Data_Confusion_Matrix.csv'
        if not os.path.exists(path_cm): path_cm = 'Data_Confusion_Matrix.csv'

        if os.path.exists(path_cm):
            try:
                df_cm_data = pd.read_csv(path_cm)

                if 'y_true' in df_cm_data.columns and 'y_pred' in df_cm_data.columns:
                    from sklearn.metrics import confusion_matrix

                    labels = ['Negatif', 'Netral', 'Positif']
                    cm = confusion_matrix(df_cm_data['y_true'], df_cm_data['y_pred'])

                    fig_cm = px.imshow(
                        cm,
                        text_auto=True,
                        labels=dict(x="Prediksi Model", y="Label Aktual (Asli)", color="Jumlah Data"),
                        x=labels,
                        y=labels,
                        color_continuous_scale='Blues',
                        aspect="auto"
                    )
                    fig_cm.update_layout(title="Confusion Matrix Heatmap")
                    st.plotly_chart(fig_cm, use_container_width=True)

                    total_benar = np.trace(cm)
                    total_data = np.sum(cm)
                    akurasi_cm = (total_benar / total_data) * 100
                    st.caption(f"💡 **Interpretasi:** Dari total **{total_data}** data testing, model berhasil menebak benar sebanyak **{total_benar}** data ({akurasi_cm:.2f}%).")

                else:
                    st.error("Format CSV Confusion Matrix salah. Harus ada kolom 'y_true' dan 'y_pred'.")
            except Exception as e:
                st.error(f"Gagal memproses Confusion Matrix: {e}")
        else:
            st.info("ℹ️ **Data Confusion Matrix belum tersedia.** Silakan jalankan kode penyimpanan `Data_Confusion_Matrix.csv` di Google Colab (Bagian Evaluasi).")