cleaning text commit

This commit is contained in:
najwa 2025-05-07 13:56:10 +07:00
commit 787a272fe3
1 changed file with 732 additions and 0 deletions

cleaning_text_collab.py Normal file

@@ -0,0 +1,732 @@
# -*- coding: utf-8 -*-
"""cleaning_text_collab.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1uScAXjTCOp9UdiTPMwH-y2Z5gbn7rWYV
"""
!pip install emoji
import pandas as pd
# Read the Excel file
df = pd.read_excel("clean_text_jaki.xlsx")
# Check which column names are available
print(df.columns)
import pandas as pd
import re
import emoji
# Read the Excel file
file_path = "clean_text_jaki.xlsx"  # Replace with your file path
df = pd.read_excel(file_path)

# Function to clean the text
def clean_text(text):
    if isinstance(text, str):  # Make sure the input is a string
        text = text.lower()  # Convert to lowercase
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        text = emoji.replace_emoji(text, replace='')  # Remove emoji and stickers
    return text

# Apply the function to the "clean_text" column
df["clean_text"] = df["clean_text"].apply(clean_text)
# Save back to Excel
df.to_excel("cleaned_data_btg1.xlsx", index=False)
print("Cleaning finished, file saved as cleaned_data_btg1.xlsx")
"""Grafik code GUI Streaamlit. Lanjutan code"""
from google.colab import files
uploaded = files.upload()  # Select your Excel file
"""Load the Excel data into a DataFrame"""
import pandas as pd
# Replace the path below with your own file name
file_path = "/content/data_sentimen_sorted_new.xlsx"
# Read the Excel file
df = pd.read_excel(file_path)
# Show the first 5 rows
df.head()
"""memastikan kolom clean_text telah terisi"""
df = df.dropna(subset=['clean_text']) # Hapus baris kosong
texts = df['clean_text'].astype(str).tolist() # Konversi ke list string
"""Load data ke dataframe
Menghitung TF-IDF
"""
# Earlier version without the minimum token-length filter, kept for reference:
# vectorizer = TfidfVectorizer()
# tfidf_matrix = vectorizer.fit_transform(texts)
# tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
# tfidf_df.head()
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
# Initialize the TF-IDF vectorizer, keeping only tokens with at least 3 letters
vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b[A-Za-z]{3,}\b')
tfidf_matrix = vectorizer.fit_transform(texts)
# Convert the result to a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
# Show the first 5 rows of the TF-IDF result
tfidf_df.head()
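# Optional check: the 3+ letter token pattern controls the vocabulary, and the
# column count of tfidf_df equals the number of tokens the vectorizer kept.
print("Vocabulary size:", len(vectorizer.get_feature_names_out()))
print("Sample tokens:", vectorizer.get_feature_names_out()[:10])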
"""Menampilkan Histogram"""
from matplotlib import pyplot as plt
# Flatten semua nilai TF-IDF menjadi satu array
all_tfidf_values = tfidf_df.values.flatten()
# Plot histogram
plt.hist(all_tfidf_values, bins=50, log=True)
plt.title('Histogram Semua Nilai TF-IDF')
plt.xlabel('TF-IDF Value')
plt.ylabel('Frequency')
plt.show()
# tfidf_df['23'].plot(kind='hist', bins=50, log=True, title='23')
# plt.gca().spines[['top', 'right',]].set_visible(False)
print(tfidf_df.columns.tolist())
print(tfidf_df.describe())
print(tfidf_df.head())
"""Menyimpan hasil output TF-IDF excel"""
output_path = "/content/hasil_tfidf6.xlsx"
tfidf_df.to_excel(output_path, index=False)
print("Hasil TF-IDF berhasil disimpan di:", output_path)
"""memvisualisasikan 10 data tertinggi TF-IDF menjadi diagram"""
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
# Contoh vectorizer
vectorizer = TfidfVectorizer()
# Hitung rata-rata skor TF-IDF tiap kata
tfidf_mean = tfidf_df.mean().sort_values(ascending=False)
# Pilih 10 kata dengan nilai TF-IDF tertinggi
top_words = tfidf_mean.head(10)
# Membuat bar chart
plt.figure(figsize=(10,5))
sns.barplot(x=top_words.values, y=top_words.index, palette="viridis")
plt.xlabel("Skor TF-IDF")
plt.ylabel("Kata")
plt.title("10 Kata dengan Nilai TF-IDF Tertinggi")
plt.show()
import pandas as pd
# Replace the file name below with the path to your dataset
data = pd.read_excel("data_sentimen_sorted_new.xlsx")
# Check that the dataset loaded
print(data.head())
from sklearn.model_selection import train_test_split
X = data['clean_text']  # Replace 'clean_text' with your feature column
y = data['sentiment_label']  # Replace 'sentiment_label' with your target column
print(X.head(), y.head())  # Check that the variables are correct
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape)  # Make sure the data was split correctly
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
# Assuming 'X_train' and 'y_train' from previous cell are a pandas Series or DataFrame
# 1. Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer()
# 2. Fit the vectorizer to the entire training data (X_train) and transform it
X_train_vec = vectorizer.fit_transform(X_train.astype(str)) # Convert to string explicitly
# 3. Now apply SMOTE to the vectorized training data and the corresponding labels
smote = SMOTE(sampling_strategy="auto", random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_vec, y_train)
# 4. Print the shapes to verify
print(X_train_balanced.shape, y_train_balanced.shape)  # Check that SMOTE worked
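# Optional check: SMOTE should equalize the class counts in the training set.
from collections import Counter
print("Before SMOTE:", Counter(y_train))
print("After SMOTE: ", Counter(y_train_balanced))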
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
# 1. Load the dataset
data = pd.read_excel("data_sentimen_sorted_new.xlsx")  # Replace with the correct dataset name
# 2. Make sure the text and label columns exist
print(data.head())
# 3. Convert the text to numeric vectors with TF-IDF
vectorizer = TfidfVectorizer()
data["clean_text"] = data["clean_text"].fillna('')  # Replace NaN with empty strings
x_tfidf = vectorizer.fit_transform(data["clean_text"])
# 4. Encode the labels as numbers if they are still strings
le = LabelEncoder()
y_encoded = le.fit_transform(data["sentiment_label"])  # Replace with the correct label column
# 5. Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(x_tfidf, y_encoded, test_size=0.2, random_state=42)
# 6. Apply SMOTE to balance the training data
smote = SMOTE(sampling_strategy="auto", random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
# 7. Train the model on the balanced data, then save the model and the vectorizer
model = MultinomialNB()
model.fit(X_train_balanced, y_train_balanced)  # the model has to be fitted before it is saved
joblib.dump(model, "model_sentimen.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
print("Model and vectorizer saved!")
"""Confussion Matrix"""
# Import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2
from imblearn.over_sampling import SMOTE
# Make sure the feature matrix and the labels have the same number of rows
assert len(tfidf_df) == len(df['sentiment_label']), "Row counts of the TF-IDF matrix and the labels do not match!"
# Encode the labels as numbers if they are still text
le = LabelEncoder()
y = le.fit_transform(df['sentiment_label'])
X = tfidf_df  # Features (TF-IDF matrix)
# Split into train and test sets with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=16, stratify=y)
# **Apply SMOTE to the training set**
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# **Chi-square feature selection**
k_features = 1000  # Keep the 1000 best features
chi2_selector = SelectKBest(chi2, k=k_features)
# Fit on the training data only
X_train_chi2 = chi2_selector.fit_transform(X_train_resampled, y_train_resampled)
X_test_chi2 = chi2_selector.transform(X_test)  # Transform the test data with the selected features
# Initialize the Naïve Bayes model
model = MultinomialNB(alpha=0.5)
# Train the model
model.fit(X_train_chi2, y_train_resampled)
# Predict
y_pred = model.predict(X_test_chi2)
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred, target_names=le.classes_))
# **Visualize the confusion matrix**
cm = confusion_matrix(y_test, y_pred)
cm_labels = le.classes_  # Category labels
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", xticklabels=cm_labels, yticklabels=cm_labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix with Chi-Square Feature Selection")
plt.show()
"""Ambil skor chi square"""
# Ambil skor Chi-Square dan nama fitur
feature_scores = chi2_selector.scores_ # Skor Chi-Square untuk tiap fitur
feature_names = tfidf_df.columns # Nama fitur dari TF-IDF
# Buat DataFrame untuk skor Chi-Square
chi2_df = pd.DataFrame({"Feature": feature_names, "Chi2 Score": feature_scores})
# Urutkan berdasarkan skor tertinggi
top_chi2 = chi2_df.nlargest(10, "Chi2 Score")
# **Visualisasi 10 Kata dengan Skor Chi-Square Tertinggi**
plt.figure(figsize=(10, 5))
sns.barplot(x="Chi2 Score", y="Feature", data=top_chi2, palette="Blues_r")
plt.xlabel("Skor Chi-Square")
plt.ylabel("Kata")
plt.title("10 Kata dengan Skor Chi-Square Tertinggi")
plt.show()
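# A toy illustration (made-up counts) of what the chi-square score measures:
# features whose counts differ strongly across classes get high scores.
import numpy as np
toy_X = np.array([[2, 0], [1, 0], [0, 1], [0, 2]])  # 2 features over 4 documents
toy_y = np.array([0, 0, 1, 1])                      # each feature aligns with one class
toy_scores, toy_pvalues = chi2(toy_X, toy_y)        # chi2 imported from sklearn above
print("Toy chi-square scores:", toy_scores)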
!pip install imbalanced-learn
"""Distribusi SMOTE"""
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Make sure the feature matrix and the labels have the same number of rows
assert len(tfidf_df) == len(df['sentiment_label']), "Row counts of the TF-IDF matrix and the labels do not match!"
# Encode the labels as numbers if they are still text
le = LabelEncoder()
y = le.fit_transform(df['sentiment_label'])
X = tfidf_df  # Features (TF-IDF matrix)
# Split into train and test sets with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=16, stratify=y)
# 📌 **Check the class distribution before SMOTE**
print("Distribution before SMOTE (training set):")
print(pd.Series(y_train).value_counts(normalize=True))
# 📌 **Apply SMOTE to the training set**
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# 📌 **Check the class distribution after SMOTE**
print("\nDistribution after SMOTE (training set):")
print(pd.Series(y_train_resampled).value_counts(normalize=True))
# 📌 **Check the class distribution in the test set**
print("\nClass distribution in the test set:")
print(pd.Series(y_test).value_counts(normalize=True))
# Initialize the Naïve Bayes model with a tunable alpha
model = MultinomialNB(alpha=0.5)  # Try different alpha values to improve performance
# 📌 **Train the model on the SMOTE-resampled data**
model.fit(X_train_resampled, y_train_resampled)
# Predict on the test data
y_pred = model.predict(X_test)
# Evaluate the model
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred, target_names=le.classes_))
# 📊 **Visualize the confusion matrix**
cm = confusion_matrix(y_test, y_pred)
cm_labels = le.classes_  # Category labels
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", xticklabels=cm_labels, yticklabels=cm_labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix (without Chi-Square)")
plt.show()
"""Optional Code iam not sure about this"""
# import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
# # Apply the chi-square feature selection to X_test as well
# X_test_chi2 = chi2_selector.transform(X_test)
# # Model predictions
# y_pred = model.predict(X_test_chi2)
# # Compute the evaluation metrics
# report = classification_report(y_test, y_pred, output_dict=True)
# accuracy = accuracy_score(y_test, y_pred)
# # Visualize the confusion matrix
# plt.figure(figsize=(6, 5))
# sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap="Blues")
# plt.xlabel("Predicted")
# plt.ylabel("Actual")
# plt.title("Confusion Matrix")
# plt.show()
# # Visualize the accuracy and the per-class metrics
# metrics = ['precision', 'recall', 'f1-score']
# categories = list(report.keys())[:-3]  # Take the categories, excluding avg/total
# plt.figure(figsize=(8, 5))
# for metric in metrics:
#     scores = [report[category][metric] for category in categories]
#     plt.plot(categories, scores, marker='o', label=metric)
# plt.axhline(y=accuracy, color='r', linestyle='--', label=f'Accuracy ({accuracy:.2f})')
# plt.title("Model Evaluation")
# plt.xlabel("Category")
# plt.ylabel("Score")
# plt.legend()
# plt.show()
# Import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2
from imblearn.over_sampling import SMOTE
# ... (Your existing code for data loading, preprocessing, and splitting) ...
# **Chi-square feature selection is applied BEFORE SMOTE here**, so the
# synthetic samples are generated in the reduced feature space
k_features = 1000  # Keep the 1000 best features
chi2_selector = SelectKBest(chi2, k=k_features)
X_train_chi2 = chi2_selector.fit_transform(X_train, y_train)  # Fit on the original training data
# Apply SMOTE to the selected features
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_chi2, y_train)
# Transform the test data with the same selector
X_test_chi2 = chi2_selector.transform(X_test)
# ... (Rest of your code for model training, prediction, and evaluation) ...
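# A minimal sketch of those remaining steps under this corrected order,
# mirroring the training and evaluation cells above (illustrative):
model = MultinomialNB(alpha=0.5)
model.fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_test_chi2)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred, target_names=le.classes_))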
"""Install wordcloud"""
!pip install wordcloud
!pip install wordcloud matplotlib pandas
print(data.columns)
print(df[['sentiment_label', 'clean_text']].head()) # Cek data awal
print(df['clean_text'].isna().sum()) # Cek apakah ada nilai NaN
print(df['clean_text'].apply(lambda x: len(str(x).split())).describe()) # Cek jumlah kata
import pandas as pd
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
# 🔹 Load the dataset from Excel (make sure the path is correct)
data = pd.read_excel("data_sentimen_sorted_cleaned.xlsx")
# 🔹 Check that the 'sentiment_label' and 'clean_text' columns exist
print(data.columns)
# 🔹 Function to build a WordCloud for each class
stopwords = set(STOPWORDS)
def generate_wordcloud(data, label, title):
    # Select the texts for the given sentiment label
    text = " ".join(data[data['sentiment_label'] == label]['clean_text'].astype(str))
    # Build the WordCloud
    wordcloud = WordCloud(
        width=1000,
        height=1000,
        background_color="white",
        stopwords=stopwords
    ).generate(text)
    # Display the WordCloud
    plt.figure(figsize=(8, 4))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"{title} - {label}")
    plt.show()
# 🔹 Make sure the required columns exist before iterating
if 'sentiment_label' in data.columns and 'clean_text' in data.columns:
    for label in data['sentiment_label'].unique():
        generate_wordcloud(data, label, "WordCloud without Chi-Square")
else:
    print("Error: column 'sentiment_label' or 'clean_text' not found in the dataset!")
# 🔹 Count the rows per class in 'sentiment_label'
class_counts = data['sentiment_label'].value_counts()
# 🔹 Build a pie chart of the sentiment labels
plt.figure(figsize=(8, 8))  # Larger figure
plt.pie(class_counts, labels=class_counts.index, autopct='%1.1f%%', colors=['lightblue', 'salmon', 'lightgreen'])
plt.title("Sentiment Distribution without Chi-Square")
plt.axis("equal")  # Keep the pie chart perfectly circular
plt.show()
import pandas as pd
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
# 🔹 Reuse the dataset after chi-square
chi_data = df[['sentiment_label', 'clean_text']].copy()  # Keep only the important columns
# Re-encode 'sentiment_label' from the values in 'chi_data' so the lengths match
le = LabelEncoder()
chi_data['sentiment_label_encoded'] = le.fit_transform(chi_data['sentiment_label'])
y_chi = chi_data['sentiment_label_encoded']
chi_data['sentiment_label'] = le.inverse_transform(y_chi)  # Map the codes back to the original labels
# 🔹 Check that the required columns are available
if 'sentiment_label' in chi_data.columns and 'clean_text' in chi_data.columns:
    stopwords = set(STOPWORDS)
    # 🔹 Function to build a WordCloud for each class
    def generate_wordcloud(data, label, title):
        text = " ".join(data[data['sentiment_label'] == label]['clean_text'].astype(str))
        wordcloud = WordCloud(
            width=1000,
            height=1000,
            background_color="white",
            stopwords=stopwords
        ).generate(text)
        plt.figure(figsize=(8, 4))
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        plt.title(f"{title} - {label}")
        plt.show()
    # 🔹 Build a WordCloud for each sentiment label
    for label in chi_data['sentiment_label'].unique():
        generate_wordcloud(chi_data, label, "WordCloud with Chi-Square")
    # 🔹 Count the rows per class after chi-square
    chi_class_counts = chi_data['sentiment_label'].value_counts()
    # 🔹 Build a pie chart of the sentiment labels
    plt.figure(figsize=(8, 8))
    plt.pie(chi_class_counts, labels=chi_class_counts.index, autopct='%1.1f%%', colors=['lightblue', 'salmon', 'lightgreen'])
    plt.title("Sentiment Distribution with Chi-Square")
    plt.axis("equal")  # Keep the pie chart perfectly circular
    plt.show()
else:
    print("Error: column 'sentiment_label' or 'clean_text' not found in the dataset!")
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
X = tfidf_df  # Features (TF-IDF values)
y = df['sentiment_label']
# Initialize the Random Forest model
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train the model
model_rf.fit(X_train, y_train)
# Predict
y_pred_rf = model_rf.predict(X_test)
# Evaluate the model
print("Random Forest accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification report:\n", classification_report(y_test, y_pred_rf))
"""Install Streamlit"""
!pip install streamlit
!npm install -g localtunnel
!pip show streamlit
!pip install --upgrade streamlit
!pip install streamlit pyngrok
!find /content -name "app.py"
!cat /content/app.py
import pandas as pd
import numpy as np
df = pd.read_excel("data_sentimen_sorted_new.xlsx")  # Replace with your file
# Check the row count up front
print(f"DataFrame row count: {len(df)}")
# Example of extra data to insert
new_values = [2356]  # The data to insert (replace with your own variable)
# Check whether the new data has the same number of rows as the DataFrame
if len(new_values) == len(df):
    df["new_column"] = new_values
else:
    print(f"⚠️ Row counts do not match! DataFrame: {len(df)}, new data: {len(new_values)}")
    # If new_values is longer, truncate it to fit
    if len(new_values) > len(df):
        new_values = new_values[:len(df)]
    # If new_values is shorter, pad it with NaN to the required length
    else:
        new_values = list(new_values) + [np.nan] * (len(df) - len(new_values))
    # Insert the data after the adjustment
    df["new_column"] = new_values
# Check again after the adjustment
print(f"✅ Data inserted, final row count: {len(df)}")
!pip install streamlit pyngrok openpyxl
# Commented out IPython magic to ensure Python compatibility.
# %%writefile app.py
# import streamlit as st
# import pandas as pd
# import matplotlib.pyplot as plt
# import numpy as np
# import seaborn as sns
# from wordcloud import WordCloud, STOPWORDS
# from scipy.stats import chi2_contingency
#
# st.title("📊 Sentiment Data Analysis & Visualization App")
#
# uploaded_file = st.file_uploader("Upload a CSV or Excel dataset (max 50MB)", type=["csv", "xlsx"])
#
# MAX_FILE_SIZE_MB = 50
#
# if uploaded_file:
#     file_size_mb = uploaded_file.size / (1024 * 1024)
#
#     if file_size_mb > MAX_FILE_SIZE_MB:
#         st.error(f"File is too large! Maximum size is {MAX_FILE_SIZE_MB}MB.")
#     else:
#         try:
#             if uploaded_file.name.endswith('.csv'):
#                 df = pd.read_csv(uploaded_file, encoding="ISO-8859-1")
#             else:
#                 df = pd.read_excel(uploaded_file)
#
#             st.success(f"✅ File uploaded successfully ({file_size_mb:.2f} MB).")
#             st.write("📌 **Uploaded data:**")
#             st.dataframe(df.head())
#
#             if "clean_text" in df.columns and "sentiment_label" in df.columns:
#                 tab1, tab2, tab3 = st.tabs(["Without Chi-Square", "With Chi-Square", "Word Cloud"])
#
#                 with tab1:
#                     st.subheader("📊 Sentiment Distribution without Chi-Square")
#                     sentiment_counts = df["sentiment_label"].value_counts()
#
#                     fig, ax = plt.subplots(figsize=(8, 4))
#                     sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, ax=ax, palette="Set2")
#                     ax.set_title("Sentiment Distribution without Chi-Square")
#                     ax.set_xlabel("Sentiment Label")
#                     ax.set_ylabel("Frequency")
#                     st.pyplot(fig)
#
#                 with tab2:
#                     st.subheader("📊 Chi-Square Analysis")
#
#                     contingency_table = pd.crosstab(df["sentiment_label"], columns="count")
#                     chi2, p, dof, expected = chi2_contingency(contingency_table)
#
#                     # DataFrame comparing observed vs expected counts
#                     observed = contingency_table["count"]
#                     expected = pd.Series(np.round(expected).flatten(), index=contingency_table.index)
#
#                     chi_df = pd.DataFrame({
#                         "Observed": observed,
#                         "Expected": expected
#                     })
#
#                     st.write("📌 **Chi-square sentiment distribution:**")
#                     st.dataframe(chi_df)
#
#                     # Bar chart
#                     fig, ax = plt.subplots(figsize=(8, 4))
#                     chi_df[["Observed"]].plot(kind="bar", ax=ax, legend=True, color="skyblue")
#                     plt.title("Sentiment distribution with chi-square")
#                     plt.xlabel("Sentiment Label")
#                     plt.ylabel("Frequency")
#                     plt.xticks(rotation=0)
#                     st.pyplot(fig)
#
#                     st.write(f"📌 **Chi-square statistic**: {chi2:.2f}")
#                     st.write(f"📌 **p-value**: {p:.4f} (below 0.05 indicates a significant difference)")
#
#                 with tab3:
#                     st.subheader("☁️ WordCloud per Sentiment")
#                     stopwords = set(STOPWORDS)
#
#                     for label in df["sentiment_label"].unique():
#                         text = " ".join(df[df["sentiment_label"] == label]["clean_text"].dropna())
#                         wordcloud = WordCloud(width=800, height=400, stopwords=stopwords, background_color="white").generate(text)
#                         st.image(wordcloud.to_array(), caption=f"WordCloud - {label}")
#
#             else:
#                 st.error("❌ The dataset must contain 'clean_text' and 'sentiment_label' columns!")
#
#         except Exception as e:
#             st.error(f"Error while reading the file: {e}")
from pyngrok import ngrok
import time
# Replace the placeholder below with your own ngrok auth token
!ngrok authtoken YOUR_NGROK_AUTHTOKEN
# Make sure app.py has already been written
!streamlit run app.py &
time.sleep(10)
# Give Streamlit a moment to start, then expose port 8501 through ngrok
public_url = ngrok.connect(8501)
print(f"The app is publicly reachable at: {public_url}")
!ngrok http 8501 & streamlit run app.py --server.port 8501 --server.enableCORS false --server.headless true > log.txt 2>&1 &
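# If the tunnel does not come up, the Streamlit output above was redirected to
# log.txt; inspecting it usually reveals the problem.
!tail -n 20 log.txt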