# -*- coding: utf-8 -*-

"""cleaning_text_collab.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1uScAXjTCOp9UdiTPMwH-y2Z5gbn7rWYV
"""

!pip install emoji

import pandas as pd

# Read the Excel file
df = pd.read_excel("clean_text_jaki.xlsx")

# Check the available column names
print(df.columns)

import pandas as pd
import re
import emoji

# Read the Excel file
file_path = "clean_text_jaki.xlsx"  # Replace with your file path
df = pd.read_excel(file_path)

# Function to clean the text
def clean_text(text):
    if isinstance(text, str):  # Make sure the input is a string
        text = text.lower()  # Convert to lowercase
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        text = emoji.replace_emoji(text, replace='')  # Remove emoji and stickers
    return text

# Apply the function to the "clean_text" column
df["clean_text"] = df["clean_text"].apply(clean_text)

# Save back to Excel
df.to_excel("cleaned_data_btg1.xlsx", index=False)
print("Cleaning finished, file saved as cleaned_data_btg1.xlsx")
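
# A minimal sanity check for clean_text; the sample sentence below is made up
# for illustration and is not from the dataset:
sample = "Halo JAKI!!! Pelayanan OK 👍"
print(clean_text(sample))  # roughly: "halo jaki pelayanan ok"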
"""Grafik code GUI Streaamlit. Lanjutan code"""
|
||||
|
||||
from google.colab import files
|
||||
uploaded = files.upload() # Pilih file Excel Anda
|
||||
|
||||
"""Load data excel ke dataframe"""
|
||||
|
||||
import pandas as pd
|
||||
|
||||
# Ganti 'data_sentimen_separated.xlsx' dengan nama file Anda
|
||||
file_path = "/content/data_sentimen_sorted_new.xlsx"
|
||||
|
||||
# Membaca file Excel
|
||||
df = pd.read_excel(file_path)
|
||||
|
||||
# Menampilkan 5 baris pertama
|
||||
df.head()
|
||||
|
||||
"""memastikan kolom clean_text telah terisi"""
|
||||
|
||||
df = df.dropna(subset=['clean_text']) # Hapus baris kosong
|
||||
texts = df['clean_text'].astype(str).tolist() # Konversi ke list string
|
||||
|
||||
"""Load data ke dataframe
|
||||
|
||||
Menghitung TF-IDF
|
||||
"""
|
||||
|
||||
# # Inisialisasi TF-IDF Vectorizer
|
||||
# vectorizer = TfidfVectorizer()
|
||||
# tfidf_matrix = vectorizer.fit_transform(texts)
|
||||
|
||||
# # Konversi hasil ke DataFrame
|
||||
# tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
|
||||
|
||||
# # Menampilkan 5 baris pertama hasil TF-IDF
|
||||
# tfidf_df.head()
|
||||
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
import pandas as pd
|
||||
|
||||
# Inisialisasi TF-IDF Vectorizer dengan filter kata minimal 3 huruf
|
||||
vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b[A-Za-z]{3,}\b')
|
||||
tfidf_matrix = vectorizer.fit_transform(texts)
|
||||
|
||||
# Konversi hasil ke DataFrame
|
||||
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
|
||||
|
||||
# Menampilkan 5 baris pertama hasil TF-IDF
|
||||
tfidf_df.head()
|
||||
|
||||
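
# A small aside: fit_transform returns a SciPy sparse matrix, and .toarray()
# above densifies it, which can be memory-heavy for large corpora. A quick
# check of its size and sparsity:
print(tfidf_matrix.shape)  # (number of documents, vocabulary size)
print("Non-zero entries:", tfidf_matrix.nnz)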
"""Menampilkan Histogram"""
|
||||
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
# Flatten semua nilai TF-IDF menjadi satu array
|
||||
all_tfidf_values = tfidf_df.values.flatten()
|
||||
|
||||
# Plot histogram
|
||||
plt.hist(all_tfidf_values, bins=50, log=True)
|
||||
plt.title('Histogram Semua Nilai TF-IDF')
|
||||
plt.xlabel('TF-IDF Value')
|
||||
plt.ylabel('Frequency')
|
||||
plt.show()
|
||||
|
||||
# tfidf_df['23'].plot(kind='hist', bins=50, log=True, title='23')
|
||||
|
||||
# plt.gca().spines[['top', 'right',]].set_visible(False)
|
||||
|
||||
print(tfidf_df.columns.tolist())
|
||||
|
||||
print(tfidf_df.describe())
|
||||
print(tfidf_df.head())
|
||||
|
||||
"""Menyimpan hasil output TF-IDF excel"""
|
||||
|
||||
output_path = "/content/hasil_tfidf6.xlsx"
|
||||
tfidf_df.to_excel(output_path, index=False)
|
||||
|
||||
print("Hasil TF-IDF berhasil disimpan di:", output_path)
|
||||
|
||||
"""memvisualisasikan 10 data tertinggi TF-IDF menjadi diagram"""
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
|
||||
# Contoh vectorizer
|
||||
vectorizer = TfidfVectorizer()
|
||||
|
||||
|
||||
# Hitung rata-rata skor TF-IDF tiap kata
|
||||
tfidf_mean = tfidf_df.mean().sort_values(ascending=False)
|
||||
|
||||
# Pilih 10 kata dengan nilai TF-IDF tertinggi
|
||||
top_words = tfidf_mean.head(10)
|
||||
|
||||
# Membuat bar chart
|
||||
plt.figure(figsize=(10,5))
|
||||
sns.barplot(x=top_words.values, y=top_words.index, palette="viridis")
|
||||
|
||||
plt.xlabel("Skor TF-IDF")
|
||||
plt.ylabel("Kata")
|
||||
plt.title("10 Kata dengan Nilai TF-IDF Tertinggi")
|
||||
plt.show()
|
||||
|
||||

import pandas as pd

# Replace the file name with the path to your dataset
data = pd.read_excel("data_sentimen_sorted_new.xlsx")

# Check that the dataset loaded correctly
print(data.head())

from sklearn.model_selection import train_test_split

X = data['clean_text']  # Replace 'clean_text' with the feature column in your dataset
y = data['sentiment_label']  # Replace 'sentiment_label' with the target column

print(X.head(), y.head())  # Check that the variables are correct

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, y_train.shape)  # Make sure the data is split correctly

from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer

# 'X_train' and 'y_train' come from the previous cell as pandas Series
# 1. Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# 2. Fit the vectorizer on the full training data (X_train) and transform it
X_train_vec = vectorizer.fit_transform(X_train.astype(str))  # Convert to string explicitly

# 3. Apply SMOTE to the vectorized training data and the corresponding labels
smote = SMOTE(sampling_strategy="auto", random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_vec, y_train)

# 4. Print the shapes to verify that SMOTE worked
print(X_train_balanced.shape, y_train_balanced.shape)
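
# A quick, optional way to confirm the resampling actually balanced the
# classes (Counter is from the standard library):
from collections import Counter
print("Before SMOTE:", Counter(y_train))
print("After SMOTE:", Counter(y_train_balanced))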

import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

# 1. Load the dataset
data = pd.read_excel("data_sentimen_sorted_new.xlsx")  # Replace with the correct dataset name

# 2. Make sure the text and label columns exist
print(data.head())

# 3. Convert the text to numeric vectors with TF-IDF
vectorizer = TfidfVectorizer()
data["clean_text"] = data["clean_text"].fillna('')  # Replace NaN with an empty string
x_tfidf = vectorizer.fit_transform(data["clean_text"])

# 4. Encode the labels as numbers if they are still strings
le = LabelEncoder()
y_encoded = le.fit_transform(data["sentiment_label"])  # Replace with the correct label column

# 5. Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(x_tfidf, y_encoded, test_size=0.2, random_state=42)

# 6. Apply SMOTE to balance the training data
smote = SMOTE(sampling_strategy="auto", random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# 7. Train the model on the balanced data, then save the model and vectorizer
model = MultinomialNB()
model.fit(X_train_balanced, y_train_balanced)  # Fit before saving; otherwise the pickle would hold an untrained model
joblib.dump(model, "model_sentimen.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("Model and vectorizer saved successfully!")
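
# A minimal sketch of loading the saved artifacts back for inference; the
# example sentence is made up for illustration:
loaded_model = joblib.load("model_sentimen.pkl")
loaded_vectorizer = joblib.load("tfidf_vectorizer.pkl")

sample_vec = loaded_vectorizer.transform(["pelayanan cepat dan ramah"])
pred = loaded_model.predict(sample_vec)
print(le.inverse_transform(pred))  # Map the numeric prediction back to the original label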

"""Confusion Matrix"""

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2
from imblearn.over_sampling import SMOTE

# Make sure the numbers of feature rows and labels match
assert len(tfidf_df) == len(df['sentiment_label']), "Row counts of the TF-IDF matrix and the labels do not match!"

# Encode the labels as numbers if they are still text
le = LabelEncoder()
y = le.fit_transform(df['sentiment_label'])

X = tfidf_df  # Features (TF-IDF matrix)

# Split the data into train and test sets with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=16, stratify=y)

# **Apply SMOTE on the training set**
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# **Chi-Square feature selection**
k_features = 1000  # Keep the 1000 best features
chi2_selector = SelectKBest(chi2, k=k_features)

# Fit on the training data only
X_train_chi2 = chi2_selector.fit_transform(X_train_resampled, y_train_resampled)
X_test_chi2 = chi2_selector.transform(X_test)  # Transform the test data with the same selected features

# Initialize the Naïve Bayes model
model = MultinomialNB(alpha=0.5)

# Train the model
model.fit(X_train_chi2, y_train_resampled)

# Predict
y_pred = model.predict(X_test_chi2)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

# **Visualize the confusion matrix**
cm = confusion_matrix(y_test, y_pred)
cm_labels = le.classes_  # Category labels

plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", xticklabels=cm_labels, yticklabels=cm_labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix with Chi-Square Feature Selection")
plt.show()

"""Extract the Chi-Square scores"""

# Get the Chi-Square scores and the feature names
feature_scores = chi2_selector.scores_  # Chi-Square score of each feature
feature_names = tfidf_df.columns  # Feature names from TF-IDF

# Build a DataFrame of the Chi-Square scores
chi2_df = pd.DataFrame({"Feature": feature_names, "Chi2 Score": feature_scores})

# Sort by the highest scores
top_chi2 = chi2_df.nlargest(10, "Chi2 Score")

# **Visualize the 10 words with the highest Chi-Square scores**
plt.figure(figsize=(10, 5))
sns.barplot(x="Chi2 Score", y="Feature", data=top_chi2, palette="Blues_r")
plt.xlabel("Chi-Square Score")
plt.ylabel("Word")
plt.title("Top 10 Words by Chi-Square Score")
plt.show()
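
# A small sketch of listing which features the selector actually kept;
# get_support() returns a boolean mask over the input features:
selected_mask = chi2_selector.get_support()
selected_features = feature_names[selected_mask]
print(f"{len(selected_features)} features kept, e.g.:", selected_features[:10].tolist())
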
!pip install imbalanced-learn

"""SMOTE distribution"""

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Make sure the numbers of feature rows and labels match
assert len(tfidf_df) == len(df['sentiment_label']), "Row counts of the TF-IDF matrix and the labels do not match!"

# Encode the labels as numbers if they are still text
le = LabelEncoder()
y = le.fit_transform(df['sentiment_label'])

X = tfidf_df  # Features (TF-IDF matrix)

# Split the data into train and test sets with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=16, stratify=y)

# 📌 **Check the class distribution before SMOTE**
print("Distribution before SMOTE (training set):")
print(pd.Series(y_train).value_counts(normalize=True))

# 📌 **Apply SMOTE on the training set**
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# 📌 **Check the class distribution after SMOTE**
print("\nDistribution after SMOTE (training set):")
print(pd.Series(y_train_resampled).value_counts(normalize=True))

# 📌 **Check the class distribution in the testing set**
print("\nClass distribution in the testing set:")
print(pd.Series(y_test).value_counts(normalize=True))

# Initialize the Naïve Bayes model with a tunable alpha
model = MultinomialNB(alpha=0.5)  # Try different alpha values to improve performance

# 📌 **Train the model on the SMOTE-resampled data**
model.fit(X_train_resampled, y_train_resampled)

# Predict on the test data
y_pred = model.predict(X_test)

# Evaluate the model
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

# 📊 **Visualize the confusion matrix**
cm = confusion_matrix(y_test, y_pred)
cm_labels = le.classes_  # Category labels

plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", xticklabels=cm_labels, yticklabels=cm_labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix (without Chi-Square)")
plt.show()

"""Optional code; I am not sure about this."""

# import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# # Apply the Chi-Square feature selection to X_test as well
# X_test_chi2 = chi2_selector.transform(X_test)

# # Model predictions
# y_pred = model.predict(X_test_chi2)

# # Compute the evaluation metrics
# report = classification_report(y_test, y_pred, output_dict=True)
# accuracy = accuracy_score(y_test, y_pred)

# # Visualize the confusion matrix
# plt.figure(figsize=(6, 5))
# sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap="Blues")
# plt.xlabel("Predicted")
# plt.ylabel("Actual")
# plt.title("Confusion Matrix")
# plt.show()

# # Visualize accuracy and the evaluation metrics
# metrics = ['precision', 'recall', 'f1-score']
# categories = list(report.keys())[:-3]  # Take the categories, excluding the avg/total rows

# plt.figure(figsize=(8, 5))
# for metric in metrics:
#     scores = [report[category][metric] for category in categories]
#     plt.plot(categories, scores, marker='o', label=metric)

# plt.axhline(y=accuracy, color='r', linestyle='--', label=f'Accuracy ({accuracy:.2f})')
# plt.title("Model Evaluation")
# plt.xlabel("Category")
# plt.ylabel("Score")
# plt.legend()
# plt.show()

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2
from imblearn.over_sampling import SMOTE

# ... (Your existing code for data loading, preprocessing, and splitting) ...

# **Chi-Square feature selection, applied BEFORE SMOTE**
k_features = 1000  # Keep the 1000 best features
chi2_selector = SelectKBest(chi2, k=k_features)
X_train_chi2 = chi2_selector.fit_transform(X_train, y_train)  # Fit on the original training data

# **Apply SMOTE to the selected features**
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_chi2, y_train)

# Transform the test data with the same selector
X_test_chi2 = chi2_selector.transform(X_test)

# ... (Rest of your code for model training, prediction, and evaluation) ...

"""Install wordcloud"""

!pip install wordcloud matplotlib pandas

print(data.columns)

print(df[['sentiment_label', 'clean_text']].head())  # Check the initial data
print(df['clean_text'].isna().sum())  # Check for NaN values
print(df['clean_text'].apply(lambda x: len(str(x).split())).describe())  # Check the word counts

import pandas as pd
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

# 🔹 Load the dataset from Excel (make sure the path is correct)
data = pd.read_excel("data_sentimen_sorted_cleaned.xlsx")

# 🔹 Check that the 'sentiment_label' and 'clean_text' columns exist in the dataset
print(data.columns)

# 🔹 Function to build a WordCloud per class
stopwords = set(STOPWORDS)

def generate_wordcloud(data, label, title):
    # Select the texts for this sentiment label
    text = " ".join(data[data['sentiment_label'] == label]['clean_text'].astype(str))

    # Build the WordCloud
    wordcloud = WordCloud(
        width=1000,
        height=1000,
        background_color="white",
        stopwords=stopwords
    ).generate(text)

    # Display the WordCloud
    plt.figure(figsize=(8, 4))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"{title} - {label}")
    plt.show()

# 🔹 Make sure the required columns exist before iterating
if 'sentiment_label' in data.columns and 'clean_text' in data.columns:
    for label in data['sentiment_label'].unique():
        generate_wordcloud(data, label, "WordCloud without Chi-Square")
else:
    print("Error: column 'sentiment_label' or 'clean_text' not found in the dataset!")

# 🔹 Count the rows per class using 'sentiment_label'
class_counts = data['sentiment_label'].value_counts()

# 🔹 Build a pie chart of the sentiment labels
plt.figure(figsize=(8, 8))  # Larger figure
plt.pie(class_counts, labels=class_counts.index, autopct='%1.1f%%', colors=['lightblue', 'salmon', 'lightgreen'])
plt.title("Sentiment Distribution without Chi-Square")
plt.axis("equal")  # Keep the pie chart perfectly circular
plt.show()
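
# A small optional sketch: a WordCloud can also be written straight to disk
# with to_file(); the output file name here is just an example:
wc_all = WordCloud(width=1000, height=1000, background_color="white",
                   stopwords=stopwords).generate(" ".join(data['clean_text'].astype(str)))
wc_all.to_file("wordcloud_all.png")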

import pandas as pd
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# 🔹 Reuse the dataset after Chi-Square
chi_data = df[['sentiment_label', 'clean_text']].copy()  # Keep only the important columns

# Re-encode 'sentiment_label' from the values in 'chi_data' so the lengths match
le = LabelEncoder()
chi_data['sentiment_label_encoded'] = le.fit_transform(chi_data['sentiment_label'])
y_chi = chi_data['sentiment_label_encoded']
chi_data['sentiment_label'] = le.inverse_transform(y_chi)  # Convert the numbers back to the original labels

# 🔹 Check that the required columns are available
if 'sentiment_label' in chi_data.columns and 'clean_text' in chi_data.columns:
    stopwords = set(STOPWORDS)

    # 🔹 Function to build a WordCloud per class
    def generate_wordcloud(data, label, title):
        text = " ".join(data[data['sentiment_label'] == label]['clean_text'].astype(str))

        wordcloud = WordCloud(
            width=1000,
            height=1000,
            background_color="white",
            stopwords=stopwords
        ).generate(text)

        plt.figure(figsize=(8, 4))
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        plt.title(f"{title} - {label}")
        plt.show()

    # 🔹 Build a WordCloud for each sentiment label
    for label in chi_data['sentiment_label'].unique():
        generate_wordcloud(chi_data, label, "WordCloud with Chi-Square")

    # 🔹 Count the rows per class after Chi-Square
    chi_class_counts = chi_data['sentiment_label'].value_counts()

    # 🔹 Build a pie chart of the sentiment labels
    plt.figure(figsize=(8, 8))
    plt.pie(chi_class_counts, labels=chi_class_counts.index, autopct='%1.1f%%', colors=['lightblue', 'salmon', 'lightgreen'])
    plt.title("Sentiment Distribution with Chi-Square")
    plt.axis("equal")  # Keep the pie chart perfectly circular
    plt.show()

else:
    print("Error: column 'sentiment_label' or 'clean_text' not found in the dataset!")

from sklearn.ensemble import RandomForestClassifier

X = tfidf_df  # Features (TF-IDF values)
y = df['sentiment_label']

# Initialize the Random Forest model
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Split the data into training and testing sets (if not already done)
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model (using X_train rather than X_train_tfidf)
model_rf.fit(X_train, y_train)

# Predict (using X_test rather than X_test_tfidf)
y_pred_rf = model_rf.predict(X_test)

# Evaluate the model
print("Random Forest accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification report:\n", classification_report(y_test, y_pred_rf))
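
# Sketch: inspect which TF-IDF terms the forest weights most heavily;
# feature_importances_ is a standard RandomForestClassifier attribute:
importances = pd.Series(model_rf.feature_importances_, index=tfidf_df.columns)
print(importances.nlargest(10))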

"""Install Streamlit"""

!pip install streamlit
!npm install -g localtunnel
!pip show streamlit

!pip install --upgrade streamlit

!pip install streamlit pyngrok

!find /content -name "app.py"

!cat /content/app.py

import pandas as pd
import numpy as np

df = pd.read_excel("data_sentimen_sorted_new.xlsx")  # Replace with your file

# Check the row count at the start
print(f"DataFrame row count: {len(df)}")

# Example of extra data to insert
new_values = [2356]  # The data to insert (replace with your own variable)

# Check whether the new data matches the DataFrame row count
if len(new_values) == len(df):
    df["new_column"] = new_values
else:
    print(f"⚠️ Row counts do not match! DataFrame: {len(df)}, new data: {len(new_values)}")

    # If new_values is longer, truncate it to fit
    if len(new_values) > len(df):
        new_values = new_values[:len(df)]
    # If new_values is shorter, pad it with NaN so the lengths match
    else:
        new_values = list(new_values) + [np.nan] * (len(df) - len(new_values))

    # Insert the data after the adjustment
    df["new_column"] = new_values

# Check again after the adjustment
print(f"✅ Data inserted, final row count: {len(df)}")

!pip install streamlit pyngrok openpyxl

# Commented out IPython magic to ensure Python compatibility.
# %%writefile app.py
# import streamlit as st
# import pandas as pd
# import matplotlib.pyplot as plt
# import numpy as np
# import seaborn as sns
# from wordcloud import WordCloud, STOPWORDS
# from scipy.stats import chi2_contingency
#
# st.title("📊 Sentiment Data Analysis & Visualization App")
#
# uploaded_file = st.file_uploader("Upload a CSV or Excel dataset (max 50MB)", type=["csv", "xlsx"])
#
# MAX_FILE_SIZE_MB = 50
#
# if uploaded_file:
#     file_size_mb = uploaded_file.size / (1024 * 1024)
#
#     if file_size_mb > MAX_FILE_SIZE_MB:
#         st.error(f"The file is too large! Maximum {MAX_FILE_SIZE_MB}MB.")
#     else:
#         try:
#             if uploaded_file.name.endswith('.csv'):
#                 df = pd.read_csv(uploaded_file, encoding="ISO-8859-1")
#             else:
#                 df = pd.read_excel(uploaded_file)
#
#             st.success(f"✅ File uploaded successfully ({file_size_mb:.2f} MB).")
#             st.write("📌 **Uploaded data:**")
#             st.dataframe(df.head())
#
#             if "clean_text" in df.columns and "sentiment_label" in df.columns:
#                 tab1, tab2, tab3 = st.tabs(["Without Chi-Square", "With Chi-Square", "Word Cloud"])
#
#                 with tab1:
#                     st.subheader("📊 Sentiment Distribution without Chi-Square")
#                     sentiment_counts = df["sentiment_label"].value_counts()
#
#                     fig, ax = plt.subplots(figsize=(8, 4))
#                     sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, ax=ax, palette="Set2")
#                     ax.set_title("Sentiment Distribution without Chi-Square")
#                     ax.set_xlabel("Sentiment Label")
#                     ax.set_ylabel("Frequency")
#                     st.pyplot(fig)
#
#                 with tab2:
#                     st.subheader("📊 Chi-Square Analysis")
#
#                     contingency_table = pd.crosstab(df["sentiment_label"], columns="count")
#                     chi2, p, dof, expected = chi2_contingency(contingency_table)
#
#                     # DataFrame comparing observed vs expected counts
#                     observed = contingency_table["count"]
#                     expected = pd.Series(np.round(expected).flatten(), index=contingency_table.index)
#
#                     chi_df = pd.DataFrame({
#                         "Observed": observed,
#                         "Expected": expected
#                     })
#
#                     st.write("📌 **Chi-square sentiment distribution:**")
#                     st.dataframe(chi_df)
#
#                     # Bar chart
#                     fig, ax = plt.subplots(figsize=(8, 4))
#                     # chi_df.plot(kind="bar", ax=ax)
#                     chi_df[["Observed"]].plot(kind="bar", ax=ax, legend=True, color="skyblue")
#                     plt.title("Sentiment distribution with chi-square")
#                     plt.xlabel("Sentiment Label")
#                     plt.ylabel("Frequency")
#                     plt.xticks(rotation=0)
#                     st.pyplot(fig)
#
#                     st.write(f"📌 **Chi-Square value**: {chi2:.2f}")
#                     st.write(f"📌 **p-value**: {p:.4f} (if < 0.05, the difference is significant)")
#
#                 with tab3:
#                     st.subheader("☁️ WordCloud per Sentiment")
#                     stopwords = set(STOPWORDS)
#
#                     for label in df["sentiment_label"].unique():
#                         text = " ".join(df[df["sentiment_label"] == label]["clean_text"].dropna())
#                         wordcloud = WordCloud(width=800, height=400, stopwords=stopwords, background_color="white").generate(text)
#                         st.image(wordcloud.to_array(), caption=f"WordCloud - {label}")
#
#             else:
#                 st.error("❌ The dataset must contain the 'clean_text' and 'sentiment_label' columns!")
#
#         except Exception as e:
#             st.error(f"An error occurred while reading the file: {e}")

from pyngrok import ngrok
import time

# Replace this with your own ngrok token
!ngrok authtoken 2uYp3Yv9sSFl9B6VT5dXt1qXSXS_3xG2usiFQQiRo8ca2y5Bh

# Make sure you have written app.py beforehand
!streamlit run app.py &
time.sleep(10)

# Wait a moment, then expose the app through ngrok
public_url = ngrok.connect(8501)
print(f"The app is publicly accessible at: {public_url}")

!ngrok http 8501 & streamlit run app.py --server.port 8501 --server.enableCORS false --server.headless true > log.txt 2>&1 &
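
# An alternative to ngrok, assuming Streamlit is already running on port 8501.
# localtunnel was installed with npm earlier in this notebook; the command
# prints a public URL and blocks the cell while the tunnel stays open.
!npx localtunnel --port 8501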