# -*- coding: utf-8 -*-
"""ready.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1NIVpwbG32d21mzqKM560fRvL2K4herFa

####Connect to Google Drive
"""

from google.colab import drive

drive.mount('/content/drive')
"""####Menampilkan file csv"""
|
|
|
|
import pandas as pd
|
|
|
|
# Membaca dataset CSV
|
|
file_path = '/content/drive/MyDrive/Dataset/Pelabelan.csv'
|
|
data = pd.read_csv(file_path)
|
|
|
|
# Menampilkan data
|
|
print(data.head())
|
|
|
|
data.info()
|
|
|
|
"""####Menampilkan Kolom full_text"""
|
|
|
|
import pandas as pd
|
|
|
|
# Memuat dataset dari file CSV
|
|
df = pd.read_csv("/content/drive/MyDrive/Dataset/Pelabelan.csv")
|
|
|
|
# Menampilkan kolom fulltext dan label
|
|
df_subset = df[['full_text', 'label']]
|
|
|
|
# Menampilkan beberapa baris pertama dari kolom tersebut
|
|
print(df_subset.head())
|
|
|
|
"""####Menampilkan Jumlah Sentimen"""
|
|
|
|
import pandas as pd
|
|
|
|
data = pd.read_csv("/content/drive/MyDrive/Dataset/Pelabelan.csv")
|
|
# Menampilkan jumlah label
|
|
jumlah_label = df['label'].value_counts()
|
|
|
|
# Menampilkan hasil
|
|
print("Jumlah label sentimen:")
|
|
print(f"Positif: {jumlah_label.get(1, 0)}")
|
|
print(f"Netral: {jumlah_label.get(0, 0)}")
|
|
print(f"Negatif: {jumlah_label.get(-1, 0)}")
|
|
|
|
"""##Pre-Processing
|
|
|
|
####Wordcloud Sebelum Pre-processing
|
|
"""
|
|
|
|
import pandas as pd
|
|
import numpy as np
|
|
from PIL import Image
|
|
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
|
|
import matplotlib.pyplot as plt
|
|
|
|
# Membaca file CSV
|
|
df = pd.read_csv('/content/drive/MyDrive/Dataset/Pelabelan.csv') # Ganti dengan path file CSV Anda
|
|
|
|
df['full_text'] = df['full_text'].fillna('')
|
|
|
|
text = ' '.join(df['full_text'].astype(str).tolist())
|
|
|
|
stopwords = set(STOPWORDS)
|
|
stopwords.update(['https', 'co', 'RT','...', 'amp','t'])
|
|
|
|
# Membuat WordCloud
|
|
wc = WordCloud(stopwords=stopwords, background_color="white", max_words=500, width=800, height=400)
|
|
|
|
wc.generate(text)
|
|
|
|
# Menampilkan WordCloud
|
|
plt.figure(figsize=(10, 5))
|
|
plt.imshow(wc, interpolation='bilinear')
|
|
plt.axis("off")
|
|
plt.show()
|
|
|
|
"""####Frekuensi Kata"""
|
|
|
|
import matplotlib.pyplot as plt
|
|
from wordcloud import WordCloud
|
|
from collections import Counter
|
|
import re
|
|
|
|
# Load Data
|
|
df = pd.read_csv('/content/drive/MyDrive/Dataset/Pelabelan.csv')
|
|
text = " ".join(df["full_text"])
|
|
|
|
# Normalisasi teks: huruf kecil dan hanya ambil kata
|
|
text = text.lower()
|
|
tokens = re.findall(r'\b\w+\b', text) # Menghilangkan tanda baca
|
|
|
|
# Daftar kata yang ingin dihapus
|
|
custom_stopwords = {'https', 'co', 'rt', 'amp', 't', '...'}
|
|
|
|
# Filter token yang tidak ada di stopwords
|
|
filtered_tokens = [word for word in tokens if word not in custom_stopwords]
|
|
|
|
# Hitung frekuensi kata setelah filtering
|
|
word_counts = Counter(filtered_tokens)
|
|
|
|
# Tampilkan WordCloud
|
|
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_counts)
|
|
|
|
plt.figure(figsize=(10, 5))
|
|
plt.imshow(wordcloud, interpolation='bilinear')
|
|
plt.axis("off")
|
|
plt.title("WordCloud dari Frekuensi Kata")
|
|
plt.show()
|
|
|
|
# Tampilkan Frekuensi Kata dengan Bar Chart
|
|
top_words = word_counts.most_common(10) # Ambil 10 kata terbanyak
|
|
words, counts = zip(*top_words)
|
|
colors = plt.cm.tab10(range(len(words)))
|
|
|
|
plt.figure(figsize=(12, 6))
|
|
plt.bar(words, counts, color=colors)
|
|
plt.xlabel("Kata-Kata Sering Muncul", fontsize=12, fontweight='bold')
|
|
plt.ylabel("Jumlah Kata", fontsize=12, fontweight='bold')
|
|
plt.title("Frekuensi Kata", fontsize=12, fontweight='bold')
|
|
plt.xticks(rotation=45)
|
|
|
|
plt.show()
|
|
|
|
"""####Cleaning Data"""
|
|
|
|
import pandas as pd
|
|
import re
|
|
import string
|
|
|
|
# Fungsi untuk membersihkan teks tweet
|
|
def clean_tweet(tweet):
|
|
# 1. Menghilangkan URL
|
|
tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
|
|
|
|
# 2. Menghapus @mentions
|
|
tweet = re.sub(r'@\w+', '', tweet)
|
|
|
|
# 3. Menghilangkan Hashtag (#)
|
|
tweet = re.sub(r'#\w+', '', tweet)
|
|
|
|
# 4. Menghilangkan Tanda Baca
|
|
tweet = tweet.translate(str.maketrans('', '', string.punctuation))
|
|
|
|
# 5. Menghilangkan Angka
|
|
tweet = re.sub(r'\d+', '', tweet)
|
|
|
|
# 6. Menghapus spasi ekstra
|
|
tweet = tweet.strip()
|
|
|
|
return tweet
|
|
|
|
# Membaca file CSV yang berisi tweet
|
|
file_path = '/content/drive/MyDrive/Dataset/Pelabelan.csv'
|
|
df = pd.read_csv(file_path)
|
|
|
|
# Asumsikan kolom yang berisi tweet bernama 'full_text' dan kolom label bernama 'label'
|
|
# Menambahkan kolom baru untuk tweet yang sudah dibersihkan
|
|
df['cleaning_data'] = df['full_text'].apply(clean_tweet)
|
|
|
|
# Menampilkan dataframe dengan kolom original, cleaned tweet, dan label
|
|
df_cleaned = df[['full_text', 'cleaning_data', 'label']] # Menambahkan kolom label
|
|
|
|
# Tampilkan hasil dalam bentuk dataframe
|
|
print(df_cleaned.head()) # Menampilkan 5 baris pertama dari dataframe
|
|
|
|
# Simpan hasil ke file CSV
|
|
df_cleaned.to_csv('/content/drive/MyDrive/Dataset/Klasifikasi/cleaning_data.csv', index=False)
|
|
|
|
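# Quick sanity check of clean_tweet on a made-up example tweet (not from the
# dataset): the URL, mention, hashtag, punctuation, and digits should all drop out.
contoh = "RT @user: cek https://t.co/abc hasil #pemilu2024 100% valid!!!"
print(clean_tweet(contoh))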
"""####Case Folding"""
|
|
|
|
import pandas as pd
|
|
|
|
# Membaca file CSV yang sudah dibersihkan
|
|
file_path = '/content/drive/MyDrive/Dataset/cleaning_data.csv'
|
|
df = pd.read_csv(file_path)
|
|
|
|
# Melakukan case folding pada kolom 'cleaned_tweet'
|
|
df['case_folding_lower'] = df['cleaning_data'].str.lower()
|
|
|
|
# Menampilkan hasil DataFrame dengan kolom baru 'cleaned_tweet_lower'
|
|
print(df[['cleaning_data', 'case_folding_lower','label']].head())
|
|
|
|
# Jika ingin menyimpan hasilnya ke CSV baru
|
|
df.to_csv('/content/drive/MyDrive/Dataset/Klasifikasi/case_folding_data.csv', index=False)
|
|
|
|
"""####Tokenizing"""
|
|
|
|
import pandas as pd
|
|
import nltk
|
|
from nltk.tokenize import word_tokenize
|
|
|
|
# Download tokenizer jika belum ada
|
|
nltk.download('punkt_tab')
|
|
|
|
# Membaca file CSV yang berisi data teks
|
|
file_path = '/content/drive/MyDrive/Dataset/case_folding_data.csv' # Gantilah dengan path file yang sesuai
|
|
df = pd.read_csv(file_path)
|
|
|
|
# Asumsikan kolom yang berisi teks bernama 'cleaned_tweet_lower'
|
|
# Melakukan tokenization pada kolom 'cleaned_tweet_lower'
|
|
df['tokens'] = df['case_folding_lower'].apply(word_tokenize)
|
|
|
|
# Menampilkan hasil DataFrame dengan kolom baru 'tokens'
|
|
print(df[['case_folding_lower', 'tokens', 'label']].head())
|
|
|
|
# Jika ingin menyimpan hasilnya ke CSV baru
|
|
df.to_csv('/content/drive/MyDrive/Dataset/Klasifikasi/tokenizing_data.csv', index=False)
|
|
|
|
"""####Normalisasi"""
|
|
|
|
import pandas as pd
|
|
|
|
# Membaca file CSV yang berisi istilah normalisasi
|
|
norm_file_path = '/content/drive/MyDrive/Dataset/baru.csv' # Gantilah dengan path file yang sesuai
|
|
norm_df = pd.read_csv(norm_file_path)
|
|
|
|
# Membuat dictionary dari DataFrame
|
|
norm = dict(zip(norm_df['normalisasi1'], norm_df['normalisasi2']))
|
|
|
|
def normalisasi(tokens):
|
|
# Mengganti token sesuai dengan dictionary norm
|
|
normalized_tokens = [norm.get(token, token) for token in tokens]
|
|
return ' '.join(normalized_tokens) # Menggabungkan kembali menjadi string
|
|
|
|
# Membaca file CSV yang sudah dilakukan tokenisasi
|
|
file_path = '/content/drive/MyDrive/Dataset/tokenizing_data.csv' # Gantilah dengan path file yang sesuai
|
|
data = pd.read_csv(file_path)
|
|
|
|
# Melakukan normalisasi pada kolom 'tokens'
|
|
data['normalisasi_data'] = data['tokens'].apply(lambda x: normalisasi(eval(x))) # Menggunakan eval untuk mengonversi string ke list
|
|
|
|
# Menampilkan hasil DataFrame dengan kolom baru 'normalized_text'
|
|
print(data[['tokens', 'normalisasi_data', 'label']].head())
|
|
|
|
# Jika ingin menyimpan hasilnya ke CSV baru
|
|
data.to_csv('/content/drive/MyDrive/Dataset/Klasifikasi/normalisasi_data.csv', index=False)
|
|
|
|
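# Illustration of the token-replacement idea with a local, hypothetical
# dictionary (independent of whatever entries baru.csv actually contains):
demo_norm = {'gk': 'tidak', 'bgt': 'banget'}
demo_tokens = ['gk', 'suka', 'bgt']
print(' '.join(demo_norm.get(t, t) for t in demo_tokens))  # -> tidak suka banget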
"""####Stopword"""
|
|
|
|
!pip install Sastrawi
|
|
|
|
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
|
|
from nltk.stem import PorterStemmer
|
|
from nltk.stem.snowball import SnowballStemmer
|
|
|
|
import Sastrawi
|
|
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
|
|
import pandas as pd
|
|
|
|
# Inisialisasi stopwords dan stopword remover
|
|
more_stop_word = []
|
|
|
|
stop_words = StopWordRemoverFactory().get_stop_words()
|
|
new_array = ArrayDictionary(stop_words)
|
|
stop_words_remover_new = StopWordRemover(new_array)
|
|
|
|
# Fungsi untuk melakukan stopword removal
|
|
def stopword(str_text):
|
|
return stop_words_remover_new.remove(str_text)
|
|
|
|
# Terapkan stopword removal pada kolom 'normalized_text'
|
|
# dan simpan hasilnya di kolom baru 'text_without_stopwords'
|
|
data['stopwords'] = data['normalisasi_data'].apply(lambda x: stopword(x))
|
|
|
|
# Menyimpan hasil ke file CSV baru
|
|
data.to_csv('/content/drive/MyDrive/Dataset/Klasifikasi/stopword_data.csv', index=False)
|
|
|
|
"""####Stemming Data"""
|
|
|
|
import Sastrawi
|
|
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
|
|
import pandas as pd
|
|
|
|
# Membuat objek stemmer
|
|
factory = StemmerFactory()
|
|
stemmer = factory.create_stemmer()
|
|
|
|
# Fungsi untuk melakukan stemming pada teks
|
|
def stem_text(stopwords):
|
|
return stemmer.stem(stopwords)
|
|
|
|
# Membaca file CSV atau DataFrame
|
|
data = pd.read_csv('/content/drive/MyDrive/Dataset/stopword_data.csv') # Ganti dengan path file CSV Anda
|
|
|
|
# Terapkan stemming pada kolom 'normalized_text'
|
|
data['stemming'] = data['stopwords'].apply(lambda x: stem_text(x))
|
|
|
|
# Menampilkan DataFrame dengan kolom stopwords, stemming, dan label
|
|
data_display = data[['stopwords', 'stemming', 'label']] # Menambahkan kolom label
|
|
|
|
# Tampilkan hasil dalam bentuk DataFrame
|
|
print(data_display.head()) # Menampilkan 5 baris pertama dari DataFrame
|
|
|
|
# Menyimpan hasil ke file CSV baru (opsional)
|
|
data.to_csv('/content/drive/MyDrive/Dataset/Klasifikasi/stemming_data.csv', index=False)
|
|
|
|
print("Hasil stemming disimpan ke 'file_hasil_stemming.csv'")
|
|
|
|
"""####Menampilkan wordcloud setelah pre-processing data"""
|
|
|
|
import pandas as pd
|
|
from wordcloud import WordCloud
|
|
import matplotlib.pyplot as plt
|
|
|
|
# Membaca file CSV hasil pre-processing (yang sudah dilakukan stopword removal, stemming, dll.)
|
|
data = pd.read_csv('/content/drive/MyDrive/Dataset/Klasifikasi/stemming_data.csv')
|
|
|
|
# Menggabungkan semua teks dari kolom 'text_stemmed' menjadi satu string
|
|
text = ' '.join(data['stemming'].dropna().tolist())
|
|
|
|
# Membuat WordCloud
|
|
wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='viridis').generate(text)
|
|
|
|
# Menampilkan WordCloud
|
|
plt.figure(figsize=(10, 5))
|
|
plt.imshow(wordcloud, interpolation='bilinear')
|
|
plt.axis('off') # Menghilangkan axis
|
|
plt.show()
|
|
|
|
"""####Menampilkan data setelah pre-processing"""
|
|
|
|
import pandas as pd
|
|
|
|
# Membaca dataset CSV
|
|
file_path = '/content/drive/MyDrive/Dataset/stemming_data.csv'
|
|
data = pd.read_csv(file_path)
|
|
|
|
# Menampilkan data
|
|
print(data.head(10)) # Menampilkan 5 baris pertama
|
|
|
|
import pandas as pd
|
|
|
|
# Memuat dataset dari file CSV
|
|
df = pd.read_csv("/content/drive/MyDrive/Dataset/Klasifikasi/stemming_data.csv")
|
|
|
|
# Menampilkan kolom fulltext dan label
|
|
df_subset = df[['stemming', 'label']]
|
|
|
|
# Menampilkan beberapa baris pertama dari kolom tersebut
|
|
print(df_subset.head())
|
|
df_subset.to_csv('/content/drive/MyDrive/Dataset/Klasifikasi/hasil.csv', index=0)
|
|
|
|
"""####Cek data yang kosong"""
|
|
|
|
import pandas as pd
|
|
|
|
# Contoh: Membaca dataset dari file CSV
|
|
df = pd.read_csv('/content/drive/MyDrive/Dataset/Klasifikasi/stemming_data.csv')
|
|
|
|
df.replace(['N/A', 'NULL', 'null'], pd.NA, inplace=True)
|
|
|
|
# Menampilkan jumlah nilai kosong (NaN) di setiap kolom
|
|
missing_values = df.isnull().sum()
|
|
|
|
# Menampilkan kolom yang memiliki nilai kosong
|
|
columns_with_missing = missing_values[missing_values > 0]
|
|
|
|
print("Kolom yang memiliki nilai kosong:\n", columns_with_missing)
|
|
|
|
print(df.isnull().sum())
|
|
|
|
empty_columns = df.columns[df.isnull().all()]
|
|
print(empty_columns)
|
|
|
|
print(df.dtypes)
|
"""####Penghitungan Manual Tf-Idf"""
|
|
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
import pandas as pd
|
|
import numpy as np
|
|
|
|
# Data dokumen
|
|
documents = [
|
|
"putus mk semua harap terima hasil milu",
|
|
"analisa aku bos indonesia punya sejarah baru hasil milu batal",
|
|
"ingin indonesia pimpin hasil milu bersih jurdil martabat anis baswedan salah satu orang"
|
|
]
|
|
|
|
# Konfigurasi TfidfVectorizer agar sesuai dengan perhitungan manual
|
|
vectorizer = TfidfVectorizer(smooth_idf=False, norm=None, use_idf=True)
|
|
|
|
# Transformasi dokumen menjadi TF-IDF
|
|
tfidf_matrix = vectorizer.fit_transform(documents)
|
|
|
|
# Konversi hasil ke DataFrame
|
|
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
|
|
|
|
# Tampilkan hasil TF-IDF
|
|
print("TF-IDF Matrix:")
|
|
print(df_tfidf)
|
|
|
|
# Menampilkan nilai IDF
|
|
df_idf = pd.DataFrame({"Kata": vectorizer.get_feature_names_out(), "IDF": vectorizer.idf_})
|
|
print("\nIDF Values:")
|
|
print(df_idf)
|
|
|
|
tfidf_sum_per_doc = np.sum(tfidf_matrix.toarray(), axis=1)
|
|
|
|
print("Jumlah total TF-IDF per dokumen:", tfidf_sum_per_doc)
|
|
|
|
# Simpan hasil ke CSV
|
|
#df_tfidf.to_csv("/content/drive/MyDrive/Dataset/hasil_tfidf_3dokumentbaru.csv", index=False)
|
|
#df_idf.to_csv("/content/drive/MyDrive/Dataset/hasil_idf_3dokumentbaru.csv", index=False)
|
|
print("\nHasil TF-IDF dan IDF berhasil disimpan ke hasil_tfidf.csv & hasil_idf.csv")
|
|
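# Sanity check on the manual computation: with smooth_idf=False, scikit-learn
# defines idf(t) = ln(n / df(t)) + 1, so the IDF column above can be
# reproduced from the raw document frequencies. A minimal sketch:
n_docs = len(documents)
terms = vectorizer.get_feature_names_out()

# Document frequency: number of documents containing each term
df_counts = np.array([sum(term in doc.split() for doc in documents) for term in terms])

manual_idf = np.log(n_docs / df_counts) + 1
assert np.allclose(manual_idf, vectorizer.idf_)
print("Manual IDF matches vectorizer.idf_")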
"""####Split Data"""
|
|
|
|
import pandas as pd
|
|
|
|
# Memuat dataset dari file CSV
|
|
df = pd.read_csv("/content/drive/MyDrive/Dataset/Klasifikasi/hasil.csv")
|
|
|
|
# Menampilkan kolom fulltext dan label
|
|
df_subset = df[['stemming', 'label']]
|
|
|
|
# Menampilkan beberapa baris pertama dari kolom tersebut
|
|
print(df_subset.head())
|
|
|
|
df.info()
|
|
|
|
# Split data into training and test set
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
# Pertama, bagi dataset menjadi training (60%) dan sisanya (40%)
|
|
#X_train, X_temp, y_train, y_temp = train_test_split(df['stemming'], df['label'], test_size=0.4, random_state=42)
|
|
# Kedua, bagi data sisanya menjadi validation (20%) dan testing (20%)
|
|
#X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
|
|
|
|
X_train, X_test, y_train, y_test = train_test_split(df['stemming'], df['label'],
|
|
test_size=0.2, random_state=42)
|
|
|
|
#Simpan data latih ke file
|
|
train_set = pd.DataFrame({'text': X_train, 'label': y_train})
|
|
train_set.to_csv('/content/drive/MyDrive/Dataset/Klasifikasi/train_9010baru.csv', index=False)
|
|
|
|
#Simpan data latih ke file
|
|
test_set = pd.DataFrame({'text': X_test, 'label': y_test})
|
|
test_set.to_csv('/content/drive/MyDrive/Dataset/Klasifikasi/test_9010baru.csv', index=False)
|
|
|
|
#Menampilkan informasi jumlah data
|
|
print(f'Jumlah Data Latih: {len(X_train)}')
|
|
print(f'Jumlah Data Uji: {len(X_test)}')
|
|
|
|
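# If the three sentiment classes are imbalanced, a stratified split keeps the
# label proportions identical in train and test. A minimal variant of the
# split above (stratify is the only new argument; the *_s names are
# illustrative and not used elsewhere):
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    df['stemming'], df['label'], test_size=0.2, random_state=42, stratify=df['label'])
print(y_train_s.value_counts(normalize=True))
print(y_test_s.value_counts(normalize=True))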
import matplotlib.pyplot as plt

# Number of training and test samples
train_size = len(X_train)
test_size = len(X_test)

# Build the plot
plt.figure(figsize=(6, 4))
bars = plt.bar(['Data Training', 'Data Testing'], [train_size, test_size], color=['blue', 'orange'])

# Add a label to each bar
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, height + 20, f'{height} ({height / (train_size + test_size) * 100:.2f}%)',
             ha='center', va='bottom')

plt.title('Number of Training and Test Samples')
plt.xlabel('Data Split')
plt.ylabel('Sample Count')
plt.show()
"""#TF-IDF"""
|
|
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.model_selection import train_test_split
|
|
import pandas as pd
|
|
|
|
vectorizer = TfidfVectorizer(smooth_idf=False, norm=None, use_idf=True)
|
|
X_train_tfidf = vectorizer.fit_transform(X_train)
|
|
X_test_tfidf = vectorizer.transform(X_test)
|
|
|
|
# Konversi hasil ke DataFrame
|
|
df_tfidf = pd.DataFrame(X_train_tfidf.toarray(), columns=vectorizer.get_feature_names_out())
|
|
df_idf = pd.DataFrame({"Kata": vectorizer.get_feature_names_out(), "IDF": vectorizer.idf_})
|
|
print("\nTF-IDF Matrix (Training Set):")
|
|
print(df_tfidf)
|
|
print(df_idf)
|
|
|
|
# Simpan hasil ke CSV
|
|
#df_tfidf.to_csv("/content/drive/MyDrive/Dataset/Klasifikasi/hasil_tfidf8020.csv", index=False)
|
|
#df_idf.to_csv("/content/drive/MyDrive/Dataset/Klasifikasi/hasil_idf8020.csv", index=False)
|
|
#print("\nHasil TF-IDF dan IDF berhasil disimpan ke hasil_tfidf.csv & hasil_idf.csv")
|
|
|
|
"""#Chi-square"""
|
|
|
|
from sklearn.feature_selection import chi2
|
|
import numpy as np
|
|
|
|
# Hitung nilai Chi-Square dan p-value
|
|
chi2_scores, p_values = chi2(X_train_tfidf, y_train)
|
|
|
|
# Simpan hasil Chi-Square & p-value ke DataFrame
|
|
chi2_df = pd.DataFrame({
|
|
"Feature": vectorizer.get_feature_names_out(),
|
|
"Chi2 Score": chi2_scores,
|
|
"p-value": p_values
|
|
})
|
|
|
|
# Pilih fitur dengan p-value < 0.05 (tingkat signifikansi 5%)
|
|
alpha = 0.01
|
|
selected_features = chi2_df[chi2_df["p-value"] < alpha]["Feature"].values
|
|
|
|
# Tampilkan fitur yang terpilih berdasarkan p-value
|
|
print("\nFitur Terpilih dengan p-value < 0.01:")
|
|
print(selected_features)
|
|
|
|
chi2_df.to_csv("/content/drive/MyDrive/Dataset/Klasifikasi/pvalue_9010_0.05.csv", index=False)
|
|
|
|
df_selected_features = pd.DataFrame({"Selected Features": selected_features})
|
|
df_selected_features.to_csv("/content/drive/MyDrive/Dataset/Klasifikasi/selected_9010_0.05.csv", index=False)
|
|
|
|
print("\nHasil Chi-Square & p-value berhasil disimpan ke hasil_chi2_pvalue.csv & hasil_chi2_selected.csv")
|
|
|
|
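# The cell above only lists and saves the significant features; to actually
# train on them, the same p-value mask can be applied to the TF-IDF matrices.
# A minimal sketch (the *_sel names are illustrative); this is equivalent to
# scikit-learn's SelectFpr(chi2, alpha=alpha):
mask = p_values < alpha
X_train_sel = X_train_tfidf[:, mask]
X_test_sel = X_test_tfidf[:, mask]
print("Features kept:", int(mask.sum()), "of", mask.shape[0])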
"""#Probabilitas"""
|
|
|
|
import pandas as pd
|
|
|
|
# Memuat dataset dari file CSV
|
|
df = pd.read_csv("/content/drive/MyDrive/Dataset/Klasifikasi/train_9010baru.csv")
|
|
|
|
# Menampilkan kolom fulltext dan label
|
|
df_subset = df[['text', 'label']]
|
|
|
|
# Menampilkan beberapa baris pertama dari kolom tersebut
|
|
print(df_subset.head())
|
|
|
|
df.info()
|
|
|
|
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['text'])
features = vectorizer.get_feature_names_out()
df_tokens = pd.DataFrame(X.toarray(), columns=features)

df_tokens = df_tokens.rename(columns=lambda x: x + '_' if x in df.columns else x)
data_combined = pd.concat([df_tokens, df['label']], axis=1)

# Count token frequencies for each sentiment
frequency_positif = data_combined[data_combined['label'] == 1].drop('label', axis=1).sum()
frequency_negatif = data_combined[data_combined['label'] == -1].drop('label', axis=1).sum()
frequency_netral = data_combined[data_combined['label'] == 0].drop('label', axis=1).sum()

# Compute the prior probabilities
sentiment_counts = df['label'].value_counts()
total_samples = len(df)
prior_probabilities = sentiment_counts / total_samples

print("Prior Probabilities:")
print(prior_probabilities)

# Compute the conditional probabilities with Laplace smoothing
total_positive = frequency_positif.sum()
total_negative = frequency_negatif.sum()
total_netral = frequency_netral.sum()

# Laplace smoothing adds +1 to the count for every class
# (the original used -1 and 0 here, which breaks the smoothing)
probability_conditional_positive = (frequency_positif + 1) / (total_positive + len(features))
probability_conditional_negative = (frequency_negatif + 1) / (total_negative + len(features))
probability_conditional_netral = (frequency_netral + 1) / (total_netral + len(features))

print("Probability Conditional Positive:")
print(probability_conditional_positive)
print("\nProbability Conditional Negative:")
print(probability_conditional_negative)
print("Probability Conditional Neutral:")
print(probability_conditional_netral)

def calculate_posterior_probabilities(document):
    words = document.split()
    posterior_positive = prior_probabilities[1]
    posterior_negative = prior_probabilities[-1]
    posterior_netral = prior_probabilities[0]

    for word in words:
        # All three Series share the same vocabulary index
        if word in probability_conditional_positive.index:
            posterior_positive *= probability_conditional_positive[word]
            posterior_negative *= probability_conditional_negative[word]
            posterior_netral *= probability_conditional_netral[word]
    return {'1': posterior_positive, '-1': posterior_negative, '0': posterior_netral}

df['posterior_probabilities'] = df['text'].apply(calculate_posterior_probabilities)
df.head()
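# Multiplying many small conditional probabilities underflows to 0.0 for long
# documents. A common remedy is to sum log-probabilities instead; a minimal
# sketch of the same posterior in log space (the function name is illustrative):
import math

def calculate_log_posteriors(document):
    log_post = {c: math.log(prior_probabilities[c]) for c in (1, -1, 0)}
    cond = {1: probability_conditional_positive,
            -1: probability_conditional_negative,
            0: probability_conditional_netral}
    for word in document.split():
        if word in probability_conditional_positive.index:
            for c in (1, -1, 0):
                log_post[c] += math.log(cond[c][word])
    return log_post  # the argmax over classes is unchanged by the log transform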
"""####Naive Bayes"""
|
|
|
|
import pandas as pd
|
|
import numpy as np
|
|
import seaborn as sns
|
|
import matplotlib.pyplot as plt
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
# from sklearn.feature_selection import SelectKBest, chi2
|
|
from sklearn.naive_bayes import MultinomialNB
|
|
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
|
|
|
|
# from sklearn.feature_selection import SelectFpr, chi2
|
|
# Pilih fitur dengan p-value < 0.05 berdasarkan Chi-Square
|
|
# chi2_selector = SelectFpr(chi2, alpha=0.05)
|
|
# X_train_chi2 = chi2_selector.fit_transform(X_train_tfidf, y_train)
|
|
# X_test_chi2 = chi2_selector.transform(X_test_tfidf) # Terapkan seleksi fitur ke data uji
|
|
|
|
from sklearn.feature_selection import SelectPercentile, chi2
|
|
# Pilih 30% fitur terbaik berdasarkan nilai chi-square
|
|
chi2_selector = SelectPercentile(chi2, percentile=30)
|
|
X_train_chi2 = chi2_selector.fit_transform(X_train_tfidf, y_train)
|
|
X_test_chi2 = chi2_selector.transform(X_test_tfidf) # Terapkan seleksi fitur ke data uji
|
|
|
|
# 1. Pilih fitur terbaik dengan Chi-Square
|
|
# chi2_selector = SelectKBest(chi2, k=1500) # Pilih 2100 fitur terbaik
|
|
# X_train_chi2 = chi2_selector.fit_transform(X_train_tfidf, y_train)
|
|
# X_test_chi2 = chi2_selector.transform(X_test_tfidf) # Terapkan seleksi fitur ke data uji
|
|
|
|
# 2. Latih model Naïve Bayes
|
|
nb_model = MultinomialNB()
|
|
nb_model.fit(X_train_chi2, y_train)
|
|
|
|
# 3. Prediksi
|
|
y_pred = nb_model.predict(X_test_chi2)
|
|
|
|
# 4. Evaluasi Model
|
|
accuracy = accuracy_score(y_test, y_pred)
|
|
print("Akurasi:", accuracy)
|
|
print("\nClassification Report:")
|
|
print(classification_report(y_test, y_pred))
|
|
|
|
# 5. Confusion Matrix
|
|
conf_matrix = confusion_matrix(y_test, y_pred)
|
|
|
|
# 6. Visualisasi Confusion Matrix
|
|
plt.figure(figsize=(6, 4))
|
|
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="Blues", xticklabels=['Negatif', 'Netral', 'Positif'],
|
|
yticklabels=['Negatif', 'Netral', 'Positif'])
|
|
plt.title('Confusion Matrix Naïve Bayes')
|
|
plt.xlabel('Predicted')
|
|
plt.ylabel('Actual')
|
|
plt.show()
|
|
|
|
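# A single 80/20 split can be sensitive to random_state; stratified k-fold
# cross-validation gives a more stable accuracy estimate. A minimal sketch
# mirroring the parameters above, using a Pipeline so TF-IDF and the
# chi-square selector are refit inside every fold (no leakage from the
# held-out folds):
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold

pipe = Pipeline([
    ('tfidf', TfidfVectorizer(smooth_idf=False, norm=None, use_idf=True)),
    ('chi2', SelectPercentile(chi2, percentile=30)),
    ('nb', MultinomialNB()),
])
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring='accuracy')
print(f"CV accuracy: {scores.mean():.4f} +/- {scores.std():.4f}")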
from sklearn.naive_bayes import MultinomialNB

# Train the model
nb_model = MultinomialNB()
nb_model.fit(X_train_chi2, y_train)

# Predict class probabilities
y_proba = nb_model.predict_proba(X_test_chi2)
print("Predicted probabilities:", y_proba)

import numpy as np

# Show the predicted probabilities for the test data
for i, text in enumerate(X_test[:173]):  # Inspect the first 173 test examples
    print(f"Text: {text}")
    print(f"Probabilities: {y_proba[i]}")  # Probability for each class
    print(f"Predicted class: {nb_model.classes_[np.argmax(y_proba[i])]} \n")

data.head()
"""####Naive Bayes Multinomial"""
|
|
|
|
data.info()
|
|
|
|
data.head()
|
|
|
|
import pandas as pd
|
|
import numpy as np
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.naive_bayes import MultinomialNB
|
|
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
import seaborn as sns
|
|
import matplotlib.pyplot as plt
|
|
|
|
# tfidf
|
|
tfidf = TfidfVectorizer()
|
|
x = tfidf.fit_transform(data['stemming']).toarray()
|
|
y = data['label']
|
|
|
|
#split data
|
|
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
|
|
|
|
# model multinomial
|
|
mnb = MultinomialNB()
|
|
mnb.fit(X_train, y_train)
|
|
|
|
# predict
|
|
y_pred_mnb = mnb.predict(X_test)
|
|
|
|
# evaluasi model
|
|
conf_matrix_mnb = confusion_matrix(y_test, y_pred_mnb)
|
|
class_report_mnb = classification_report(y_test, y_pred_mnb)
|
|
accuracy_mnb = accuracy_score(y_test, y_pred_mnb)
|
|
|
|
print("MultinomialNB Result")
|
|
print("====================")
|
|
print("Confusion Matrix (MultinomialNB):")
|
|
print(conf_matrix_mnb)
|
|
print("===============================")
|
|
print("\nClassification Report (MultinomialNB):")
|
|
print(class_report_mnb)
|
|
print("===============================")
|
|
print("Accuarcy (MultinomialNB): {accuracy_mnb:.4f}")
|
|
print("===============================")
|
|
|
|
# plot confusion matrix
|
|
plt.figure(figsize=(6, 4))
|
|
sns.heatmap(conf_matrix_mnb, annot=True, fmt='d', cmap="Blues", xticklabels=['positif', 'negatif', 'netral'], yticklabels=['positif', 'negatif', 'netral'])
|
|
plt.title('Confusion Matrix (MultinomialNB)')
|
|
plt.xlabel('Predicted')
|
|
plt.ylabel('Actual')
|
|
plt.show()
|
|
|
|
results_mnb = pd.DataFrame({'stemming': data.loc[y_test.index, 'stemming'], 'Actual': y_test, 'Predicted': y_pred_mnb})
|
|
results_mnb.to_csv('/content/drive/MyDrive/Dataset/Hasil_pred_MultinomialNB.csv', encoding='utf8', index=False)
|
|
print("Actual vs Predicted (MultinomialNB):")
|
|
print(f"Accuracy (MultinomialNB): {accuracy_mnb:.4f}")`
|
|
|
|
results_mnb.head()
|
|
|
|
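# Optional: persist the fitted vectorizer and model so new text can be
# classified later without retraining. A minimal sketch (the .joblib paths
# are illustrative):
import joblib

joblib.dump(tfidf, '/content/drive/MyDrive/Dataset/Klasifikasi/tfidf_vectorizer.joblib')
joblib.dump(mnb, '/content/drive/MyDrive/Dataset/Klasifikasi/mnb_model.joblib')

# Reload and classify a new (already pre-processed) sentence
tfidf_loaded = joblib.load('/content/drive/MyDrive/Dataset/Klasifikasi/tfidf_vectorizer.joblib')
mnb_loaded = joblib.load('/content/drive/MyDrive/Dataset/Klasifikasi/mnb_model.joblib')
print(mnb_loaded.predict(tfidf_loaded.transform(['hasil milu bersih jurdil']).toarray()))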
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sentiment_count = results_mnb['Actual'].value_counts()
sns.set_style('whitegrid')

fig, ax = plt.subplots(figsize=(4, 2))
ax = sns.barplot(x=sentiment_count.index, y=sentiment_count.values, palette='pastel')
plt.title('Actual Class Distribution', fontsize=14, pad=20)
plt.xlabel('Class Actual', fontsize=12)
plt.ylabel('Tweet Count', fontsize=12)

total = len(results_mnb['Actual'])

# Annotate each bar with its count and percentage
for i, count in enumerate(sentiment_count.values):
    percentage = f'{100 * count / total:.2f}%'
    ax.text(i, count + 0.10, f'{count}\n({percentage})', ha='center', va='bottom')

plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sentiment_count = results_mnb['Predicted'].value_counts()
sns.set_style('whitegrid')

fig, ax = plt.subplots(figsize=(4, 2))
ax = sns.barplot(x=sentiment_count.index, y=sentiment_count.values, palette='pastel')
plt.title('Predicted Class Distribution', fontsize=14, pad=20)
plt.xlabel('Class Predicted', fontsize=12)
plt.ylabel('Tweet Count', fontsize=12)

total = len(results_mnb['Predicted'])

# Annotate each bar with its count and percentage
for i, count in enumerate(sentiment_count.values):
    percentage = f'{100 * count / total:.2f}%'
    ax.text(i, count + 0.10, f'{count}\n({percentage})', ha='center', va='bottom')

plt.show()
"""#Visualisasi Sentimen"""
|
|
|
|
import pandas as pd
|
|
|
|
# Membaca dataset dari CSV
|
|
data_stemming = pd.read_csv("/content/drive/MyDrive/Dataset/hasil_processing.csv")
|
|
|
|
# Menampilkan beberapa data pertama
|
|
print(data_stemming.head())
|
|
|
|
data_negatif = data_stemming[data_stemming['label'] == -1]
|
|
data_netral = data_stemming[data_stemming['label'] == 0]
|
|
data_positif = data_stemming[data_stemming['label'] == 1]
|
|
|
|
import matplotlib.pyplot as plt
|
|
from wordcloud import WordCloud
|
|
|
|
all_text_s_negatif = ' '.join(word for word in data_negatif['stemming'])
|
|
wordcloud = WordCloud(colormap='Reds', width=1000, height=1000, mode='RGBA', background_color='white').generate(all_text_s_negatif)
|
|
plt.figure(figsize=(9, 6))
|
|
plt.imshow(wordcloud, interpolation='bilinear')
|
|
plt.axis('off')
|
|
plt.title("Visualisasi Kata Negatif")
|
|
plt.margins(x=0, y=0)
|
|
plt.show()
|
|
|
|
import matplotlib.pyplot as plt
|
|
from wordcloud import WordCloud
|
|
|
|
all_text_s1 = ' '.join(word for word in data_positif['stemming'])
|
|
wordcloud = WordCloud(colormap='Blues', width=1000, height=1000, mode='RGBA', background_color='white').generate(all_text_s1)
|
|
plt.figure(figsize=(9, 6))
|
|
plt.imshow(wordcloud, interpolation='bilinear')
|
|
plt.axis('off')
|
|
plt.title("Visualisasi Kata Positif")
|
|
plt.margins(x=0, y=0)
|
|
plt.show()
|
|
|
|
import matplotlib.pyplot as plt
|
|
from wordcloud import WordCloud
|
|
|
|
all_text_s0 = ' '.join(word for word in data_netral['stemming'])
|
|
wordcloud = WordCloud(colormap='Greens', width=1000, height=1000, mode='RGBA', background_color='white').generate(all_text_s0)
|
|
plt.figure(figsize=(9, 6))
|
|
plt.imshow(wordcloud, interpolation='bilinear')
|
|
plt.axis('off')
|
|
plt.title("Visualisasi Kata Netral")
|
|
plt.margins(x=0, y=0)
|
|
plt.show() |