# Source listing copied from a file viewer: 425 lines, 17 KiB, plaintext.
from flask import Flask, render_template, url_for, request, flash
|
|
import tweepy
|
|
import re, string, csv, pickle, os
|
|
from os.path import join, dirname, realpath
|
|
import pandas as pd
|
|
import numpy as np
|
|
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
|
|
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
|
|
import nltk
|
|
from nltk.corpus import stopwords
|
|
from nltk.tokenize import word_tokenize
|
|
from googletrans import Translator
|
|
from textblob import TextBlob
|
|
from sklearn.metrics import accuracy_score, precision_score, recall_score
|
|
from sklearn.metrics import classification_report
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.metrics import confusion_matrix
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from PIL import Image
|
|
import urllib.request
|
|
import matplotlib.pyplot as plt
|
|
from wordcloud import WordCloud
|
|
from sklearn.naive_bayes import MultinomialNB
|
|
# Download the NLTK resources this module uses: the 'punkt' tokenizer models
# and the 'stopwords' corpus. Runs once, when the module is imported.
for _nltk_resource in ('punkt', 'stopwords'):
    nltk.download(_nltk_resource)
|
|
|
|
|
|
# Twitter preprocessing
# Rows produced by the most recent preprocessing run (one list per tweet).
hasil_preprocessing = []


def preprocessing_twitter():
    """Preprocess the scraped tweets and write every stage to a CSV file.

    Reads ``static/files/Data Scraping.csv`` and, for each row, takes the
    first three columns as date, username and tweet text.  The tweet then
    goes through cleansing, case folding, normalisation, tokenizing, stopword
    removal, stemming and translation to English.  Every result row is
    appended to the module-level ``hasil_preprocessing`` list and written to
    ``static/files/Data Preprocessing.csv`` under a header row.

    When translation fails for a row, the translated column holds the
    literal text ``"Terjemahan gagal"``.  A success message is flashed once
    all rows are processed.
    """
    translator = Translator()
    hasil_preprocessing.clear()

    # These helpers do not depend on the row, so build them once instead of
    # once per tweet.
    all_stopwords = StopWordRemoverFactory().get_stop_words() + ['tidak']
    stopword_remover = StopWordRemover(ArrayDictionary(all_stopwords))
    stemmer = StemmerFactory().create_stemmer()

    # Prepare the CSV file that collects the results.
    with open('static/files/Data Preprocessing.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        # Write the CSV header.
        writer.writerow([
            'Tanggal', 'Username', 'Tweet',
            'Cleansing', 'Case Folding', 'Normalisasi',
            'Tokenizing', 'Stopword', 'Stemming',
            'Translate',
        ])

        # Read the input rows from the scraping file.
        # NOTE(review): no header row is skipped here, so if the scraping file
        # carries a header it is processed like a tweet - confirm the format.
        with open("static/files/Data Scraping.csv", "r", encoding='utf-8') as csvfile:
            for row in csv.reader(csvfile, delimiter=','):
                # Blank or truncated lines cannot supply the three columns.
                if len(row) < 3:
                    continue

                tanggal, username, tweet = row[0], row[1], row[2]

                # --- CLEANSING ---
                clean = tweet
                clean = re.sub(r'@[A-Za-z0-9_]+', '', clean)   # drop mentions
                clean = re.sub(r'#\w+', '', clean)             # drop hashtags
                clean = re.sub(r'RT[\s]+', '', clean)          # drop the RT marker
                clean = re.sub(r'https?://\S+', '', clean)     # drop links
                clean = re.sub(r'[^A-Za-z0-9 ]', '', clean)    # keep alphanumerics and spaces only
                clean = re.sub(r'\s+', ' ', clean).strip()     # collapse repeated whitespace

                # --- CASE FOLDING ---
                casefold = clean.casefold()

                # --- NORMALISATION ---
                normalized_text = normalize_text(casefold)

                # --- TOKENIZING ---
                tokenizing = nltk.tokenize.word_tokenize(normalized_text)

                # --- STOPWORD REMOVAL ---
                stopword_removed = nltk.tokenize.word_tokenize(
                    stopword_remover.remove(normalized_text)
                )

                # --- STEMMING ---
                stemming = stemmer.stem(' '.join(stopword_removed))

                # --- TRANSLATE ---
                # Best effort: a failed translation must not abort the run,
                # but interpreter-level exits are no longer swallowed.
                try:
                    translated_text = translator.translate(stemming, dest='en').text
                except Exception:
                    translated_text = "Terjemahan gagal"

                # --- STORE ALL RESULTS ---
                tweets = [
                    tanggal, username, tweet,
                    clean, casefold, normalized_text,
                    tokenizing, stopword_removed, stemming,
                    translated_text,
                ]
                hasil_preprocessing.append(tweets)
                writer.writerow(tweets)

    flash('Preprocessing Berhasil', 'preprocessing_data')
|
|
|
|
# Ordered (non-standard, standard) replacement table.  A leading or trailing
# space in the first element means "a token boundary is required on that
# side"; an entry without one may match inside a longer word, as before.
_NORMALIZATION_RULES = (
    ('sdh', 'sudah'),
    (' yg ', ' yang '),
    (' nggak ', ' tidak '),
    (' gak ', ' tidak '),
    (' bangetdari ', ' banget dari '),
    ('vibes ', 'suasana '),
    ('mantab ', 'mantap '),
    (' benarsetuju ', ' benar setuju '),
    (' ganjarmahfud ', ' ganjar mahfud '),
    (' stylish ', ' bergaya '),
    (' ngapusi ', ' bohong '),
    (' gede ', ' besar '),
    (' all in ', ' yakin '),
    (' blokkkkk ', ' goblok '),
    (' blokkkk ', ' goblok '),
    (' blokkk ', ' goblok '),
    (' blokk ', ' goblok '),
    (' blok ', ' goblok '),
    (' ri ', ' republik indonesia '),
    (' kem3nangan ', ' kemenangan '),
    (' sat set ', ' cepat '),
    (' ala ', ' dari '),
    (' best ', ' terbaik '),
    (' bgttt ', ' banget '),
    (' gue ', ' saya '),
    (' hrs ', ' harus '),
    (' fixed ', ' tetap '),
    (' blom ', ' belum '),
    (' aing ', ' aku '),
    (' tehnologi ', ' teknologi '),
    (' jd ', ' jadi '),
    (' dg ', ' dengan '),
    (' kudu ', ' harus '),
    (' jk ', ' jika '),
    (' iru ', ' itu '),
    (' duit ', ' uang '),
    (' duid ', ' uang '),
    (' bgsd ', ' bangsat '),
    (' jt ', ' juta '),
    (' stop ', ' berhenti '),
    (' ngeri ', ' seram '),
    (' turu ', ' tidur '),
    (' early ', ' awal '),
    (' pertamna ', ' pertamina '),
    (' mnurut ', ' menurut '),
    (' trus ', ' terus '),
    (' msh ', ' masih '),
    (' simple ', ' mudah '),
    (' worth ', ' layak '),
    ('problem ', 'masalah '),
    (' hny ', ' hanya '),
    (' dn ', ' dan '),
    (' jln ', ' jalan '),
    (' bgt ', ' banget '),
    (' ga ', ' tidak '),
    (' text ', ' teks '),
    (' end ', ' selesai '),
    (' kelen ', ' kalian '),
    (' tuk ', ' untuk '),
    (' kk ', ' kakak '),
)


def _compile_normalization_rule(slang, standard):
    """Turn one table entry into a ``(compiled pattern, replacement)`` pair.

    A boundary space becomes a lookaround that accepts either a space or the
    edge of the string.  The space itself is not consumed, so words at the
    very start or end of the text and adjacent repeats are all matched.
    """
    prefix = r'(?<![^ ])' if slang.startswith(' ') else ''
    suffix = r'(?![^ ])' if slang.endswith(' ') else ''
    pattern = re.compile(prefix + re.escape(slang.strip(' ')) + suffix)
    return pattern, standard.strip(' ')


_COMPILED_NORMALIZATION_RULES = tuple(
    _compile_normalization_rule(slang, standard)
    for slang, standard in _NORMALIZATION_RULES
)


def normalize_text(text):
    """Replace non-standard words in ``text`` with their standard form.

    The rules in ``_NORMALIZATION_RULES`` are applied one after another in
    table order.  Whole-word rules also fire at the start and end of the
    string, which matters because callers pass stripped text.

    Args:
        text: Space-separated text; the rules themselves are all lower case.

    Returns:
        The text with every matching word or phrase replaced.
    """
    for pattern, replacement in _COMPILED_NORMALIZATION_RULES:
        text = pattern.sub(replacement, text)
    return text
|
|
|
|
# 5-class labeling
# Rows produced by the most recent labeling run (one list per tweet).
hasil_labeling = []


def _label_from_polarity(score):
    """Map a polarity score to one of the five support labels."""
    if score >= 0.6:
        return 'Sangat Mendukung'
    if score >= 0.2:
        return 'Mendukung'
    if score > -0.2:
        return 'Netral'
    if score > -0.6:
        return 'Tidak Mendukung'
    return 'Sangat Tidak Mendukung'


def labeling_twitter():
    """Label every preprocessed tweet with one of five sentiment classes.

    Reads ``static/files/Data Preprocessing.csv``, skips its header, and
    scores column 9 (the translated text) with TextBlob's polarity.  For each
    row the date, username, original tweet, stemmed text, translation and
    label are appended to the module-level ``hasil_labeling`` list and
    written to ``static/files/Data Labeling.csv``.

    A success message is flashed once all rows are processed.
    """
    hasil_labeling.clear()

    with open("static/files/Data Preprocessing.csv", "r", encoding='utf-8') as csvfile, \
            open('static/files/Data Labeling.csv', 'w', newline='', encoding='utf-8') as file:
        readCSV = csv.reader(csvfile, delimiter=',')
        # Skip the CSV header; the default keeps an empty file from raising
        # StopIteration.
        next(readCSV, None)

        writer = csv.writer(file)
        # Header of the labeling result file.
        writer.writerow(['Tanggal', 'Username', 'Tweet', 'Stemming', 'Translate', 'Label'])

        for row in readCSV:
            # Columns 0-2, 8 and 9 are read below; skip rows too short to
            # provide them.
            if len(row) < 10:
                continue

            tanggal = row[0]
            username = row[1]
            tweet_asli = row[2]
            stemming = row[8]
            translated = row[9]  # translation result

            try:
                score = TextBlob(translated).sentiment.polarity
            except Exception:
                score = 0.0  # on failure, treat the tweet as neutral

            hasil = [
                tanggal, username, tweet_asli,
                stemming, translated, _label_from_polarity(score),
            ]
            hasil_labeling.append(hasil)
            writer.writerow(hasil)

    flash('Labeling 5 Kelas Berhasil', 'labeling_data')
|
|
|
|
# Classification

# Evaluation tables from the latest run: df holds the confusion matrix and
# df2 the classification report. Both stay None until a run completes.
df = None
df2 = None

# Accuracy of the latest run in percent; 0 before any run.
akurasi = 0


def proses_klasifikasi():
    """Train and evaluate a Multinomial Naive Bayes model on labeled tweets.

    Reads ``static/files/Data Labeling.csv`` (header skipped), using column 4
    (translation) as text and column 5 as label.  Rows whose translation is
    "terjemahan gagal" (case-insensitive) are dropped.  The texts are
    TF-IDF vectorised, split 80/20 with ``random_state=42``, and used to fit
    and evaluate the classifier.

    Files written under ``static/files``: ``Data Klasifikasi.csv``
    (classification report), ``Data Confusion Matrix.csv``, ``vec.pkl``,
    ``tfidf.pkl``, ``model.pkl`` and ``wordcloud.png``.  The word-cloud mask
    is downloaded to ``circle.png`` in the working directory.

    Updates the module-level ``df``, ``df2`` and ``akurasi`` and flashes a
    success message at the end.
    """
    global df, df2, akurasi

    tweet = []
    y = []

    # Read the labeled data.
    with open("static/files/Data Labeling.csv", encoding='utf-8') as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        next(readCSV)  # skip the header

        for row in readCSV:
            tweet_text = row[4]  # Translate column
            label = row[5]       # Label column

            # Drop rows whose translation failed.
            if tweet_text.lower() != "terjemahan gagal":
                tweet.append(tweet_text)
                y.append(label)

    # TF-IDF
    vectorizer = TfidfVectorizer()
    x = vectorizer.fit_transform(tweet)

    # Split into training and testing data.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=42
    )

    # Naive Bayes
    clf = MultinomialNB()
    clf.fit(x_train, y_train)
    predict = clf.predict(x_test)

    # Save the classification report as CSV.
    report = classification_report(y_test, predict, output_dict=True)
    clsf_report = pd.DataFrame(report).transpose()
    clsf_report.to_csv('static/files/Data Klasifikasi.csv', index=True)

    # Save the vectorizer, the TF-IDF matrix and the model, closing each
    # file handle once it has been written.
    artefacts = (
        ('static/files/vec.pkl', vectorizer),
        ('static/files/tfidf.pkl', x),
        ('static/files/model.pkl', clf),
    )
    for path, obj in artefacts:
        with open(path, 'wb') as handle:
            pickle.dump(obj, handle)

    # Confusion matrix.  scikit-learn puts the true labels on the rows and
    # the predicted labels on the columns, so the axes are named to match.
    unique_label = np.unique([y_test, predict])
    cmtx = pd.DataFrame(
        confusion_matrix(y_test, predict, labels=unique_label),
        index=['true:{:}'.format(name) for name in unique_label],
        columns=['pred:{:}'.format(name) for name in unique_label],
    )
    cmtx.to_csv('static/files/Data Confusion Matrix.csv', index=True)

    # Read the evaluation results back for display.
    df = pd.read_csv('static/files/Data Confusion Matrix.csv', sep=",")
    df.rename(columns={'Unnamed: 0': ''}, inplace=True)

    df2 = pd.read_csv('static/files/Data Klasifikasi.csv', sep=",")
    df2.rename(columns={'Unnamed: 0': ''}, inplace=True)

    # Compute the accuracy.
    akurasi = round(accuracy_score(y_test, predict) * 100, 2)

    # Word cloud.  Join with a space so the last word of one tweet is not
    # fused with the first word of the next.
    kalimat = " ".join(tweet)
    urllib.request.urlretrieve(
        "https://firebasestorage.googleapis.com/v0/b/sentimen-97d49.appspot.com/o/Circle-icon.png?alt=media&token=b9647ca7-dfdb-46cd-80a9-cfcaa45a1ee4",
        'circle.png')
    mask = np.array(Image.open("circle.png"))
    wordcloud = WordCloud(width=1600, height=800,
                          max_font_size=200, background_color='white', mask=mask)
    wordcloud.generate(kalimat)

    figure = plt.figure(figsize=(12, 10))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.savefig('static/files/wordcloud.png')
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(figure)

    flash('Klasifikasi Berhasil', 'klasifikasi_data')
|
|
|
|
|
|
app = Flask(__name__)
# NOTE(security): the secret key can now be supplied through the SECRET_KEY
# environment variable. The previous hard-coded value remains the fallback so
# existing setups keep working, but it should not be relied on in deployment.
app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', 'farez')


# Upload folder and the file extensions accepted for upload.
UPLOAD_FOLDER = 'static/files'
ALLOWED_EXTENSION = {'csv'}
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
|
|
|
|
def allowed_file(filename):
    """Return True when ``filename`` has an extension in ``ALLOWED_EXTENSION``.

    The extension is the text after the last dot, compared in lower case.
    A name without any dot is rejected.
    """
    _, dot, extension = filename.rpartition('.')
    if not dot:
        return False
    return extension.lower() in ALLOWED_EXTENSION
|
|
|
|
@app.route('/')
def index():
    """Render the ``index.html`` template for the root URL."""
    halaman = render_template('index.html')
    return halaman
|
|
|
|
@app.route('/preprocessing', methods=['GET', 'POST'])
def preprocessing():
    """Handle the preprocessing page.

    POST with ``upload == 'Upload Data'`` validates the uploaded file and
    stores it as ``Data Scraping.csv`` in the upload folder.  POST with
    ``preprocess == 'Preprocessing Data'`` runs ``preprocessing_twitter``.
    Every other request renders the page with the current
    ``hasil_preprocessing`` rows.
    """
    if request.method == 'POST':
        if request.form.get('upload') == 'Upload Data':
            hasil_preprocessing.clear()

            # Check for a missing or empty upload first: .get() avoids the
            # error that indexing request.files raises when the part is
            # absent, and an empty filename is reported as "empty" rather
            # than "wrong format".
            file = request.files.get('file')
            if file is None or file.filename == '':
                flash('File tidak boleh kosong', 'upload_gagal')
                return render_template('preprocessing.html', value=hasil_preprocessing)

            if not allowed_file(file.filename):
                flash('Format file tidak diperbolehkan', 'upload_gagal')
                return render_template('preprocessing.html', value=hasil_preprocessing)

            # Always store under the fixed name the pipeline reads from.
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], "Data Scraping.csv"))
            flash('File Berhasil di upload', 'upload_berhasil')
            return render_template('preprocessing.html')

        if request.form.get('preprocess') == 'Preprocessing Data':
            preprocessing_twitter()
            return render_template('preprocessing.html', value=hasil_preprocessing)

    return render_template('preprocessing.html', value=hasil_preprocessing)
|
|
|
|
@app.route('/labeling', methods=['GET', 'POST'])
def labeling():
    """Handle the labeling page.

    POST with ``upload == 'Upload Data'`` validates the uploaded file and
    stores it as ``Data Preprocessing.csv`` in the upload folder.  POST with
    ``labeling == 'Labeling Data'`` runs ``labeling_twitter``.  Every other
    request renders the page with the current ``hasil_labeling`` rows.
    """
    if request.method == 'POST':
        if request.form.get('upload') == 'Upload Data':
            hasil_labeling.clear()

            # Check for a missing or empty upload first: .get() avoids the
            # error that indexing request.files raises when the part is
            # absent, and an empty filename is reported as "empty" rather
            # than "wrong format".
            file = request.files.get('file')
            if file is None or file.filename == '':
                flash('File tidak boleh kosong', 'upload_gagal')
                return render_template('labeling.html', value=hasil_labeling)

            if not allowed_file(file.filename):
                flash('Format file tidak diperbolehkan', 'upload_gagal')
                return render_template('labeling.html', value=hasil_labeling)

            # Always store under the fixed name the labeling step reads from.
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], "Data Preprocessing.csv"))
            flash('File Berhasil di upload', 'upload_berhasil')
            return render_template('labeling.html')

        if request.form.get('labeling') == 'Labeling Data':
            labeling_twitter()
            return render_template('labeling.html', value=hasil_labeling)

    return render_template('labeling.html', value=hasil_labeling)
|
|
|
|
def _render_hasil_klasifikasi():
    """Render the classification page with the accuracy and both result tables."""
    return render_template(
        'klasifikasi.html',
        accuracy=akurasi,
        tables=[df.to_html(classes='table table-bordered', index=False, justify='left')],
        titles=df.columns.values,
        tables2=[df2.to_html(classes='table table-bordered', index=False, justify='left')],
        titles2=df2.columns.values,
    )


@app.route('/klasifikasi', methods=['GET', 'POST'])
def klasifikasi():
    """Handle the classification page.

    POST with ``upload == 'Upload Data'`` validates the uploaded file and
    stores it as ``Data Labeling.csv`` in the upload folder.  POST with
    ``klasifikasi == 'Klasifikasi Data'`` runs ``proses_klasifikasi`` and
    shows its results.  Every other request shows the latest results, or
    the bare page when no run has produced any yet.
    """
    if request.method == 'POST':
        if request.form.get('upload') == 'Upload Data':
            # Check for a missing or empty upload first: .get() avoids the
            # error that indexing request.files raises when the part is
            # absent, and an empty filename is reported as "empty" rather
            # than "wrong format".
            file = request.files.get('file')
            if file is None or file.filename == '':
                flash('File tidak boleh kosong', 'upload_gagal')
                return render_template('klasifikasi.html')

            if not allowed_file(file.filename):
                flash('Format file tidak diperbolehkan', 'upload_gagal')
                return render_template('klasifikasi.html')

            # Always store under the fixed name the classifier reads from.
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], "Data Labeling.csv"))
            flash('File Berhasil di upload', 'upload_berhasil')
            return render_template('klasifikasi.html')

        if request.form.get('klasifikasi') == 'Klasifikasi Data':
            proses_klasifikasi()
            return _render_hasil_klasifikasi()

    # "No results yet" is detected from the tables themselves, so a run that
    # genuinely scored 0 % accuracy is still displayed.
    if df is None or df2 is None:
        return render_template('klasifikasi.html')
    return _render_hasil_klasifikasi()
|
|
|
|
@app.route('/visualisasi')
def visualisasi():
    """Render the ``visualisasi.html`` template."""
    halaman = render_template('visualisasi.html')
    return halaman
|
|
|
|
if __name__ == "__main__":
    # NOTE(security): debug mode enables the interactive debugger and must not
    # be exposed publicly. It stays on by default, as before; set
    # FLASK_DEBUG=0 in the environment to turn it off.
    app.run(debug=os.environ.get('FLASK_DEBUG', '1') != '0')