import os
import math
import pickle
import re

import pandas as pd
from flask import Flask, render_template, request, redirect
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix, accuracy_score

# Only preprocess_data is needed from sentimen (it is used for single-sentence
# prediction in /tesmodel); the other helpers are redefined locally below.
from sentimen import preprocess_data

# Indonesian stopword list and Sastrawi stemmer shared by the helpers below
stp = stopwords.words('indonesian')
factory = StemmerFactory()
stemmer = factory.create_stemmer()

app = Flask(__name__)
app.config["TEMPLATES_AUTO_RELOAD"] = True


# Dashboard: show label counts for the uploaded dataset
@app.route('/', methods=['GET', 'POST'])
def index():
    if os.path.exists('uploads/dataset.csv'):
        text = pd.read_csv('uploads/dataset.csv', encoding='latin-1')
        text = text.dropna(axis=0)  # dropna returns a copy, so reassign it
        # Look the counts up by label so the ordering of value_counts()
        # cannot swap the two classes
        counts = text['Labels'].value_counts()
        positif = counts.get('positif', 0)
        negatif = counts.get('negatif', 0)
        total = positif + negatif
        return render_template('index.html', total=total, positif=positif, negatif=negatif)
    else:
        return render_template('index.html')


# Upload data
ALLOWED_EXTENSION = set(['csv'])
app.config['UPLOAD_FOLDER'] = 'uploads'


def df_to_table_rows(df):
    """Convert each DataFrame row into a '<tr>...</tr>' string for the templates."""
    table_rows = []
    for _, row in df.iterrows():
        table_row = "<tr>"
        for value in row:
            table_row += "<td>{}</td>".format(value)
        table_row += "</tr>"
        table_rows.append(table_row)
    return table_rows


def allowed_file(filename):
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSION
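# e.g. allowed_file('data.csv') -> True, allowed_file('data.txt') -> False;
# only the extension is checked here, and the upload handler below renames
# every accepted file anyway.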


@app.route('/uploaddata', methods=['GET', 'POST'])
def uploaddata():
    if request.method == 'GET':
        if os.path.exists('uploads/dataset.csv'):
            # Show only the first 100 rows
            text = pd.read_csv('uploads/dataset.csv', encoding='latin-1').head(100)
            table_rows = df_to_table_rows(text)
            return render_template('uploaddata.html', table_rows=table_rows)
        else:
            return render_template('uploaddata.html')

    elif request.method == 'POST':
        if 'file' not in request.files:
            return redirect(request.url)

        file = request.files['file']

        if file.filename == '':
            return redirect(request.url)

        if file and allowed_file(file.filename):
            # Store every upload under the fixed name the other routes expect
            file.filename = "dataset.csv"
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], file.filename))

        # Reload the data after saving the file
        text = pd.read_csv('uploads/dataset.csv', encoding='latin-1')

        return render_template('uploaddata.html', tables=[text.to_html()])


@app.route('/delete/<int:index>', methods=['GET'])
def delete_data(index):
    if os.path.exists('uploads/dataset.csv'):
        text = pd.read_csv('uploads/dataset.csv', encoding='latin-1')
        text.drop(index=index - 1, inplace=True)  # drop the row matching the selected index
        text.to_csv('uploads/dataset.csv', index=False)  # rewrite the CSV without the dropped row
    return redirect('/uploaddata')


@app.route('/preprocess', methods=['GET', 'POST'])
def preprocess():
    if os.path.exists('uploads/dataset_stemmed.csv'):
        # Read the preprocessed CSV and show only the first 100 rows
        text = pd.read_csv('uploads/dataset_stemmed.csv', encoding='latin-1').head(100)
        table_rows = df_to_table_rows(text)
        return render_template('preprocessing.html', table_rows=table_rows)
    else:
        return render_template('preprocessing.html')


# Preprocessing helpers for newly uploaded data
def remove_punctuation(text):
    # Happy emoticons
    emoticons_happy = set([
        ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
        ':^)', ':-D', ':D', ':d', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
        '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
        'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
        '<3'
    ])

    # Sad emoticons
    emoticons_sad = set([
        ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
        ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
        ':c', ':{', '>:\\', ';('
    ])

    # All emoticons (happy + sad)
    emoticons = emoticons_happy.union(emoticons_sad)

    # Drop emoticon tokens
    text = ' '.join([word for word in text.split() if word not in emoticons])
    # Remove @mentions
    text = re.sub(r'@[\w]*', ' ', text)
    # Remove URLs
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', text)
    text = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', text)
    # Remove leading retweet markers
    text = re.sub(r'^RT[\s]+', ' ', text)
    # Lowercase, then strip punctuation, digits, underscores, and $-tags
    text = text.lower()
    text = re.sub(r'[^\w\s]+', ' ', text)
    text = re.sub(r'[0-9]+', ' ', text)
    text = re.sub(r'_', ' ', text)
    text = re.sub(r'\$\w*', ' ', text)

    return text
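# Illustrative example (assumed input; the exact output follows the regexes above):
#   remove_punctuation("RT @user Mantap :) cek https://t.co/abc harga $100!!")
#   -> roughly " mantap  cek   harga  " (mention, emoticon, URL, digits, and
#      punctuation replaced by spaces, everything lowercased).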


# Cache for the slang-to-formal dictionary loaded by normalize_text, so the
# mapping CSV is read once instead of on every call
_slang_formal_dict = None


def normalize_text(text):
    global _slang_formal_dict
    if _slang_formal_dict is None:
        # Load the slang-formal mapping data
        slang_formal_data = pd.read_csv('slang_formal_mapping.csv')
        # Create a dictionary from slang to formal words
        _slang_formal_dict = dict(zip(slang_formal_data['slang'], slang_formal_data['formal']))

    # Convert text to string if it's not already a string
    text = str(text)

    # Normalize each word using the slang-formal dictionary
    words = text.split()
    normalized_words = [_slang_formal_dict.get(word, word) for word in words]

    # Join normalized words back into text
    return ' '.join(normalized_words)
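# Illustrative example (assumes slang_formal_mapping.csv maps e.g. 'gak' -> 'tidak'):
#   normalize_text("gak tahu") -> "tidak tahu"
# Words missing from the mapping pass through unchanged.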


def remove_stopwords(text):
    # Expects a token list; keeps the tokens that are not Indonesian stopwords
    return [word for word in text if word not in stp]


def stem_text(text):
    # Expects a token list; stems each token with Sastrawi and rejoins the text
    text = ' '.join([stemmer.stem(word) for word in text])
    return text
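# Illustrative example (Sastrawi strips Indonesian affixes, e.g. 'makanan' -> 'makan'):
#   stem_text(remove_stopwords(['saya', 'suka', 'makanan'])) -> roughly 'suka makan',
# assuming 'saya' appears in the NLTK Indonesian stopword list.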


@app.route('/preprocessing', methods=['GET', 'POST'])
def preprocessing():
    # Read the raw data from the uploaded CSV
    text = pd.read_csv('uploads/dataset.csv', encoding='latin-1')

    # Clean the text and save the intermediate result
    text['Text'] = text['Text'].apply(lambda x: remove_punctuation(x))
    text.to_csv('uploads/dataset_clear.csv', index=False)

    # Re-read the cleaned data
    text = pd.read_csv('uploads/dataset_clear.csv', encoding='latin-1')

    # Normalize slang to formal words
    text['Normalisasi'] = text['Text'].apply(normalize_text)

    # Tokenize the normalized text into a new 'Tokenized_Text' column
    text['Tokenized_Text'] = text['Normalisasi'].apply(lambda x: word_tokenize(str(x)))

    # Remove stopwords
    text['remove_stopwords'] = text['Tokenized_Text'].apply(remove_stopwords)

    # Stem the remaining tokens with Sastrawi
    text['Stemmed_Text'] = text['remove_stopwords'].apply(stem_text)

    text.drop(['remove_stopwords'], axis=1, inplace=True)

    # Save the stemmed result to CSV
    text.to_csv('uploads/dataset_stemmed.csv', index=False)

    # Build display rows from the first 100 stemmed rows
    text = pd.read_csv('uploads/dataset_stemmed.csv', encoding='latin-1').head(100)
    table_rows = df_to_table_rows(text)

    # Keep only the original and final columns in the compact export
    text.drop(['Normalisasi', 'Tokenized_Text'], axis=1, inplace=True)
    text.to_csv('uploads/stem.csv', index=False)

    return render_template('preprocessing.html', table_rows=table_rows)
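# Sketch of the pipeline above on one assumed row (outputs depend on the
# mapping file and stopword list):
#   "RT @user Gak suka bgt makanan ini :("
#     --remove_punctuation--> "gak suka bgt makanan ini"
#     --normalize_text-->     "tidak suka banget makanan ini"
#     --tokenize/stopwords/stem--> e.g. "suka banget makan"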


@app.route('/tfidfpage', methods=['GET', 'POST'])
def tfidfpage():
    text_df = pd.read_csv('uploads/dataset_stemmed.csv', encoding='latin-1')

    # Drop rows with NaN values
    text_df.dropna(axis=0, inplace=True)

    # Take the 'Stemmed_Text' column as the list of documents
    texts = text_df['Stemmed_Text'].tolist()

    # Compute TF-IDF for the available documents
    tfidf_dict = calculate_tfidf(texts)

    # Send the TF-IDF results to the HTML template
    return render_template('tfidf.html', tfidf_dict=tfidf_dict, total=len(texts))


def calculate_tfidf(texts):
    # Term Frequency (TF) for every term in every document
    tf_dict = {}
    doc_terms = {}  # all terms per document, kept for display
    for idx, text in enumerate(texts):
        terms = text.split()     # split the sentence into words
        term_count = len(terms)  # number of words in this document
        doc_terms[idx] = terms
        for term in terms:
            if term not in tf_dict:
                tf_dict[term] = {}
            if idx not in tf_dict[term]:
                tf_dict[term][idx] = 0
            tf_dict[term][idx] += 1 / term_count  # TF = count / document length

    # Inverse Document Frequency (IDF) for every term
    doc_count = len(texts)
    idf_dict = {}
    for term in tf_dict:
        doc_freq = len(tf_dict[term])
        idf_dict[term] = math.log(doc_count / (doc_freq + 1))  # smoothed IDF

    # TF-IDF for every term in every document, keyed by the document's terms
    tfidf_dict = {}
    for term in tf_dict:
        tfidf_dict[term] = {}
        for doc_idx in tf_dict[term]:
            doc_terms_str = ', '.join(doc_terms[doc_idx])  # join the terms into one display string
            if doc_terms_str not in tfidf_dict[term]:
                tfidf_dict[term][doc_terms_str] = 0
            tfidf_dict[term][doc_terms_str] += tf_dict[term][doc_idx] * idf_dict[term]

    return tfidf_dict
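# Worked example for the formulas above (illustrative): with
# texts = ["suka makan", "suka"], the term 'makan' appears once in a 2-word
# document, so TF = 1/2; it occurs in 1 of 2 documents, so
# IDF = log(2 / (1 + 1)) = 0, giving TF-IDF = 0. Note that with the +1
# smoothing in the denominator, a term present in every document gets a
# negative IDF.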


# Built-in TF-IDF (scikit-learn): label mapping + train/test split shared by
# the /tfidf and /klasifikasisvm routes
def data(text):
    # Map string labels to integers
    text['Labels'] = text['Labels'].map({'positif': 1, 'negatif': 0})
    X = text['Stemmed_Text']
    y = text['Labels']

    # Hold out 20% of the rows as a test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test
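# e.g. with 1000 labelled rows, test_size=0.2 leaves 800 rows for training and
# 200 for testing; random_state=42 makes the split reproducible across requests.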


@app.route('/tfidf', methods=['GET', 'POST'])
def tfidf():
    # Remove any previously saved vectorizer before refitting
    model_path = 'uploads/vectorizer.model'
    if os.path.exists(model_path):
        os.remove(model_path)

    text = pd.read_csv('uploads/dataset_stemmed.csv', encoding='latin-1')
    text.dropna(axis=0, inplace=True)  # drop np.nan rows

    X_train, X_test, y_train, y_test = data(text)

    # Fit the TF-IDF vectorizer on the training split, then transform both splits
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    # Save the fitted vectorizer to disk
    pickle.dump(vectorizer, open('uploads/vectorizer.model', 'wb'))

    # Re-read the stemmed data for the manual TF-IDF display
    text_df = pd.read_csv('uploads/dataset_stemmed.csv', encoding='latin-1')
    text_df.dropna(axis=0, inplace=True)

    # Take the 'Stemmed_Text' column as the list of documents
    texts = text_df['Stemmed_Text'].tolist()

    # Compute TF-IDF for the available documents and send it to the template
    tfidf_dict = calculate_tfidf(texts)
    return render_template('tfidf.html', tfidf_dict=tfidf_dict, total=len(texts))


@app.route('/klasifikasisvm1', methods=['GET', 'POST'])
def klasifikasisvm1():
    return render_template('klasifikasisvm.html')


@app.route('/klasifikasisvm', methods=['GET', 'POST'])
def klasifikasisvm():
    # Remove stale models before retraining
    linear_path = 'uploads/linear.model'
    if os.path.exists(linear_path):
        os.remove(linear_path)
    rbf_path = 'uploads/rbf.model'
    if os.path.exists(rbf_path):
        os.remove(rbf_path)

    # Load the fitted vectorizer so train and test share one vocabulary
    vectorizer = pickle.load(open('uploads/vectorizer.model', 'rb'))

    text = pd.read_csv('uploads/dataset_stemmed.csv', encoding='latin-1')
    text.dropna(axis=0, inplace=True)  # drop np.nan rows

    X_train, X_test, y_train, y_test = data(text)

    # Vectorize the cleaned text with the saved TF-IDF vectorizer
    X_train = vectorizer.transform(X_train)
    X_test = vectorizer.transform(X_test)

    # Train the SVM classifier with an RBF kernel
    rbf = SVC(kernel="rbf")
    rbf.fit(X_train, y_train)
    rbf_pred = rbf.predict(X_test)

    # Save the trained model to disk
    pickle.dump(rbf, open('uploads/rbf.model', 'wb'))

    # Evaluation metrics for the RBF kernel, as rounded percentages
    f1_score_rbf = round(f1_score(y_test, rbf_pred) * 100)
    accuracy_score_rbf = round(accuracy_score(y_test, rbf_pred) * 100)
    precision_score_rbf = round(precision_score(y_test, rbf_pred) * 100)
    recall_score_rbf = round(recall_score(y_test, rbf_pred) * 100)
    tn_rbf, fp_rbf, fn_rbf, tp_rbf = confusion_matrix(y_test, rbf_pred).ravel()

    return render_template('klasifikasisvm.html', f1_score_rbf=f1_score_rbf,
                           accuracy_score_rbf=accuracy_score_rbf,
                           precision_score_rbf=precision_score_rbf,
                           recall_score_rbf=recall_score_rbf,
                           tn_rbf=tn_rbf, fp_rbf=fp_rbf, fn_rbf=fn_rbf, tp_rbf=tp_rbf)
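# For reference, the metrics above derive from the confusion matrix counts:
#   accuracy  = (tp + tn) / (tp + tn + fp + fn)
#   precision = tp / (tp + fp)
#   recall    = tp / (tp + fn)
#   f1        = 2 * precision * recall / (precision + recall)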


def load_results():
    """Read results.txt (one tab-separated record per line) into a list of
    dicts, newest first. Returns an empty list before the first prediction."""
    results = []
    if os.path.exists('results.txt'):
        with open('results.txt', 'r') as file:
            for line in file:
                original_text, preprocessed_text, sentiment = line.strip().split('\t')
                results.append({'original_text': original_text,
                                'preprocessed_text': preprocessed_text,
                                'sentiment': sentiment})
    results.reverse()
    return results


@app.route('/tesmodel1', methods=['GET', 'POST'])
def tesmodel1():
    return render_template('tesmodel.html', results=load_results())


@app.route('/tesmodel', methods=['GET', 'POST'])
def tesmodel():
    # Load the saved model and vectorizer
    model = pickle.load(open('uploads/rbf.model', 'rb'))
    vectorizer = pickle.load(open('uploads/vectorizer.model', 'rb'))

    text = request.form['text']
    original_text = text

    # Preprocess the input sentence, then vectorize it
    hasilprepro = preprocess_data(text)
    hasiltfidf = vectorizer.transform([hasilprepro])

    # Predict the sentiment of the sentence (predict returns an array)
    hasilsvm = model.predict(hasiltfidf)
    if hasilsvm[0] == 0:
        hasilsvm = 'NEGATIF'
    else:
        hasilsvm = 'POSITIF'

    # Append the result to a tab-separated text file
    with open('results.txt', 'a') as file:
        file.write(f"{original_text}\t{hasilprepro}\t{hasilsvm}\n")

    # Re-read the results file and pass its contents to the template
    return render_template('tesmodel.html', results=load_results())


if __name__ == "__main__":
    # Start the development server; this must come after the route
    # definitions above so they are all registered before app.run() blocks.
    app.run(debug=True)