TIF_NGANJUK_E41212241/app.py

471 lines
24 KiB
Python

import joblib
import pandas as pd
import matplotlib.pyplot as plt
import io
import base64
from flask import Flask, render_template, request
from sklearn.metrics import accuracy_score
import nltk
from nltk.tokenize import sent_tokenize
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from naive_bayes import NaiveBayes
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
import numpy as np
app = Flask(__name__)
# Load models, vectorizers, and selectors for both balanced and unbalanced
instagram_model_unbalanced = joblib.load('models/unbalancing/Instagram_model (1).joblib')
instagram_vectorizer_unbalanced = joblib.load('models/unbalancing/Instagram_vectorizer (1).joblib')
instagram_selector_unbalanced = joblib.load('models/unbalancing/Instagram_selector (1).joblib')
instagram_model_balanced = joblib.load('models/balancing/Instagram_model (2).joblib')
instagram_vectorizer_balanced = joblib.load('models/balancing/Instagram_vectorizer (2).joblib')
instagram_selector_balanced = joblib.load('models/balancing/Instagram_selector (2).joblib')
twitter_model_unbalanced = joblib.load('models/unbalancing/Twitter_model (1).joblib')
twitter_vectorizer_unbalanced = joblib.load('models/unbalancing/Twitter_vectorizer (1).joblib')
twitter_selector_unbalanced = joblib.load('models/unbalancing/Twitter_selector (1).joblib')
twitter_model_balanced = joblib.load('models/balancing/Twitter_model (2).joblib')
twitter_vectorizer_balanced = joblib.load('models/balancing/Twitter_vectorizer (2).joblib')
twitter_selector_balanced = joblib.load('models/balancing/Twitter_selector (2).joblib')
# Function to preprocess and predict sentiment for Instagram
def predict_instagram_comment(comment, balanced=False):
if balanced:
comment_vect = instagram_vectorizer_balanced.transform([comment])
comment_selected = instagram_selector_balanced.transform(comment_vect)
prediction = instagram_model_balanced.predict(comment_selected)
else:
comment_vect = instagram_vectorizer_unbalanced.transform([comment])
comment_selected = instagram_selector_unbalanced.transform(comment_vect)
prediction = instagram_model_unbalanced.predict(comment_selected)
return prediction[0]
# Function to preprocess and predict sentiment for Twitter
def predict_twitter_comment(comment, balanced=False):
if balanced:
comment_vect = twitter_vectorizer_balanced.transform([comment])
comment_selected = twitter_selector_balanced.transform(comment_vect)
prediction = twitter_model_balanced.predict(comment_selected)
else:
comment_vect = twitter_vectorizer_unbalanced.transform([comment])
comment_selected = twitter_selector_unbalanced.transform(comment_vect)
prediction = twitter_model_unbalanced.predict(comment_selected)
return prediction[0]
# Route for home page
@app.route('/')
def home():
return render_template('index.html')
# Route for prediction
@app.route('/predict', methods=['GET', 'POST'])
def predict():
if request.method == 'POST':
comment = request.form['comment']
platform = request.form['platform']
balanced = 'balanced' in request.form
# Cek apakah komentar kosong
if not comment:
error_message = "Komentar tidak boleh kosong!"
return render_template('predict.html', error_message=error_message)
if platform == 'Instagram':
sentiment = predict_instagram_comment(comment, balanced)
else:
sentiment = predict_twitter_comment(comment, balanced)
return render_template('predict.html', result=sentiment)
return render_template('predict.html')
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
@app.route('/accuracy')
def accuracy():
# Load the testing data for both balanced and unbalanced
instagram_test_balanced = pd.read_csv('data/balancing/instagram/testing_data.csv')
instagram_train_balanced = pd.read_csv('data/balancing/instagram/training_data.csv')
instagram_test_unbalanced = pd.read_csv('data/unbalancing/instagram/testing_data.csv')
instagram_train_unbalanced = pd.read_csv('data/unbalancing/instagram/training_data.csv')
twitter_test_balanced = pd.read_csv('data/balancing/twitter/testing_data.csv')
twitter_train_balanced = pd.read_csv('data/balancing/twitter/training_data.csv')
twitter_test_unbalanced = pd.read_csv('data/unbalancing/twitter/testing_data.csv')
twitter_train_unbalanced = pd.read_csv('data/unbalancing/twitter/training_data.csv')
# Instagram Metrics (Unbalanced)
instagram_test_vect_unbalanced = instagram_vectorizer_unbalanced.transform(instagram_test_unbalanced['preprocessing_comment'])
instagram_test_selected_unbalanced = instagram_selector_unbalanced.transform(instagram_test_vect_unbalanced)
instagram_predictions_unbalanced = instagram_model_unbalanced.predict(instagram_test_selected_unbalanced)
# Calculate Metrics for Instagram (Unbalanced)
instagram_accuracy_unbalanced = accuracy_score(instagram_test_unbalanced['Sentiment'], instagram_predictions_unbalanced)
instagram_precision_unbalanced = precision_score(instagram_test_unbalanced['Sentiment'], instagram_predictions_unbalanced, average='weighted', labels=['positif', 'negatif', 'netral'])
instagram_recall_unbalanced = recall_score(instagram_test_unbalanced['Sentiment'], instagram_predictions_unbalanced, average='weighted', labels=['positif', 'negatif', 'netral'])
instagram_f1_unbalanced = f1_score(instagram_test_unbalanced['Sentiment'], instagram_predictions_unbalanced, average='weighted', labels=['positif', 'negatif', 'netral'])
# Instagram Metrics (Balanced)
instagram_test_vect_balanced = instagram_vectorizer_balanced.transform(instagram_test_balanced['preprocessing_comment'])
instagram_test_selected_balanced = instagram_selector_balanced.transform(instagram_test_vect_balanced)
instagram_predictions_balanced = instagram_model_balanced.predict(instagram_test_selected_balanced)
# Calculate Metrics for Instagram (Balanced)
instagram_accuracy_balanced = accuracy_score(instagram_test_balanced['Sentiment'], instagram_predictions_balanced)
instagram_precision_balanced = precision_score(instagram_test_balanced['Sentiment'], instagram_predictions_balanced, average='weighted', labels=['positif', 'negatif', 'netral'])
instagram_recall_balanced = recall_score(instagram_test_balanced['Sentiment'], instagram_predictions_balanced, average='weighted', labels=['positif', 'negatif', 'netral'])
instagram_f1_balanced = f1_score(instagram_test_balanced['Sentiment'], instagram_predictions_balanced, average='weighted', labels=['positif', 'negatif', 'netral'])
# Twitter Metrics (Unbalanced)
twitter_test_vect_unbalanced = twitter_vectorizer_unbalanced.transform(twitter_test_unbalanced['preprocessing_comment'])
twitter_test_selected_unbalanced = twitter_selector_unbalanced.transform(twitter_test_vect_unbalanced)
twitter_predictions_unbalanced = twitter_model_unbalanced.predict(twitter_test_selected_unbalanced)
# Calculate Metrics for Twitter (Unbalanced)
twitter_accuracy_unbalanced = accuracy_score(twitter_test_unbalanced['Sentiment'], twitter_predictions_unbalanced)
twitter_precision_unbalanced = precision_score(twitter_test_unbalanced['Sentiment'], twitter_predictions_unbalanced, average='weighted', labels=['positif', 'negatif', 'netral'])
twitter_recall_unbalanced = recall_score(twitter_test_unbalanced['Sentiment'], twitter_predictions_unbalanced, average='weighted', labels=['positif', 'negatif', 'netral'])
twitter_f1_unbalanced = f1_score(twitter_test_unbalanced['Sentiment'], twitter_predictions_unbalanced, average='weighted', labels=['positif', 'negatif', 'netral'])
# Twitter Metrics (Balanced)
twitter_test_vect_balanced = twitter_vectorizer_balanced.transform(twitter_test_balanced['preprocessing_comment'])
twitter_test_selected_balanced = twitter_selector_balanced.transform(twitter_test_vect_balanced)
twitter_predictions_balanced = twitter_model_balanced.predict(twitter_test_selected_balanced)
# Calculate Metrics for Twitter (Balanced)
twitter_accuracy_balanced = accuracy_score(twitter_test_balanced['Sentiment'], twitter_predictions_balanced)
twitter_precision_balanced = precision_score(twitter_test_balanced['Sentiment'], twitter_predictions_balanced, average='weighted', labels=['positif', 'negatif', 'netral'])
twitter_recall_balanced = recall_score(twitter_test_balanced['Sentiment'], twitter_predictions_balanced, average='weighted', labels=['positif', 'negatif', 'netral'])
twitter_f1_balanced = f1_score(twitter_test_balanced['Sentiment'], twitter_predictions_balanced, average='weighted', labels=['positif', 'negatif', 'netral'])
# Confusion Matrices
instagram_conf_matrix_unbalanced = confusion_matrix(instagram_test_unbalanced['Sentiment'], instagram_predictions_unbalanced, labels=['positif', 'negatif', 'netral'])
instagram_conf_matrix_balanced = confusion_matrix(instagram_test_balanced['Sentiment'], instagram_predictions_balanced, labels=['positif', 'negatif', 'netral'])
twitter_conf_matrix_unbalanced = confusion_matrix(twitter_test_unbalanced['Sentiment'], twitter_predictions_unbalanced, labels=['positif', 'negatif', 'netral'])
twitter_conf_matrix_balanced = confusion_matrix(twitter_test_balanced['Sentiment'], twitter_predictions_balanced, labels=['positif', 'negatif', 'netral'])
# Calculate manual metrics per class based on confusion matrices
# For Instagram Unbalanced
instagram_manual_metrics_unbalanced = calculate_metrics_from_conf_matrix(instagram_conf_matrix_unbalanced)
# For Instagram Balanced
instagram_manual_metrics_balanced = calculate_metrics_from_conf_matrix(instagram_conf_matrix_balanced)
# For Twitter Unbalanced
twitter_manual_metrics_unbalanced = calculate_metrics_from_conf_matrix(twitter_conf_matrix_unbalanced)
# For Twitter Balanced
twitter_manual_metrics_balanced = calculate_metrics_from_conf_matrix(twitter_conf_matrix_balanced)
return render_template(
'accuracy.html',
# Accuracy values
instagram_accuracy_unbalanced=instagram_accuracy_unbalanced * 100, # Convert to percentage
instagram_accuracy_balanced=instagram_accuracy_balanced * 100,
twitter_accuracy_unbalanced=twitter_accuracy_unbalanced * 100,
twitter_accuracy_balanced=twitter_accuracy_balanced * 100,
# Precision values
instagram_precision_unbalanced=instagram_precision_unbalanced,
instagram_precision_balanced=instagram_precision_balanced,
twitter_precision_unbalanced=twitter_precision_unbalanced,
twitter_precision_balanced=twitter_precision_balanced,
# Recall values
instagram_recall_unbalanced=instagram_recall_unbalanced,
instagram_recall_balanced=instagram_recall_balanced,
twitter_recall_unbalanced=twitter_recall_unbalanced,
twitter_recall_balanced=twitter_recall_balanced,
# F1 values
instagram_f1_unbalanced=instagram_f1_unbalanced,
instagram_f1_balanced=instagram_f1_balanced,
twitter_f1_unbalanced=twitter_f1_unbalanced,
twitter_f1_balanced=twitter_f1_balanced,
# Confusion Matrices
instagram_conf_matrix_unbalanced=instagram_conf_matrix_unbalanced,
instagram_conf_matrix_balanced=instagram_conf_matrix_balanced,
twitter_conf_matrix_unbalanced=twitter_conf_matrix_unbalanced,
twitter_conf_matrix_balanced=twitter_conf_matrix_balanced,
# Manual metrics per class
instagram_manual_metrics_unbalanced=instagram_manual_metrics_unbalanced,
instagram_manual_metrics_balanced=instagram_manual_metrics_balanced,
twitter_manual_metrics_unbalanced=twitter_manual_metrics_unbalanced,
twitter_manual_metrics_balanced=twitter_manual_metrics_balanced
)
def calculate_metrics_from_conf_matrix(conf_matrix):
"""
Calculate precision, recall, and F1-score for each class from confusion matrix
Args:
conf_matrix: Confusion matrix in format [positif, negatif, netral]
Returns:
Dictionary with metrics for each class
"""
classes = ['positif', 'negatif', 'netral']
metrics = {}
for i, class_name in enumerate(classes):
# True Positives: diagonal element for this class
tp = conf_matrix[i, i]
# False Positives: sum of column i except diagonal element
fp = sum(conf_matrix[:, i]) - tp
# False Negatives: sum of row i except diagonal element
fn = sum(conf_matrix[i, :]) - tp
# True Negatives: sum of all elements except row i and column i
tn = conf_matrix.sum() - (tp + fp + fn)
# Calculate metrics
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
metrics[class_name] = {
'precision': precision,
'recall': recall,
'f1': f1,
'tp': int(tp), # Convert to int for better display in template
'fp': int(fp),
'fn': int(fn),
'tn': int(tn)
}
return metrics
# Function to plot and display sentiment distribution
def plot_sentiment_distribution(sentiment_count, title, ax):
ax.bar(sentiment_count.index, sentiment_count.values, color='skyblue')
ax.set_title(title)
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
# Add numbers on top of the bars
for i, v in enumerate(sentiment_count.values):
ax.text(i, v + 5, str(v), ha='center', va='bottom', fontsize=12, color='black')
@app.route('/preprocessing', methods=['GET', 'POST'])
def preprocessing():
result = None # Initialize result variable
error = None # Initialize error variable
page = request.args.get('page', 1, type=int) # Get page number from query string, default 1
# Default values for platform, process_type, and per_page
platform = request.args.get('platform', 'Instagram') # Default to 'Instagram'
process_type = request.args.get('process_type', 'Normalisasi') # Default to 'Normalisasi'
per_page = int(request.args.get('per_page', 10)) # Default to 10 per page
# Initialize pagination as a safe empty dict
pagination = {
'total': 0,
'per_page': per_page,
'page': page,
'pages': 1
}
if request.method == 'POST':
# Get form values
platform = request.form.get('platform', platform) # Use default if not set
process_type = request.form.get('process_type', process_type) # Use default if not set
per_page = int(request.form.get('per_page', per_page)) # Use default if not set
# Check if platform or process_type are missing
if not platform or not process_type:
error = "Platform or process type is missing!" # Set error message
return render_template('preprocessing.html', error=error)
try:
# Load the corresponding data based on platform and process type
if platform == 'Instagram':
if process_type == 'Normalisasi':
data = pd.read_csv('dataset/olahdata/preprocessing_ig_normalisasi.csv')
elif process_type == 'Tokenisasi':
data = pd.read_csv('dataset/olahdata/preprocessing_ig_tokenisasi.csv')
elif process_type == 'Stopword':
data = pd.read_csv('dataset/olahdata/preprocessing_ig_stopword.csv')
elif process_type == 'Stemming':
data = pd.read_csv('dataset/olahdata/preprocessing_ig_stemming.csv')
elif platform == 'Twitter':
if process_type == 'Normalisasi':
data = pd.read_csv('dataset/olahdata/preprocessing_twitter_normalisasi.csv')
elif process_type == 'Tokenisasi':
data = pd.read_csv('dataset/olahdata/preprocessing_twitter_tokenisasi.csv')
elif process_type == 'Stopword':
data = pd.read_csv('dataset/olahdata/preprocessing_twitter_stopword.csv')
elif process_type == 'Stemming':
data = pd.read_csv('dataset/olahdata/preprocessing_twitter_stemming.csv')
# Check if data is loaded and contains the expected columns
if data.empty:
error = "No data found in the selected file."
return render_template('preprocessing.html', error=error)
before_column = 'before_' + process_type.lower()
after_column = 'after_' + process_type.lower()
# Check if the columns exist in the data
if before_column not in data.columns or after_column not in data.columns:
error = f"Columns {before_column} or {after_column} not found in the data."
return render_template('preprocessing.html', error=error)
# Pagination logic: handle the data per page
total_data = len(data)
before_data = data[before_column].iloc[(page-1)*per_page:page*per_page]
after_data = data[after_column].iloc[(page-1)*per_page:page*per_page]
# Update pagination dictionary
pagination = {
'total': total_data,
'per_page': per_page,
'page': page,
'pages': (total_data // per_page) + (1 if total_data % per_page > 0 else 0)
}
# Assign result
result = {
'before': before_data,
'after': after_data,
}
except Exception as e:
error = f"An error occurred while processing data: {str(e)}" # Handle any data loading or processing errors
return render_template('preprocessing.html', error=error)
return render_template('preprocessing.html', result=result, error=error, platform=platform, process_type=process_type, per_page=per_page, page=page, pagination=pagination)
# Function to generate sentiment distribution bar chart
def generate_sentiment_distribution(data):
sentiment_counts = data['Sentiment'].value_counts()
# Create the plot
fig, ax = plt.subplots()
sentiment_counts.plot(kind='bar', ax=ax, color=['#ff6666', '#66b3ff', '#99ff99'])
ax.set_title('Sentiment Distribution')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
# Save the plot to a BytesIO object
img = io.BytesIO()
plt.savefig(img, format='png')
img.seek(0)
# Encode the image as base64 for embedding in the HTML
return base64.b64encode(img.getvalue()).decode()
# Function to generate WordCloud
def generate_wordcloud(data, sentiment_type):
sentiment_data = data[data['Sentiment'] == sentiment_type]['Cleaned Comment']
wordcloud = WordCloud(width=800, height=400).generate(' '.join(sentiment_data))
img_wc = io.BytesIO()
wordcloud.to_image().save(img_wc, format='PNG')
img_wc.seek(0)
return base64.b64encode(img_wc.getvalue()).decode()
# Function to generate WordCloud for Total Sentiment
def generate_total_wordcloud(data):
# Combine all sentiment types (positive, negative, and neutral) into a single string
total_sentiment_data = " ".join(data['Cleaned Comment']) # Join all cleaned comments
wordcloud = WordCloud(width=800, height=400).generate(total_sentiment_data)
img_wc = io.BytesIO()
wordcloud.to_image().save(img_wc, format='PNG')
img_wc.seek(0)
return base64.b64encode(img_wc.getvalue()).decode()
@app.route('/visualization')
def visualization():
# Load data
instagram_data_unbalanced = pd.read_csv('dataset/before_softbal_label_ig.csv')
instagram_data_balanced = pd.read_csv('dataset/label_ig.csv')
twitter_data_unbalanced = pd.read_csv('dataset/before_softbal_label_twit.csv')
twitter_data_balanced = pd.read_csv('dataset/label_twit.csv')
# Calculate sentiment counts for Instagram and Twitter
instagram_sentiment_count_unbalanced = instagram_data_unbalanced['Sentiment'].value_counts()
instagram_sentiment_count_balanced = instagram_data_balanced['Sentiment'].value_counts()
twitter_sentiment_count_unbalanced = twitter_data_unbalanced['Sentiment'].value_counts()
twitter_sentiment_count_balanced = twitter_data_balanced['Sentiment'].value_counts()
# Generate WordClouds for Instagram (Balanced)
instagram_wordcloud_b64_pos_balanced = generate_wordcloud(instagram_data_balanced, 'positif')
instagram_wordcloud_b64_neg_balanced = generate_wordcloud(instagram_data_balanced, 'negatif')
instagram_wordcloud_b64_neu_balanced = generate_wordcloud(instagram_data_balanced, 'netral')
instagram_wordcloud_b64_total_balanced = generate_total_wordcloud(instagram_data_balanced)
# Generate WordClouds for Instagram (Unbalanced)
instagram_wordcloud_b64_pos_unbalanced = generate_wordcloud(instagram_data_unbalanced, 'positif')
instagram_wordcloud_b64_neg_unbalanced = generate_wordcloud(instagram_data_unbalanced, 'negatif')
instagram_wordcloud_b64_neu_unbalanced = generate_wordcloud(instagram_data_unbalanced, 'netral')
instagram_wordcloud_b64_total_unbalanced = generate_total_wordcloud(instagram_data_unbalanced)
# Generate WordClouds for Twitter (Balanced)
twitter_wordcloud_b64_pos_balanced = generate_wordcloud(twitter_data_balanced, 'positif')
twitter_wordcloud_b64_neg_balanced = generate_wordcloud(twitter_data_balanced, 'negatif')
twitter_wordcloud_b64_neu_balanced = generate_wordcloud(twitter_data_balanced, 'netral')
twitter_wordcloud_b64_total_balanced = generate_total_wordcloud(twitter_data_balanced)
# Generate WordClouds for Twitter (Unbalanced)
twitter_wordcloud_b64_pos_unbalanced = generate_wordcloud(twitter_data_unbalanced, 'positif')
twitter_wordcloud_b64_neg_unbalanced = generate_wordcloud(twitter_data_unbalanced, 'negatif')
twitter_wordcloud_b64_neu_unbalanced = generate_wordcloud(twitter_data_unbalanced, 'netral')
twitter_wordcloud_b64_total_unbalanced = generate_total_wordcloud(twitter_data_unbalanced)
# Generate Sentiment Distribution Diagrams (Instagram)
instagram_img_b64_unbalanced = generate_sentiment_distribution(instagram_data_unbalanced)
instagram_img_b64_balanced = generate_sentiment_distribution(instagram_data_balanced)
# Generate Sentiment Distribution Diagrams (Twitter)
twitter_img_b64_unbalanced = generate_sentiment_distribution(twitter_data_unbalanced)
twitter_img_b64_balanced = generate_sentiment_distribution(twitter_data_balanced)
# Render the HTML template and pass all the necessary data
return render_template(
'visualization.html',
instagram_img_b64_unbalanced=instagram_img_b64_unbalanced,
instagram_img_b64_balanced=instagram_img_b64_balanced,
twitter_img_b64_unbalanced=twitter_img_b64_unbalanced,
twitter_img_b64_balanced=twitter_img_b64_balanced,
instagram_wordcloud_b64_pos_balanced=instagram_wordcloud_b64_pos_balanced,
instagram_wordcloud_b64_neg_balanced=instagram_wordcloud_b64_neg_balanced,
instagram_wordcloud_b64_neu_balanced=instagram_wordcloud_b64_neu_balanced,
instagram_wordcloud_b64_total_balanced=instagram_wordcloud_b64_total_balanced,
instagram_wordcloud_b64_pos_unbalanced=instagram_wordcloud_b64_pos_unbalanced,
instagram_wordcloud_b64_neg_unbalanced=instagram_wordcloud_b64_neg_unbalanced,
instagram_wordcloud_b64_neu_unbalanced=instagram_wordcloud_b64_neu_unbalanced,
instagram_wordcloud_b64_total_unbalanced=instagram_wordcloud_b64_total_unbalanced,
twitter_wordcloud_b64_pos_balanced=twitter_wordcloud_b64_pos_balanced,
twitter_wordcloud_b64_neg_balanced=twitter_wordcloud_b64_neg_balanced,
twitter_wordcloud_b64_neu_balanced=twitter_wordcloud_b64_neu_balanced,
twitter_wordcloud_b64_total_balanced=twitter_wordcloud_b64_total_balanced,
twitter_wordcloud_b64_pos_unbalanced=twitter_wordcloud_b64_pos_unbalanced,
twitter_wordcloud_b64_neg_unbalanced=twitter_wordcloud_b64_neg_unbalanced,
twitter_wordcloud_b64_neu_unbalanced=twitter_wordcloud_b64_neu_unbalanced,
twitter_wordcloud_b64_total_unbalanced=twitter_wordcloud_b64_total_unbalanced,
instagram_sentiment_count_unbalanced=instagram_sentiment_count_unbalanced,
instagram_sentiment_count_balanced=instagram_sentiment_count_balanced,
twitter_sentiment_count_unbalanced=twitter_sentiment_count_unbalanced,
twitter_sentiment_count_balanced=twitter_sentiment_count_balanced
)
if __name__ == '__main__':
app.run(debug=True)