TIFNJK_E41221742_Renaldi-En.../utils.py

150 lines
5.8 KiB
Python

import pandas as pd
import numpy as np
import json
import h5py
import re
import pickle
import os
import streamlit as st
import tensorflow as tf
try:
from tensorflow.keras.utils import pad_sequences
except ImportError:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# ==============================================================================
# 1. GLOBAL CONFIGURATION
# ==============================================================================
# Length every tokenized input is padded or truncated to before prediction
# (passed as ``maxlen`` to ``pad_sequences``).
MAX_SEQUENCE_LENGTH = 100
# Saved Keras model in HDF5 format; the path is relative to the working directory.
MODEL_PATH = 'model/Model_Sentiment_LSTM.h5'
# Tokenizer serialized as JSON; load_resources() tries this file first.
TOKENIZER_JSON_PATH = 'model/tokenizer_sentiment.json'
# Pickled tokenizer; load_resources() uses it only when the JSON file is absent.
TOKENIZER_PICKLE_PATH = 'model/tokenizer_sentiment.pickle'
# ==============================================================================
# 2. PATCHING MODEL
# ==============================================================================
def recursive_fix_config(config):
    """Patch a deserialized model config so other TF/Keras versions can read it.

    Walks nested lists and dicts. In every dict encountered:

    * a ``batch_shape`` key is renamed to ``batch_input_shape``;
    * a ``dtype`` value that is a dict, or whose string form contains
      ``"Policy"``, is replaced by a plain dtype-name string. The name is
      taken from ``dtype["config"]["name"]`` when that entry is a non-empty
      string, so the dtype recorded in the file is preserved; otherwise it
      falls back to ``"float32"``.

    Args:
        config: A JSON-like structure (dict, list, or scalar).

    Returns:
        The patched structure. Dicts are modified in place and returned as
        the same object; lists are rebuilt; other values are returned as-is.
    """
    if isinstance(config, list):
        return [recursive_fix_config(item) for item in config]
    if not isinstance(config, dict):
        return config

    if 'batch_shape' in config:
        config['batch_input_shape'] = config.pop('batch_shape')

    dtype = config.get('dtype')
    if isinstance(dtype, dict) or 'Policy' in str(dtype):
        policy_name = None
        if isinstance(dtype, dict):
            policy_config = dtype.get('config')
            if isinstance(policy_config, dict):
                policy_name = policy_config.get('name')
        # Keep the serialized dtype name instead of forcing float32, so a
        # model saved with e.g. a float16 policy is not silently changed.
        if isinstance(policy_name, str) and policy_name:
            config['dtype'] = policy_name
        else:
            config['dtype'] = 'float32'

    # Only existing keys are reassigned, so the dict size never changes
    # while it is being iterated.
    for key, value in config.items():
        config[key] = recursive_fix_config(value)
    return config
# ==============================================================================
# 3. LOAD RESOURCES (MODEL & TOKENIZER)
# ==============================================================================
@st.cache_resource
def load_resources():
    """Load the sentiment model and its tokenizer.

    The model is read from ``MODEL_PATH`` with ``load_model``. If that raises,
    the model config stored in the HDF5 file is patched with
    ``recursive_fix_config``, the architecture is rebuilt from the patched
    JSON, and the weights are loaded from the same file.

    The tokenizer is read from ``TOKENIZER_JSON_PATH`` when that file exists,
    otherwise from ``TOKENIZER_PICKLE_PATH``.

    Returns:
        ``(model, tokenizer)`` on success, or ``(None, None)`` after the
        problem has been reported with ``st.error``.

    NOTE(review): the function is wrapped in ``st.cache_resource``; a
    ``(None, None)`` failure result is presumably cached as well — confirm
    whether the cache must be cleared after fixing a missing file.
    """
    # --- A. LOAD MODEL ---
    if not os.path.exists(MODEL_PATH):
        st.error(f"❌ File model tidak ditemukan di: {MODEL_PATH}")
        return None, None

    try:
        model = tf.keras.models.load_model(MODEL_PATH, compile=False)
    except Exception:
        # Direct loading failed: rebuild the architecture from a patched
        # copy of the config stored in the file, then load the weights.
        try:
            with h5py.File(MODEL_PATH, mode='r') as f:
                model_config_str = f.attrs.get('model_config')
            # The handle is closed before load_weights reopens the same file.
            if isinstance(model_config_str, bytes):
                model_config_str = model_config_str.decode('utf-8')
            model_config_dict = json.loads(model_config_str)
            fixed_config = recursive_fix_config(model_config_dict)
            model = tf.keras.models.model_from_json(json.dumps(fixed_config))
            model.load_weights(MODEL_PATH)
        except Exception as e:
            st.error(f"❌ Gagal memuat model: {e}")
            return None, None

    # --- B. LOAD TOKENIZER ---
    try:
        if os.path.exists(TOKENIZER_JSON_PATH):
            with open(TOKENIZER_JSON_PATH, 'r', encoding='utf-8') as f:
                content = f.read()
            try:
                parsed_json = json.loads(content)
            except json.JSONDecodeError:
                # Not parseable here; hand the raw text to the tokenizer
                # loader so any failure is reported by the handler below.
                input_tokenizer = content
            else:
                if isinstance(parsed_json, str):
                    # The file held a JSON string that itself contains the
                    # tokenizer JSON (double-encoded); use the inner string.
                    input_tokenizer = parsed_json
                else:
                    input_tokenizer = json.dumps(parsed_json)
            tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(input_tokenizer)
        elif os.path.exists(TOKENIZER_PICKLE_PATH):
            # SECURITY: pickle.load executes arbitrary code embedded in the
            # file; only ship a tokenizer pickle from a trusted source.
            with open(TOKENIZER_PICKLE_PATH, 'rb') as handle:
                tokenizer = pickle.load(handle)
        else:
            st.error("❌ File Tokenizer tidak ditemukan.")
            return None, None
    except Exception as e:
        st.error(f"❌ Gagal memuat tokenizer: {e}")
        return None, None

    return model, tokenizer
# ==============================================================================
# 4. TEXT PREPROCESSING
# ==============================================================================
# Informal / abbreviated Indonesian tokens mapped to the word that replaces
# them during normalization in clean_text().
slang_dict = {
    'bgt': 'banget', 'yg': 'yang', 'gak': 'tidak', 'ga': 'tidak',
    'kalo': 'kalau', 'kl': 'kalau', 'dr': 'dari', 'krn': 'karena',
    'jd': 'jadi', 'sdh': 'sudah', 'aja': 'saja', 'dgn': 'dengan',
    'tdk': 'tidak', 'tp': 'tapi', 'sy': 'saya', 'utk': 'untuk',
    'd': 'di', 'blm': 'belum', 'jgn': 'jangan', 'gw': 'saya',
    'lo': 'kamu', 'sm': 'sama', 'tau': 'tahu', 'kpn': 'kapan',
    'bs': 'bisa', 'lbh': 'lebih', 'kmrn': 'kemarin',
    'nggak': 'tidak', 'enggak': 'tidak', 'gk': 'tidak',
    'kaga': 'tidak', 'tak': 'tidak', 'g': 'tidak',
    'bener': 'benar', 'bnr': 'benar', 'msh': 'masih',
    'udah': 'sudah', 'sprt': 'seperti', 'opr': 'operasional',
    'tlg': 'tolong', 'bkn': 'bukan', 'aq': 'aku', 'km': 'kamu', 'dlm': 'dalam',
}

# Ordered (pattern, replacement) substitutions applied by clean_text().
_CLEANUP_STEPS = (
    (re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE), ''),  # URLs
    (re.compile(r'@\w+'), ''),       # @mentions
    (re.compile(r'#\w+'), ''),       # #hashtags
    (re.compile(r'\d+'), ''),        # digit runs
    (re.compile(r'[^\w\s]'), ' '),   # anything that is not a word char or whitespace
    (re.compile(r'\s+'), ' '),       # collapse whitespace runs
)


def clean_text(text):
    """Normalize raw text.

    Lower-cases the input, removes URLs, @mentions, #hashtags and digits,
    replaces remaining punctuation with spaces, collapses whitespace, and
    finally substitutes every token found in ``slang_dict``.

    Args:
        text: The raw input. Anything that is not a ``str`` yields ``""``.

    Returns:
        The cleaned text as a single space-separated string.
    """
    if not isinstance(text, str):
        return ""

    cleaned = text.lower()
    for pattern, replacement in _CLEANUP_STEPS:
        cleaned = pattern.sub(replacement, cleaned)

    tokens = cleaned.strip().split()
    return " ".join(slang_dict.get(token, token) for token in tokens)
# ==============================================================================
# 5. PREDICTION
# ==============================================================================
def predict_sentiment(text, model, tokenizer):
    """Classify the sentiment of ``text`` with the given model and tokenizer.

    The text is normalized with ``clean_text``, converted to a sequence by
    ``tokenizer.texts_to_sequences``, post-padded/post-truncated to
    ``MAX_SEQUENCE_LENGTH`` and passed to ``model.predict``.

    Args:
        text: Raw input text.
        model: Object exposing ``predict(batch, verbose=0)``.
        tokenizer: Object exposing ``texts_to_sequences(list_of_str)``.

    Returns:
        A 4-tuple ``(label, confidence, prediction, cleaned_text)``:
        the label of the highest-scoring output, that score multiplied by
        100, the first row of the model output, and the normalized text.
        When any argument is falsy the result is
        ``("Error", 0.0, [0, 0, 0], text)``.
    """
    if not (text and model and tokenizer):
        return "Error", 0.0, [0, 0, 0], text

    cleaned_text = clean_text(text)
    sequences = tokenizer.texts_to_sequences([cleaned_text])
    model_input = pad_sequences(
        sequences,
        maxlen=MAX_SEQUENCE_LENGTH,
        padding='post',
        truncating='post',
    )

    # Single-item batch, so only the first output row is relevant.
    scores = model.predict(model_input, verbose=0)[0]
    best = np.argmax(scores)

    # NOTE(review): assumes the model emits three scores in this label
    # order — confirm against the training label encoding.
    label = ('Negatif', 'Netral', 'Positif')[best]
    return label, scores[best] * 100, scores, cleaned_text