# TIFNJK_E41221742_Renaldi-En.../utils.py

import pandas as pd
import numpy as np
import json
import h5py
import re
import pickle
import os
import streamlit as st
import tensorflow as tf

try:
    from tensorflow.keras.utils import pad_sequences
except ImportError:
    from tensorflow.keras.preprocessing.sequence import pad_sequences

# ==============================================================================
# 1. GLOBAL CONFIGURATION
# ==============================================================================
# Length passed as `maxlen` to pad_sequences in predict_sentiment: every
# tokenized input is padded or truncated to this many tokens.
MAX_SEQUENCE_LENGTH = 100
# Keras model file (HDF5) that load_resources checks for and loads.
# All paths below are relative, so they resolve against the working directory.
MODEL_PATH = 'model/Model_Sentiment_LSTM.h5'
# Tokenizer sources: load_resources uses the JSON file when it exists and
# only falls back to the pickle file otherwise.
TOKENIZER_JSON_PATH = 'model/tokenizer_sentiment.json'
TOKENIZER_PICKLE_PATH = 'model/tokenizer_sentiment.pickle'

# ==============================================================================
# 2. PATCHING MODEL
# ==============================================================================
def recursive_fix_config(config):
    """Patch a model config so it can be read across different TF versions.

    Walks nested lists and dicts. In every dict:

    * a ``batch_shape`` key is renamed to ``batch_input_shape``;
    * a ``dtype`` value that is a dict, or whose string form contains
      ``Policy``, is replaced by the plain string ``'float32'``.

    Dicts are patched in place and the same object is returned. Lists are
    rebuilt as new lists. Any other value is returned unchanged.
    """
    if isinstance(config, dict):
        if 'batch_shape' in config:
            config['batch_input_shape'] = config.pop('batch_shape')

        # A missing 'dtype' yields None here, which matches neither test below.
        dtype_value = config.get('dtype')
        if isinstance(dtype_value, dict) or 'Policy' in str(dtype_value):
            config['dtype'] = 'float32'

        # Snapshot the keys; only existing keys are reassigned, never added.
        for key in list(config):
            config[key] = recursive_fix_config(config[key])
        return config

    if isinstance(config, list):
        return list(map(recursive_fix_config, config))

    return config

# ==============================================================================
# 3. LOAD RESOURCES (MODEL & TOKENIZER)
# ==============================================================================
def _load_model_from_patched_config(model_path):
    """Rebuild the model from the config stored in the HDF5 file, then load weights.

    The JSON config is read from the file's ``model_config`` attribute, passed
    through ``recursive_fix_config``, and used to build the architecture.

    Raises:
        ValueError: if the file has no ``model_config`` attribute.
    """
    # Read the attribute and close the file before Keras reopens the same
    # path in load_weights.
    with h5py.File(model_path, mode='r') as f:
        raw_config = f.attrs.get('model_config')

    if raw_config is None:
        raise ValueError(f"'model_config' attribute not found in {model_path}")
    if isinstance(raw_config, bytes):
        raw_config = raw_config.decode('utf-8')

    fixed_config = recursive_fix_config(json.loads(raw_config))
    model = tf.keras.models.model_from_json(json.dumps(fixed_config))
    model.load_weights(model_path)
    return model


def _read_tokenizer_json(path):
    """Return the tokenizer config at ``path`` as a JSON string.

    Handles three file shapes: a JSON document (re-serialized), a JSON string
    that itself wraps the document (unwrapped one level), and content that is
    not valid JSON (returned verbatim so the tokenizer loader reports it).
    """
    with open(path, 'r', encoding='utf-8') as f:
        content = f.read()

    try:
        parsed = json.loads(content)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return content
    return parsed if isinstance(parsed, str) else json.dumps(parsed)


@st.cache_resource
def load_resources():
    """Load the sentiment model and its tokenizer.

    The model is loaded directly with Keras first. If that fails for any
    reason, it is rebuilt from a patched copy of the config stored in the HDF5
    file. The tokenizer comes from the JSON file when it exists, otherwise
    from the pickle file.

    Every failure is reported through ``st.error``; nothing is raised.

    NOTE(review): the function is wrapped in ``st.cache_resource``, so a
    failed ``(None, None)`` result is presumably cached as well - confirm.

    Returns:
        ``(model, tokenizer)`` on success, ``(None, None)`` on any failure.
    """
    # --- A. LOAD MODEL ---
    if not os.path.exists(MODEL_PATH):
        st.error(f"❌ File model tidak ditemukan di: {MODEL_PATH}")
        return None, None

    try:
        model = tf.keras.models.load_model(MODEL_PATH, compile=False)
    except Exception:
        # Deliberate best-effort fallback: retry with a patched config.
        try:
            model = _load_model_from_patched_config(MODEL_PATH)
        except Exception as e:
            st.error(f"❌ Gagal memuat model: {e}")
            return None, None

    # --- B. LOAD TOKENIZER ---
    try:
        if os.path.exists(TOKENIZER_JSON_PATH):
            tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(
                _read_tokenizer_json(TOKENIZER_JSON_PATH)
            )
        elif os.path.exists(TOKENIZER_PICKLE_PATH):
            # NOTE(security): pickle.load can execute arbitrary code; only
            # use pickle files that come from a trusted source.
            with open(TOKENIZER_PICKLE_PATH, 'rb') as handle:
                tokenizer = pickle.load(handle)
        else:
            st.error("❌ File Tokenizer tidak ditemukan.")
            return None, None
    except Exception as e:
        st.error(f"❌ Gagal memuat tokenizer: {e}")
        return None, None

    return model, tokenizer

# ==============================================================================
# 4. TEXT PREPROCESSING
# ==============================================================================
# Maps informal Indonesian abbreviations and slang tokens to a standard form.
# clean_text looks up each whitespace-separated token here after lowercasing
# and stripping, and leaves tokens without an entry unchanged.
slang_dict = {
    'bgt': 'banget', 'yg': 'yang', 'gak': 'tidak', 'ga': 'tidak',
    'kalo': 'kalau', 'kl': 'kalau', 'dr': 'dari', 'krn': 'karena',
    'jd': 'jadi', 'sdh': 'sudah', 'aja': 'saja', 'dgn': 'dengan',
    'tdk': 'tidak', 'tp': 'tapi', 'sy': 'saya', 'utk': 'untuk',
    'd': 'di', 'blm': 'belum', 'jgn': 'jangan', 'gw': 'saya',
    'lo': 'kamu', 'sm': 'sama', 'tau': 'tahu', 'kpn': 'kapan',
    'bs': 'bisa', 'lbh': 'lebih', 'kmrn': 'kemarin',
    'nggak': 'tidak', 'enggak': 'tidak', 'gk': 'tidak',
    'kaga': 'tidak', 'tak': 'tidak', 'g': 'tidak',
    'bener': 'benar', 'bnr': 'benar', 'msh': 'masih',
    'udah': 'sudah', 'sprt': 'seperti', 'opr': 'operasional',
    'tlg': 'tolong', 'bkn': 'bukan', 'aq': 'aku', 'km': 'kamu', 'dlm': 'dalam'
}

def clean_text(text):
    """Normalize raw text before tokenization.

    Steps, in order: lowercase; remove URLs, @mentions, #hashtags and digit
    runs; replace every character that is neither a word character nor
    whitespace with a space; collapse whitespace; then replace each token
    that has an entry in ``slang_dict``.

    Returns the cleaned string, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    # (pattern, replacement, flags), applied strictly in this order.
    substitutions = (
        (r'http\S+|www\S+|https\S+', '', re.MULTILINE),
        (r'@\w+', '', 0),
        (r'#\w+', '', 0),
        (r'\d+', '', 0),
        (r'[^\w\s]', ' ', 0),
        (r'\s+', ' ', 0),
    )

    cleaned = text.lower()
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    tokens = cleaned.strip().split()
    return " ".join(slang_dict.get(token, token) for token in tokens)

# ==============================================================================
# 5. PREDICTION
# ==============================================================================
def predict_sentiment(text, model, tokenizer):
    """Classify the sentiment of a single text.

    The text is cleaned with ``clean_text``, converted to a sequence by the
    tokenizer, post-padded/post-truncated to ``MAX_SEQUENCE_LENGTH`` and fed
    to ``model.predict``.

    Returns:
        A 4-tuple ``(label, confidence, prediction, cleaned_text)``:
        ``label`` is the entry of ('Negatif', 'Netral', 'Positif') at the
        argmax of the prediction, ``confidence`` is that score times 100,
        ``prediction`` is the first row of the model output, and
        ``cleaned_text`` is the preprocessed input.
        If ``text``, ``model`` or ``tokenizer`` is falsy, returns
        ``("Error", 0.0, [0, 0, 0], text)`` with the original text.

    NOTE(review): assumes the model emits one score per label in the order
    Negatif, Netral, Positif - confirm against the training setup.
    """
    if not (text and model and tokenizer):
        return "Error", 0.0, [0, 0, 0], text

    cleaned_text = clean_text(text)
    padded = pad_sequences(
        tokenizer.texts_to_sequences([cleaned_text]),
        maxlen=MAX_SEQUENCE_LENGTH,
        padding='post',
        truncating='post',
    )

    # predict returns one row per input; only a single text was passed in.
    prediction = model.predict(padded, verbose=0)[0]

    class_names = ('Negatif', 'Netral', 'Positif')
    best = np.argmax(prediction)
    return class_names[best], prediction[best] * 100, prediction, cleaned_text