import json
import logging
import os
import pickle
import re

import h5py
import numpy as np
import pandas as pd
import streamlit as st
import tensorflow as tf

# `pad_sequences` lives in different modules depending on the installed
# TensorFlow/Keras version, so try the newer location first.
try:
    from tensorflow.keras.utils import pad_sequences
except ImportError:
    from tensorflow.keras.preprocessing.sequence import pad_sequences

logger = logging.getLogger(__name__)

# ==============================================================================
# 1. GLOBAL CONFIGURATION
# ==============================================================================
MAX_SEQUENCE_LENGTH = 100
MODEL_PATH = 'model/Model_Sentiment_LSTM.h5'
TOKENIZER_JSON_PATH = 'model/tokenizer_sentiment.json'
TOKENIZER_PICKLE_PATH = 'model/tokenizer_sentiment.pickle'


# ==============================================================================
# 2. MODEL PATCHING
# ==============================================================================
def recursive_fix_config(config):
    """Patch a model config so it can be read across TensorFlow versions.

    Walks nested lists and dicts and, in every dict:

    * renames the key ``batch_shape`` to ``batch_input_shape``;
    * replaces a ``dtype`` value that is a dict, or whose string form
      contains ``"Policy"``, with the plain string ``'float32'``.

    Dicts are modified in place; lists are rebuilt. Any other value is
    returned unchanged.

    Args:
        config: A (possibly nested) structure of dicts, lists and scalars,
            e.g. the result of ``json.loads`` on a saved model config.

    Returns:
        The patched structure (the same dict object for dict input).
    """
    if isinstance(config, list):
        return [recursive_fix_config(item) for item in config]
    if not isinstance(config, dict):
        return config

    if 'batch_shape' in config:
        config['batch_input_shape'] = config.pop('batch_shape')

    # NOTE(review): any serialized dtype policy is forced to 'float32',
    # whatever policy was saved -- confirm this is intended for
    # mixed-precision models.
    dtype = config.get('dtype')
    if isinstance(dtype, dict) or 'Policy' in str(dtype):
        config['dtype'] = 'float32'

    # Snapshot the items so reassigning values cannot disturb iteration.
    for key, value in list(config.items()):
        config[key] = recursive_fix_config(value)
    return config


# ==============================================================================
# 3. LOAD RESOURCES (MODEL & TOKENIZER)
# ==============================================================================
def _load_model():
    """Load the Keras model from MODEL_PATH; return None after reporting an error."""
    if not os.path.exists(MODEL_PATH):
        st.error(f"❌ File model tidak ditemukan di: {MODEL_PATH}")
        return None

    try:
        return tf.keras.models.load_model(MODEL_PATH, compile=False)
    except Exception:
        # Best-effort: fall through to the manual rebuild below, but keep a
        # trace of why the direct load failed instead of discarding it.
        logger.warning(
            "Direct load of %s failed; retrying with a patched config",
            MODEL_PATH,
            exc_info=True,
        )

    try:
        with h5py.File(MODEL_PATH, mode='r') as f:
            model_config_str = f.attrs.get('model_config')
        if model_config_str is None:
            raise ValueError("HDF5 file has no 'model_config' attribute")
        # The attribute may come back as bytes or str.
        if isinstance(model_config_str, bytes):
            model_config_str = model_config_str.decode('utf-8')

        fixed_config = recursive_fix_config(json.loads(model_config_str))
        model = tf.keras.models.model_from_json(json.dumps(fixed_config))
        model.load_weights(MODEL_PATH)
        return model
    except Exception as e:
        st.error(f"❌ Gagal memuat model: {e}")
        return None


def _load_tokenizer():
    """Load the tokenizer (JSON preferred, pickle fallback); None after reporting an error."""
    try:
        if os.path.exists(TOKENIZER_JSON_PATH):
            with open(TOKENIZER_JSON_PATH, 'r', encoding='utf-8') as f:
                content = f.read()
            try:
                parsed_json = json.loads(content)
            except ValueError:
                # Not parseable here; hand the raw text to Keras unchanged.
                input_tokenizer = content
            else:
                # A file holding a JSON *string* is double-encoded: the inner
                # string is the real tokenizer JSON. Otherwise the file
                # content already is the tokenizer JSON.
                input_tokenizer = (
                    parsed_json if isinstance(parsed_json, str) else content
                )
            return tf.keras.preprocessing.text.tokenizer_from_json(input_tokenizer)

        if os.path.exists(TOKENIZER_PICKLE_PATH):
            # SECURITY: pickle.load can execute arbitrary code. This is only
            # acceptable because the file is a local artifact shipped with the
            # app; never point this path at untrusted data.
            with open(TOKENIZER_PICKLE_PATH, 'rb') as handle:
                return pickle.load(handle)

        st.error("❌ File Tokenizer tidak ditemukan.")
        return None
    except Exception as e:
        st.error(f"❌ Gagal memuat tokenizer: {e}")
        return None


@st.cache_resource
def load_resources():
    """Load the sentiment model and its tokenizer.

    The model is loaded directly from ``MODEL_PATH``; if that fails, its
    config is read from the HDF5 file, patched with
    ``recursive_fix_config`` and the weights are loaded into the rebuilt
    model. The tokenizer is read from ``TOKENIZER_JSON_PATH`` when that
    file exists, otherwise from ``TOKENIZER_PICKLE_PATH``.

    Failures are reported with ``st.error``.

    NOTE(review): the function is wrapped in ``st.cache_resource``, so a
    failed ``(None, None)`` result is presumably cached as well -- confirm
    against the Streamlit caching documentation.

    Returns:
        ``(model, tokenizer)`` on success, ``(None, None)`` if either
        resource could not be loaded.
    """
    model = _load_model()
    if model is None:
        return None, None

    tokenizer = _load_tokenizer()
    if tokenizer is None:
        return None, None

    return model, tokenizer

# ==============================================================================
# 4. TEXT PREPROCESSING
# ==============================================================================
# Replacement table applied word-by-word in clean_text(): each key that
# appears as a whole word is replaced by its value.
slang_dict = {
    'bgt': 'banget',
    'yg': 'yang',
    'gak': 'tidak',
    'ga': 'tidak',
    'kalo': 'kalau',
    'kl': 'kalau',
    'dr': 'dari',
    'krn': 'karena',
    'jd': 'jadi',
    'sdh': 'sudah',
    'aja': 'saja',
    'dgn': 'dengan',
    'tdk': 'tidak',
    'tp': 'tapi',
    'sy': 'saya',
    'utk': 'untuk',
    'd': 'di',
    'blm': 'belum',
    'jgn': 'jangan',
    'gw': 'saya',
    'lo': 'kamu',
    'sm': 'sama',
    'tau': 'tahu',
    'kpn': 'kapan',
    'bs': 'bisa',
    'lbh': 'lebih',
    'kmrn': 'kemarin',
    'nggak': 'tidak',
    'enggak': 'tidak',
    'gk': 'tidak',
    'kaga': 'tidak',
    'tak': 'tidak',
    'g': 'tidak',
    'bener': 'benar',
    'bnr': 'benar',
    'msh': 'masih',
    'udah': 'sudah',
    'sprt': 'seperti',
    'opr': 'operasional',
    'tlg': 'tolong',
    'bkn': 'bukan',
    'aq': 'aku',
    'km': 'kamu',
    'dlm': 'dalam',
}

# Cleaning steps, applied in this order. Patterns are compiled once at import
# time rather than on every call.
_CLEANING_STEPS = (
    # URLs. `http\S+` already covers https links, so no separate alternative.
    (re.compile(r'http\S+|www\S+'), ''),
    (re.compile(r'@\w+'), ''),       # @mentions
    (re.compile(r'#\w+'), ''),       # #hashtags
    (re.compile(r'\d+'), ''),        # digits
    (re.compile(r'[^\w\s]'), ' '),   # remaining punctuation -> space
)


def clean_text(text):
    """Normalize raw text before tokenization.

    Lower-cases the text, strips URLs, @mentions, #hashtags and digits,
    turns remaining punctuation into spaces, collapses whitespace and
    replaces every word found in ``slang_dict`` with its mapped value.

    Args:
        text: The raw input. Anything that is not a ``str`` yields ``""``.

    Returns:
        The cleaned text as single-space-separated words (possibly ``""``).
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    for pattern, replacement in _CLEANING_STEPS:
        text = pattern.sub(replacement, text)

    # split() with no argument drops leading/trailing whitespace and treats
    # any whitespace run as one separator, so no extra collapsing is needed.
    return " ".join(slang_dict.get(word, word) for word in text.split())


# ==============================================================================
# 5. PREDICTION
# ==============================================================================
def predict_sentiment(text, model, tokenizer):
    """Predict the sentiment label of ``text``.

    Args:
        text: Raw input text.
        model: Object exposing ``predict(padded, verbose=0)``.
            NOTE(review): assumed to output one score per class in the order
            Negatif, Netral, Positif -- confirm against the training code.
        tokenizer: Object exposing ``texts_to_sequences``.

    Returns:
        A 4-tuple ``(label, confidence, prediction, cleaned_text)`` where
        ``confidence`` is the winning score times 100 and ``prediction`` is
        the first row returned by ``model.predict``.

        When ``text`` is empty or ``model``/``tokenizer`` is ``None``,
        ``("Error", 0.0, [0, 0, 0], text)`` is returned. When the text
        cleans down to an empty string, ``("Error", 0.0, [0, 0, 0], "")``
        is returned instead of a prediction on an all-padding sequence.
    """
    # Identity checks: a loaded model/tokenizer should never be rejected
    # because of how its type happens to define truthiness.
    if not text or model is None or tokenizer is None:
        return "Error", 0.0, [0, 0, 0], text

    cleaned_text = clean_text(text)
    if not cleaned_text:
        # Nothing survived cleaning (e.g. only URLs, digits or punctuation,
        # or a non-string input); there is nothing meaningful to score.
        return "Error", 0.0, [0, 0, 0], cleaned_text

    seq = tokenizer.texts_to_sequences([cleaned_text])
    padded = pad_sequences(
        seq,
        maxlen=MAX_SEQUENCE_LENGTH,
        padding='post',
        truncating='post',
    )
    prediction = model.predict(padded, verbose=0)[0]

    labels = ['Negatif', 'Netral', 'Positif']
    label_idx = int(np.argmax(prediction))
    label = labels[label_idx]
    # Plain Python float so callers can format/serialize it without caring
    # about NumPy scalar types.
    confidence = float(prediction[label_idx]) * 100
    return label, confidence, prediction, cleaned_text