from types import SimpleNamespace

import numpy as np
import spacy
import tensorflow as tf
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from tensorflow.keras.layers import LSTM, Embedding, Dense, Input
from tensorflow.keras.models import Model
from transformers import TFT5ForConditionalGeneration, T5Tokenizer
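
# Requirements (assumed): pip install numpy tensorflow spacy nltk transformers sentencepiece
# (the slow T5Tokenizer relies on the sentencepiece package)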


# === LOAD NLP MODEL ===
nlp = spacy.load("en_core_web_sm")
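# If the model is missing, install it first: python -m spacy download en_core_web_sm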


# === PREPROCESSING FUNCTION ===
def preprocess_text(text):
    """Run Named Entity Recognition and dependency parsing on the input text."""
    doc = nlp(text)
    entities = {ent.text: ent.label_ for ent in doc.ents}

    # Print the Named Entity Recognition results
    print("\nNamed Entities Detected:")
    for ent, label in entities.items():
        print(f"{ent}: {label}")

    return entities


# === LSTM MODEL (SEQUENCE-TO-SEQUENCE) ===
embedding_dim = 128
lstm_units = 256
vocab_size = 5000  # Adjust to the dataset

# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(
    decoder_embedding, initial_state=[state_h, state_c]
)
decoder_dense = Dense(vocab_size, activation="softmax")
output = decoder_dense(decoder_outputs)

# Model
lstm_model = Model([encoder_inputs, decoder_inputs], output)
lstm_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
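
# NOTE: lstm_model is never trained in this script, so its predictions are random.
# A minimal, hypothetical training call (assuming tokenized arrays encoder_in,
# decoder_in, and decoder_target prepared elsewhere) would be:
#   lstm_model.fit([encoder_in, decoder_in], decoder_target, batch_size=64, epochs=10)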


# === FUNCTION TO GENERATE QUESTION USING LSTM ===
def generate_question_lstm(text, model, tokenizer, max_len=20):
    """Generate a question with the LSTM model via greedy decoding."""
    input_seq = tokenizer.texts_to_sequences([text])
    input_seq = tf.keras.preprocessing.sequence.pad_sequences(input_seq, maxlen=max_len)

    generated_question = []
    start_token = tokenizer.word_index.get("<start>", 1)
    end_token = tokenizer.word_index.get("<end>", 2)

    next_word = start_token
    while next_word != end_token and len(generated_question) < max_len:
        # The decoder expects a batch of sequences, so wrap the token as shape (1, 1).
        output = model.predict([input_seq, np.array([[next_word]])], verbose=0)
        next_word = int(np.argmax(output[0, -1, :]))
        if next_word != end_token:
            generated_question.append(tokenizer.index_word.get(next_word, ""))

    return " ".join(generated_question)


# === T5 TRANSFORMER MODEL (TENSORFLOW VERSION) ===
t5_model_name = "t5-small"
t5_model = TFT5ForConditionalGeneration.from_pretrained(t5_model_name)
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)


def generate_question_t5(text):
    """Generate a question using the TensorFlow version of the T5 Transformer."""
    input_text = "generate question: " + text
    input_ids = t5_tokenizer.encode(
        input_text, return_tensors="tf"
    )  # Return TensorFlow tensors
    output = t5_model.generate(input_ids, max_length=50)
    return t5_tokenizer.decode(output[0], skip_special_tokens=True)
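
# generate() defaults to greedy decoding; beam search often yields better questions,
# e.g. with the standard transformers options:
#   output = t5_model.generate(input_ids, max_length=50, num_beams=4, early_stopping=True)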


# === BLEU SCORE EVALUATION ===
def evaluate_bleu(reference, candidate):
    """Compute the BLEU score between the reference and generated questions."""
    # Smoothing keeps short sentences from scoring zero when higher-order
    # n-gram matches are absent.
    smoothing = SmoothingFunction().method1
    score = sentence_bleu([reference.split()], candidate.split(), smoothing_function=smoothing)
    print(f"BLEU Score: {score:.4f}")
    return score
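
# Sanity check: identical sentences score 1.0, e.g.
#   evaluate_bleu("when was it developed", "when was it developed")  # BLEU Score: 1.0000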


# === MAIN EXECUTION ===
if __name__ == "__main__":
    paragraph = "Albert Einstein developed the theory of relativity in 1905."

    # Preprocessing
    print("\n🛠️ Preprocessing text...")
    entities = preprocess_text(paragraph)

    # Generate a question using the LSTM
    print("\n🔵 Generating Question using LSTM (Dummy Model)...")
    # SimpleNamespace stands in for a trained tf.keras Tokenizer; the ids and
    # words below are placeholders, so the output is not meaningful.
    dummy_tokenizer = SimpleNamespace(
        texts_to_sequences=lambda x: [[1, 2, 3, 4]],
        word_index={"<start>": 1, "<end>": 2},
        index_word={1: "what", 2: "who", 3: "in", 4: "year"},
    )
    question_lstm = generate_question_lstm(paragraph, lstm_model, dummy_tokenizer)
    print(f"LSTM Generated Question: {question_lstm}")

    # Generate a question using T5
    print("\n🟢 Generating Question using T5 Transformer...")
    question_t5 = generate_question_t5(paragraph)
    print(f"T5 Generated Question: {question_t5}")

    # BLEU score evaluation
    reference_question = "When was the theory of relativity developed?"
    print("\n📊 Evaluating BLEU Score...")
    evaluate_bleu(reference_question, question_t5)