# TIF_E41211115_lstm-quiz-gen.../combine_nlp_lstm.py

from types import SimpleNamespace

import numpy as np
import tensorflow as tf
import spacy
import nltk
from nltk.translate.bleu_score import sentence_bleu
from tensorflow.keras.layers import LSTM, Embedding, Dense, Input
from tensorflow.keras.models import Model
from transformers import TFT5ForConditionalGeneration, T5Tokenizer
# === LOAD NLP MODEL ===
nlp = spacy.load("en_core_web_sm")
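# Note: the model must be downloaded once beforehand:
#   python -m spacy download en_core_web_sm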
# === PREPROCESSING FUNCTION ===
def preprocess_text(text):
    """Run the spaCy pipeline (NER and dependency parsing) and return the detected entities."""
    doc = nlp(text)
    entities = {ent.text: ent.label_ for ent in doc.ents}
    # Print the Named Entity Recognition results
    print("\nNamed Entities Detected:")
    for ent, label in entities.items():
        print(f"{ent}: {label}")
    return entities
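# Sketch (not in the original): the same spaCy pipeline also produces a
# dependency parse, which could be extracted alongside the entities,
# e.g. to pick answer candidates:
#
# def extract_dependencies(text):
#     doc = nlp(text)
#     # Each tuple: (token, dependency label, syntactic head)
#     return [(token.text, token.dep_, token.head.text) for token in doc]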
# === LSTM MODEL (SEQUENCE-TO-SEQUENCE) ===
embedding_dim = 128
lstm_units = 256
vocab_size = 5000  # Adjust to match the dataset
# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(
    decoder_embedding, initial_state=[state_h, state_c]
)
decoder_dense = Dense(vocab_size, activation="softmax")
output = decoder_dense(decoder_outputs)
# Model
lstm_model = Model([encoder_inputs, decoder_inputs], output)
lstm_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
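# Training sketch (assumption, not part of the original pipeline): the model is
# compiled above but never trained. With tokenized (context, question) pairs it
# would be fit roughly like this; the random arrays below only illustrate the
# expected input/target shapes:
#
# encoder_input_data = np.random.randint(1, vocab_size, size=(64, 30))
# decoder_input_data = np.random.randint(1, vocab_size, size=(64, 20))
# decoder_target_data = np.expand_dims(decoder_input_data, -1)  # in practice, shifted by one step
# lstm_model.fit(
#     [encoder_input_data, decoder_input_data],
#     decoder_target_data,
#     batch_size=16,
#     epochs=1,
# )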
# === FUNCTION TO GENERATE QUESTION USING LSTM ===
def generate_question_lstm(text, model, tokenizer, max_len=20):
    """Generate a question using the LSTM model (greedy decoding)."""
    input_seq = tokenizer.texts_to_sequences([text])
    input_seq = tf.keras.preprocessing.sequence.pad_sequences(input_seq, maxlen=max_len)
    generated_question = []
    start_token = tokenizer.word_index.get("<start>", 1)
    end_token = tokenizer.word_index.get("<end>", 2)
    next_word = start_token
    while next_word != end_token and len(generated_question) < max_len:
        # The decoder input must be a 2-D (batch, timesteps) array
        output = model.predict([input_seq, np.array([[next_word]])])
        next_word = int(np.argmax(output[0, -1, :]))
        generated_question.append(tokenizer.index_word.get(next_word, ""))
    return " ".join(generated_question)
# === T5 TRANSFORMER MODEL (TENSORFLOW VERSION) ===
t5_model_name = "t5-small"
t5_model = TFT5ForConditionalGeneration.from_pretrained(t5_model_name)
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
def generate_question_t5(text):
    """Generate a question using the TensorFlow version of the T5 Transformer."""
    input_text = "generate question: " + text
    input_ids = t5_tokenizer.encode(
        input_text, return_tensors="tf"
    )  # Return TensorFlow tensors
    output = t5_model.generate(input_ids, max_length=50)
    return t5_tokenizer.decode(output[0], skip_special_tokens=True)
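# Optional (assumption): generation quality can usually be tuned with standard
# Hugging Face decoding arguments, e.g. beam search instead of greedy decoding:
#
# output = t5_model.generate(input_ids, max_length=50, num_beams=4, early_stopping=True)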
# === BLEU SCORE EVALUATION ===
def evaluate_bleu(reference, candidate):
    """Compute the BLEU score between the reference question and the generated one."""
    score = sentence_bleu([reference.split()], candidate.split())
    print(f"BLEU Score: {score:.4f}")
    return score
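# Note (assumption): for short single sentences the default 4-gram BLEU is often
# zero; NLTK's smoothing can be applied if that becomes an issue:
#
# from nltk.translate.bleu_score import SmoothingFunction
# score = sentence_bleu([reference.split()], candidate.split(),
#                       smoothing_function=SmoothingFunction().method1)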
# === MAIN EXECUTION ===
if __name__ == "__main__":
    paragraph = "Albert Einstein mengembangkan teori relativitas pada tahun 1905."

    # Preprocessing
    print("\n🛠️ Preprocessing text...")
    entities = preprocess_text(paragraph)

    # Generate a question using the LSTM
    print("\n🔵 Generating Question using LSTM (Dummy Model)...")
    # Minimal stand-in for a real Keras Tokenizer, which exposes the same
    # attributes (texts_to_sequences, word_index, index_word)
    dummy_tokenizer = SimpleNamespace(
        texts_to_sequences=lambda x: [[1, 2, 3, 4]],
        word_index={"apa": 1, "siapa": 2, "di": 3, "tahun": 4},
        index_word={1: "apa", 2: "siapa", 3: "di", 4: "tahun"},
    )
    question_lstm = generate_question_lstm(paragraph, lstm_model, dummy_tokenizer)
    print(f"LSTM Generated Question: {question_lstm}")

    # Generate a question using T5
    print("\n🟢 Generating Question using T5 Transformer...")
    question_t5 = generate_question_t5(paragraph)
    print(f"T5 Generated Question: {question_t5}")

    # Evaluate the BLEU score
    reference_question = "Kapan teori relativitas dikembangkan?"
    print("\n📊 Evaluating BLEU Score...")
    evaluate_bleu(reference_question, question_t5)