from types import SimpleNamespace

import numpy as np
import spacy
import tensorflow as tf
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from tensorflow.keras.layers import LSTM, Embedding, Dense, Input
from tensorflow.keras.models import Model
from transformers import TFT5ForConditionalGeneration, T5Tokenizer
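
# Requirements (assumed): pip install numpy tensorflow spacy nltk transformers sentencepiece
# (the slow T5Tokenizer relies on the sentencepiece package)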


# === LOAD NLP MODEL ===
nlp = spacy.load("en_core_web_sm")
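# If the model is missing, install it first: python -m spacy download en_core_web_sm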


# === PREPROCESSING FUNCTION ===
def preprocess_text(text):
    """Run Named Entity Recognition and dependency parsing on the input text."""
    doc = nlp(text)
    entities = {ent.text: ent.label_ for ent in doc.ents}

    # Print the Named Entity Recognition results
    print("\nNamed Entities Detected:")
    for ent, label in entities.items():
        print(f"{ent}: {label}")

    return entities


# === LSTM MODEL (SEQUENCE-TO-SEQUENCE) ===
embedding_dim = 128
lstm_units = 256
vocab_size = 5000  # Adjust to the dataset

# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(
    decoder_embedding, initial_state=[state_h, state_c]
)
decoder_dense = Dense(vocab_size, activation="softmax")
output = decoder_dense(decoder_outputs)

# Model
lstm_model = Model([encoder_inputs, decoder_inputs], output)
lstm_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
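
# NOTE: lstm_model is never trained in this script, so its predictions are random.
# A minimal, hypothetical training call (assuming tokenized arrays encoder_in,
# decoder_in, and decoder_target prepared elsewhere) would be:
#   lstm_model.fit([encoder_in, decoder_in], decoder_target, batch_size=64, epochs=10)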


# === FUNCTION TO GENERATE QUESTION USING LSTM ===
def generate_question_lstm(text, model, tokenizer, max_len=20):
    """Generate a question with the LSTM model via greedy decoding."""
    input_seq = tokenizer.texts_to_sequences([text])
    input_seq = tf.keras.preprocessing.sequence.pad_sequences(input_seq, maxlen=max_len)

    generated_question = []
    start_token = tokenizer.word_index.get("<start>", 1)
    end_token = tokenizer.word_index.get("<end>", 2)

    next_word = start_token
    while next_word != end_token and len(generated_question) < max_len:
        # The decoder expects a batch of sequences, so wrap the token as shape (1, 1).
        output = model.predict([input_seq, np.array([[next_word]])], verbose=0)
        next_word = int(np.argmax(output[0, -1, :]))
        if next_word != end_token:
            generated_question.append(tokenizer.index_word.get(next_word, ""))

    return " ".join(generated_question)


# === T5 TRANSFORMER MODEL (TENSORFLOW VERSION) ===
t5_model_name = "t5-small"
t5_model = TFT5ForConditionalGeneration.from_pretrained(t5_model_name)
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)


def generate_question_t5(text):
    """Generate a question using the TensorFlow version of the T5 Transformer."""
    input_text = "generate question: " + text
    input_ids = t5_tokenizer.encode(
        input_text, return_tensors="tf"
    )  # Return TensorFlow tensors
    output = t5_model.generate(input_ids, max_length=50)
    return t5_tokenizer.decode(output[0], skip_special_tokens=True)
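
# generate() defaults to greedy decoding; beam search often yields better questions,
# e.g. with the standard transformers options:
#   output = t5_model.generate(input_ids, max_length=50, num_beams=4, early_stopping=True)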


# === BLEU SCORE EVALUATION ===
def evaluate_bleu(reference, candidate):
    """Compute the BLEU score between the reference and generated questions."""
    # Smoothing keeps short sentences from scoring zero when higher-order
    # n-gram matches are absent.
    smoothing = SmoothingFunction().method1
    score = sentence_bleu([reference.split()], candidate.split(), smoothing_function=smoothing)
    print(f"BLEU Score: {score:.4f}")
    return score
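
# Sanity check: identical sentences score 1.0, e.g.
#   evaluate_bleu("when was it developed", "when was it developed")  # BLEU Score: 1.0000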


# === MAIN EXECUTION ===
if __name__ == "__main__":
    paragraph = "Albert Einstein developed the theory of relativity in 1905."

    # Preprocessing
    print("\n🛠️ Preprocessing text...")
    entities = preprocess_text(paragraph)

    # Generate a question using the LSTM
    print("\n🔵 Generating Question using LSTM (Dummy Model)...")
    # SimpleNamespace stands in for a trained tf.keras Tokenizer; the ids and
    # words below are placeholders, so the output is not meaningful.
    dummy_tokenizer = SimpleNamespace(
        texts_to_sequences=lambda x: [[1, 2, 3, 4]],
        word_index={"<start>": 1, "<end>": 2},
        index_word={1: "what", 2: "who", 3: "in", 4: "year"},
    )
    question_lstm = generate_question_lstm(paragraph, lstm_model, dummy_tokenizer)
    print(f"LSTM Generated Question: {question_lstm}")

    # Generate a question using T5
    print("\n🟢 Generating Question using T5 Transformer...")
    question_t5 = generate_question_t5(paragraph)
    print(f"T5 Generated Question: {question_t5}")

    # BLEU score evaluation
    reference_question = "When was the theory of relativity developed?"
    print("\n📊 Evaluating BLEU Score...")
    evaluate_bleu(reference_question, question_t5)