"""Question-generation demo: spaCy NER, an LSTM seq2seq model, T5, and BLEU.

Importing this module loads the spaCy pipeline, builds and compiles the
(untrained) LSTM encoder-decoder, and loads the pretrained ``t5-small``
weights and tokenizer. Running it as a script executes the demo in ``main``.
"""

from types import SimpleNamespace

import numpy as np
import tensorflow as tf
import spacy
import nltk
from nltk.translate.bleu_score import sentence_bleu
from tensorflow.keras.layers import LSTM, Embedding, Dense, Input
from tensorflow.keras.models import Model
from transformers import TFT5ForConditionalGeneration, T5Tokenizer

# === LOAD NLP MODEL ===
nlp = spacy.load("en_core_web_sm")


# === PREPROCESSING FUNCTION ===
def preprocess_text(text):
    """Run the spaCy pipeline on *text* and report its named entities.

    Args:
        text: Raw input string.

    Returns:
        A dict mapping each entity's surface text to its spaCy label. If the
        same surface text occurs more than once, the last label wins.
    """
    doc = nlp(text)
    entities = {ent.text: ent.label_ for ent in doc.ents}

    # Print the Named Entity Recognition results.
    print("\nNamed Entities Detected:")
    for ent, label in entities.items():
        print(f"{ent}: {label}")

    return entities


# === LSTM MODEL (SEQUENCE-TO-SEQUENCE) ===
embedding_dim = 128
lstm_units = 256
vocab_size = 5000  # Adjust to the dataset.

# Encoder: only the final hidden/cell states are passed on to the decoder.
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(
    encoder_inputs
)
encoder_lstm = LSTM(lstm_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

# Decoder: initialised from the encoder's final states.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(
    decoder_inputs
)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(
    decoder_embedding, initial_state=[state_h, state_c]
)
decoder_dense = Dense(vocab_size, activation="softmax")
output = decoder_dense(decoder_outputs)

# Model
lstm_model = Model([encoder_inputs, decoder_inputs], output)
lstm_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")


# === FUNCTION TO GENERATE QUESTION USING LSTM ===
def generate_question_lstm(
    text,
    model,
    tokenizer,
    max_len=20,
    *,
    start_word="<start>",
    end_word="<end>",
):
    """Greedily decode a question for *text* with the seq2seq LSTM.

    Args:
        text: Source passage.
        model: Two-input model (encoder ids, decoder ids) returning per-step
            vocabulary probabilities, such as ``lstm_model``.
        tokenizer: Object exposing ``texts_to_sequences``, ``word_index`` and
            ``index_word`` attributes.
        max_len: Encoder padding length and maximum number of decoding steps.
        start_word: Vocabulary entry that starts decoding; id 1 is used when
            it is absent from ``tokenizer.word_index``.
        end_word: Vocabulary entry that stops decoding; id 2 is used when it
            is absent from ``tokenizer.word_index``.

    Returns:
        The generated words joined by single spaces. The end token and ids
        missing from ``tokenizer.index_word`` are omitted.
    """
    input_seq = tokenizer.texts_to_sequences([text])
    input_seq = tf.keras.preprocessing.sequence.pad_sequences(
        input_seq, maxlen=max_len
    )

    # NOTE(review): the original looked up the empty string for both
    # sentinels, presumably markup lost in extraction -- confirm the real
    # sentinel words against the training vocabulary.
    start_token = tokenizer.word_index.get(start_word, 1)
    end_token = tokenizer.word_index.get(end_word, 2)

    # The decoder state is rebuilt from the encoder on every predict() call,
    # so the whole prefix decoded so far must be fed back each step, as a
    # (batch=1, time) array.
    decoded_ids = [start_token]
    generated_question = []
    for _ in range(max_len):
        probabilities = model.predict(
            [input_seq, np.array([decoded_ids])], verbose=0
        )
        next_word = int(np.argmax(probabilities[0, -1, :]))
        if next_word == end_token:
            break  # Stop before emitting the end-of-sequence marker.
        decoded_ids.append(next_word)
        word = tokenizer.index_word.get(next_word, "")
        if word:
            generated_question.append(word)

    return " ".join(generated_question)


# === T5 TRANSFORMER MODEL (TENSORFLOW VERSION) ===
t5_model_name = "t5-small"
t5_model = TFT5ForConditionalGeneration.from_pretrained(t5_model_name)
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)


def generate_question_t5(text):
    """Generate a question for *text* with the TensorFlow T5 model.

    The input is prefixed with ``"generate question: "`` and decoded with a
    maximum length of 50 tokens.

    Args:
        text: Source passage.

    Returns:
        The decoded output string with special tokens removed.
    """
    # NOTE(review): verify that the loaded checkpoint was fine-tuned for the
    # "generate question:" prefix; otherwise the output may not be a question.
    input_text = "generate question: " + text
    input_ids = t5_tokenizer.encode(input_text, return_tensors="tf")

    output = t5_model.generate(input_ids, max_length=50)
    return t5_tokenizer.decode(output[0], skip_special_tokens=True)


# === BLEU SCORE EVALUATION ===
def evaluate_bleu(reference, candidate):
    """Compute and print the sentence-level BLEU score.

    Both strings are tokenised by whitespace splitting.

    Args:
        reference: The ground-truth question.
        candidate: The generated question.

    Returns:
        The BLEU score as returned by ``sentence_bleu``.
    """
    score = sentence_bleu([reference.split()], candidate.split())
    print(f"BLEU Score: {score:.4f}")
    return score


def _build_dummy_tokenizer():
    """Return a stand-in tokenizer exposing the attributes the LSTM path uses."""
    index_word = {1: "apa", 2: "siapa", 3: "di", 4: "tahun"}
    return SimpleNamespace(
        texts_to_sequences=lambda texts: [[1, 2, 3, 4]],
        index_word=index_word,
        word_index={word: index for index, word in index_word.items()},
    )


def main():
    """Run the demo: NER, LSTM generation, T5 generation, BLEU evaluation."""
    paragraph = "Albert Einstein mengembangkan teori relativitas pada tahun 1905."

    # Preprocessing
    print("\n🛠️ Preprocessing text...")
    entities = preprocess_text(paragraph)

    # Generate a question with the LSTM. The tokenizer must expose its members
    # as attributes (a plain dict would fail on attribute access).
    print("\n🔵 Generating Question using LSTM (Dummy Model)...")
    dummy_tokenizer = _build_dummy_tokenizer()
    question_lstm = generate_question_lstm(paragraph, lstm_model, dummy_tokenizer)
    print(f"LSTM Generated Question: {question_lstm}")

    # Generate a question with T5.
    print("\n🟢 Generating Question using T5 Transformer...")
    question_t5 = generate_question_t5(paragraph)
    print(f"T5 Generated Question: {question_t5}")

    # Evaluate the BLEU score.
    reference_question = "Kapan teori relativitas dikembangkan?"
    print("\n📊 Evaluating BLEU Score...")
    evaluate_bleu(reference_question, question_t5)


# === MAIN EXECUTION ===
if __name__ == "__main__":
    main()