import pickle
import random
import re
import string

import nltk
import numpy as np
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Ensure NLTK resources are available.
# NOTE(review): recent NLTK releases appear to look up "punkt_tab" instead of
# "punkt" for word_tokenize -- confirm which resource the installed version
# needs.
nltk.download("punkt")
nltk.download("stopwords")


class QuestionGenerator:
    """Generate a question, an answer and a question type from a paragraph.

    Wraps a trained Keras model and its tokenizer together with the text
    preprocessing (lowercasing, punctuation removal, slang normalization,
    Indonesian stopword removal) applied before prediction.
    """

    # Translation table that deletes every ASCII punctuation character.
    _PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

    def __init__(
        self,
        model_path="lstm_multi_output_model.keras",
        tokenizer_path="tokenizer.pkl",
        max_sequence_length=100,
    ):
        """Load the trained model and tokenizer and set up preprocessing data.

        Args:
            model_path: Path of the saved Keras model.
            tokenizer_path: Path of the pickled tokenizer.
            max_sequence_length: Length the input sequence is padded or
                truncated to before prediction (defaults to 100, the value
                previously hard-coded).
        """
        # Load trained model
        self.model = tf.keras.models.load_model(model_path)

        # Load tokenizer.
        # NOTE: unpickling can execute arbitrary code; only load tokenizer
        # files that come from a trusted source.
        with open(tokenizer_path, "rb") as handle:
            self.tokenizer = pickle.load(handle)

        self.max_sequence_length = max_sequence_length

        # Maps the arg-max index of the question-type output to its label.
        self.question_type_dict = {
            0: "fill_in_the_blank",
            1: "true_false",
            2: "multiple_choice",
        }

        # Load Indonesian stopwords
        self.stop_words = set(stopwords.words("indonesian"))

        # Custom word normalization dictionary (informal form -> replacement)
        self.normalization_dict = {
            "yg": "yang",
            "gokil": "kocak",
            "kalo": "kalau",
            "gue": "saya",
            "elo": "kamu",
            "nih": "ini",
            "trs": "terus",
            "tdk": "tidak",
            "gmna": "bagaimana",
            "tp": "tapi",
            "jd": "jadi",
            "aja": "saja",
            "krn": "karena",
            "blm": "belum",
            "dgn": "dengan",
            "skrg": "sekarang",
            "msh": "masih",
            "lg": "lagi",
            "sy": "saya",
            "sm": "sama",
            "bgt": "banget",
            "dr": "dari",
            "kpn": "kapan",
            "hrs": "harus",
            "cm": "cuma",
            "sbnrnya": "sebenarnya",
        }

    def preprocess_text(self, text):
        """Return *text* cleaned for the tokenizer.

        Steps, in order: lowercase, strip punctuation, collapse whitespace,
        tokenize, replace tokens found in ``normalization_dict``, and drop
        tokens found in ``stop_words``. The surviving tokens are joined with
        single spaces.
        """
        text = text.lower()
        text = text.translate(self._PUNCTUATION_TABLE)  # Remove punctuation
        text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces

        tokens = word_tokenize(text)
        # Normalize first so that a replacement which is itself a stopword
        # is removed by the filter below.
        tokens = [self.normalization_dict.get(word, word) for word in tokens]
        tokens = [word for word in tokens if word not in self.stop_words]
        return " ".join(tokens)

    def sequence_to_text(self, sequence):
        """Convert a sequence of token indices back into readable text.

        Index 0 (padding) is skipped. Indices missing from the tokenizer's
        ``index_word`` mapping are skipped as well, so they no longer leave
        empty tokens (stray or doubled spaces) in the result.
        """
        index_word = self.tokenizer.index_word
        words = (index_word.get(idx) for idx in sequence if idx != 0)
        return " ".join(word for word in words if word)

    @staticmethod
    def _build_options(processed_paragraph, answer, num_distractors=3):
        """Return shuffled answer options: *answer* plus unique distractors.

        Distractors are words of the preprocessed paragraph, taken in random
        order, excluding any word equal to the answer (case-insensitive).
        """
        words = processed_paragraph.split()
        random.shuffle(words)

        answer_key = answer.lower()
        # dict.fromkeys de-duplicates while keeping the shuffled order, so a
        # word repeated in the paragraph cannot occupy several option slots.
        distractors = [
            word for word in dict.fromkeys(words) if word.lower() != answer_key
        ]

        options = [answer] + distractors[:num_distractors]
        random.shuffle(options)
        return options

    def generate_qa_from_paragraph(self, paragraph):
        """Generate a question, answer and question type for *paragraph*.

        Returns:
            A dict with the keys ``generated_question``, ``generated_answer``,
            ``question_type`` and ``options``. ``options`` is a shuffled list
            containing the answer and up to three distractor words when the
            predicted type is ``"multiple_choice"``, otherwise ``None``.
        """
        # Preprocess the input paragraph
        processed_paragraph = self.preprocess_text(paragraph)

        # Convert text to a padded index sequence
        input_seq = self.tokenizer.texts_to_sequences([processed_paragraph])
        input_seq = pad_sequences(
            input_seq, maxlen=self.max_sequence_length, padding="post"
        )

        # The same padded sequence is passed for both model inputs.
        # NOTE(review): presumably the model has two inputs that both expect
        # the paragraph -- confirm against the model definition.
        pred_question, pred_answer, pred_qtype = self.model.predict(
            [input_seq, input_seq]
        )

        # Decode predictions: most likely token per position, then to text.
        generated_question = self.sequence_to_text(
            np.argmax(pred_question[0], axis=-1)
        )
        generated_answer = self.sequence_to_text(
            np.argmax(pred_answer[0], axis=-1)
        )
        question_type_index = int(np.argmax(pred_qtype[0]))
        generated_qtype = self.question_type_dict[question_type_index]

        # Only multiple-choice questions carry answer options.
        options = None
        if generated_qtype == "multiple_choice":
            options = self._build_options(processed_paragraph, generated_answer)

        return {
            "generated_question": generated_question,
            "generated_answer": generated_answer,
            "question_type": generated_qtype,
            "options": options,
        }


# Initialize the question generator
qg = QuestionGenerator()

# Example input paragraph
sample_paragraph = (
    "Samudra Pasifik adalah yang terbesar dan terdalam di antara divisi "
    "samudra di Bumi. Samudra ini membentang dari Samudra Arktik di utara "
    "hingga Samudra Selatan di selatan dan berbatasan dengan Asia dan "
    "Australia di barat serta Amerika di timur."
)
# Generate question, answer, and type for the sample paragraph.
generated_result = qg.generate_qa_from_paragraph(sample_paragraph)

# Print output
print("Generated Question:", generated_result["generated_question"])
print("Generated Answer:", generated_result["generated_answer"])
print("Question Type:", generated_result["question_type"])
# "options" is None unless the predicted type is multiple choice.
if generated_result["options"]:
    print("Options:", generated_result["options"])