"""Generate a quiz question from an Indonesian paragraph using a trained LSTM model."""

import json
import pickle

import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences


def preprocess_text(text):
    """Lower-case the text (must match the preprocessing used at training time)."""
    return text.lower()


def generate_question(paragraph, tokenizer, model, max_length):
    """Generate a question string for *paragraph* with the trained model.

    Args:
        paragraph: Input text to generate a question from.
        tokenizer: Fitted Keras ``Tokenizer`` (same one used at training time).
        model: Trained sequence model whose output is a per-timestep
            distribution over the tokenizer vocabulary.
        max_length: Padding length; must equal the value used in training.

    Returns:
        The predicted question, always terminated with ``"?"``.
    """
    # 1) Preprocess exactly as during training.
    paragraph = preprocess_text(paragraph)

    # 2) Tokenize and pad to the fixed model input length.
    seq = tokenizer.texts_to_sequences([paragraph])
    padded = pad_sequences(seq, maxlen=max_length, padding="post")

    # 3) Per-timestep probability distributions over the vocabulary.
    prediction = model.predict(padded)

    # 4) Most likely token index at each output position (first batch item).
    predicted_indices = np.argmax(prediction, axis=-1)[0]

    # 5) Map indices back to words, stopping at the first padding token.
    predicted_words = []
    for idx in predicted_indices:
        if idx == 0:  # Assuming 0 is padding or unknown token.
            break
        word = tokenizer.index_word.get(idx, "")
        if word:  # Fix: skip unmapped indices instead of joining empty tokens.
            predicted_words.append(word)

    # 6) Assemble the final question string.
    predicted_question = " ".join(predicted_words)
    if not predicted_question.endswith("?"):
        predicted_question += "?"
    return predicted_question


def determine_question_type(paragraph):
    """Classify the question type with simple Indonesian keyword rules.

    Returns one of ``"true_false"``, ``"multiple_choice"``,
    ``"fill_in_the_blank"``.
    """
    text = paragraph.lower()  # Hoisted: original lower-cased on every test.
    if "benar atau salah" in text or "adalah" in text:
        return "true_false"
    if "berapa" in text or "mana" in text:
        return "multiple_choice"
    return "fill_in_the_blank"


def extract_possible_answer(paragraph):
    """Return a naive answer candidate: the first one or two words.

    Returns an empty string for blank input (the original raised
    ``IndexError`` on an empty or whitespace-only paragraph).
    """
    words = paragraph.split()
    if not words:  # Bug fix: guard against empty input.
        return ""
    if len(words) > 2:
        return words[0] + " " + words[1]  # Return first two words as a basic approach
    return words[0]


def generate_options(question_type, answer):
    """Generate dummy answer options appropriate for *question_type*."""
    if question_type == "multiple_choice":
        return [answer, "Option B", "Option C", "Option D"]
    if question_type == "true_false":
        return ["True", "False"]
    return ["-"]  # Placeholder for fill-in-the-blank


def main():
    """Load the trained artifacts, generate one sample question, print JSON."""
    # Load the trained model and tokenizer.
    model = load_model("lstm_question_generator.keras")
    # NOTE(review): pickle.load is unsafe on untrusted files — only load
    # tokenizer pickles you produced yourself.
    with open("tokenizer.pkl", "rb") as f:
        tokenizer = pickle.load(f)

    max_length = 50  # Ensure this is the same as in training.

    # Sample paragraph input.
    paragraph_input = "Albert Einstein mengembangkan teori relativitas dan membantu mengembangkan fisika kuantum"

    # Generate question, then derive type/answer/options with the heuristics.
    generated_question = generate_question(paragraph_input, tokenizer, model, max_length)
    question_type = determine_question_type(paragraph_input)
    answer = extract_possible_answer(paragraph_input)
    options = generate_options(question_type, answer)

    # Construct JSON output.
    output = {
        "paragraph": paragraph_input,
        "question_type": question_type,
        "question": generated_question,
        "answer": answer,
        "options": "|".join(options),  # Match dataset format
    }

    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    print(json.dumps(output, indent=4, ensure_ascii=False))


if __name__ == "__main__":
    main()