import json
import pickle

import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
def preprocess_text(text):
    """Lowercase *text* — must mirror the preprocessing used at training time."""
    return text.lower()
def generate_question(paragraph, tokenizer, model, max_length):
    """Generate a question from *paragraph* using the trained LSTM model.

    Parameters
    ----------
    paragraph : str
        Input text to generate a question about.
    tokenizer : keras Tokenizer
        Tokenizer fitted during training (vocabulary must match the model).
    max_length : int
        Padding length used during training.

    Returns
    -------
    str
        The generated question, always ending with "?".
    """
    # 1) Preprocess paragraph (must mirror the training pipeline).
    paragraph = preprocess_text(paragraph)

    # 2) Tokenize and pad to the training sequence length.
    seq = tokenizer.texts_to_sequences([paragraph])
    padded = pad_sequences(seq, maxlen=max_length, padding="post")

    # 3) Get model predictions.
    # NOTE(review): assumes output shape (1, timesteps, vocab) — confirm
    # against the training script.
    prediction = model.predict(padded)

    # 4) Most likely word index at each timestep.
    predicted_indices = np.argmax(prediction, axis=-1)[0]

    # 5) Convert indices to words, stopping at padding (index 0 is reserved
    #    by Keras tokenizers for padding/unknown).
    predicted_words = []
    for idx in predicted_indices:
        if idx == 0:
            break
        word = tokenizer.index_word.get(idx, "")
        # BUGFIX: skip indices missing from the vocabulary instead of
        # appending "" — empty strings produced doubled spaces in the
        # joined question text.
        if word:
            predicted_words.append(word)

    # 6) Join into a sentence and ensure it reads as a question.
    predicted_question = " ".join(predicted_words)
    if not predicted_question.endswith("?"):
        predicted_question += "?"

    return predicted_question
def determine_question_type(paragraph):
    """Classify the question type for *paragraph* with simple keyword rules.

    Keywords are Indonesian: "benar atau salah" = "true or false",
    "adalah" = "is", "berapa" = "how many", "mana" = "which".

    Returns
    -------
    str
        One of "true_false", "multiple_choice", or "fill_in_the_blank".
    """
    # Hoisted: lowercase once instead of recomputing it per keyword test.
    text = paragraph.lower()
    if "benar atau salah" in text or "adalah" in text:
        return "true_false"
    if "berapa" in text or "mana" in text:
        return "multiple_choice"
    return "fill_in_the_blank"
def extract_possible_answer(paragraph):
    """Heuristically extract a candidate answer from *paragraph*.

    Returns
    -------
    str
        The first two words when the paragraph has more than two words,
        the first word otherwise, and "" for empty/whitespace-only input.
    """
    words = paragraph.split()
    # BUGFIX: empty or whitespace-only input previously raised IndexError
    # at words[0]; return an empty answer instead.
    if not words:
        return ""
    if len(words) > 2:
        # First two words as a crude proper-noun/keyword approximation.
        return words[0] + " " + words[1]
    return words[0]
def generate_options(question_type, answer):
    """Build a placeholder option list for the given question type.

    Multiple-choice questions get the real answer plus three dummy
    distractors; true/false questions get the two fixed choices; any
    other type gets the dataset's fill-in-the-blank placeholder.
    """
    if question_type == "true_false":
        return ["True", "False"]
    if question_type == "multiple_choice":
        return [answer, "Option B", "Option C", "Option D"]
    # Fill-in-the-blank has no real options; keep the dataset placeholder.
    return ["-"]
# --- Script entry: load artifacts, generate one sample question, print JSON ---

# Load the trained model saved by the training script.
model = load_model("lstm_question_generator.keras")

# NOTE(review): pickle.load on an untrusted file can execute arbitrary code —
# confirm tokenizer.pkl always comes from this project's own training run.
with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

max_length = 50  # Must match the padding length used during training.

# Sample paragraph input (Indonesian).
paragraph_input = "Albert Einstein mengembangkan teori relativitas dan membantu mengembangkan fisika kuantum"

# Generate the question text from the paragraph.
generated_question = generate_question(paragraph_input, tokenizer, model, max_length)

# Determine question type, candidate answer, and the option list.
question_type = determine_question_type(paragraph_input)
answer = extract_possible_answer(paragraph_input)
options = generate_options(question_type, answer)

# Construct the JSON output in the same schema as the training dataset.
output = {
    "paragraph": paragraph_input,
    "question_type": question_type,
    "question": generated_question,
    "answer": answer,
    "options": "|".join(options)  # Options are "|"-joined to match the dataset format.
}

# ensure_ascii=False keeps non-ASCII (Indonesian) characters readable.
print(json.dumps(output, indent=4, ensure_ascii=False))