import json
import pickle

import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
def preprocess_text(text):
    """Lowercase *text* — must mirror the preprocessing used at training time."""
    return text.lower()
def generate_question(paragraph, tokenizer, model, max_length):
    """Generate a question from *paragraph* using the trained LSTM model.

    Parameters
    ----------
    paragraph : str
        Input text to generate a question about.
    tokenizer : keras Tokenizer
        Tokenizer fitted during training (vocabulary must match the model).
    max_length : int
        Padding length used during training.

    Returns
    -------
    str
        The generated question, always ending with "?".
    """
    # 1) Preprocess paragraph (must mirror the training pipeline).
    paragraph = preprocess_text(paragraph)

    # 2) Tokenize and pad to the training sequence length.
    seq = tokenizer.texts_to_sequences([paragraph])
    padded = pad_sequences(seq, maxlen=max_length, padding="post")

    # 3) Get model predictions.
    # NOTE(review): assumes output shape (1, timesteps, vocab) — confirm
    # against the training script.
    prediction = model.predict(padded)

    # 4) Most likely word index at each timestep.
    predicted_indices = np.argmax(prediction, axis=-1)[0]

    # 5) Convert indices to words, stopping at padding (index 0 is reserved
    #    by Keras tokenizers for padding/unknown).
    predicted_words = []
    for idx in predicted_indices:
        if idx == 0:
            break
        word = tokenizer.index_word.get(idx, "")
        # BUGFIX: skip indices missing from the vocabulary instead of
        # appending "" — empty strings produced doubled spaces in the
        # joined question text.
        if word:
            predicted_words.append(word)

    # 6) Join into a sentence and ensure it reads as a question.
    predicted_question = " ".join(predicted_words)
    if not predicted_question.endswith("?"):
        predicted_question += "?"

    return predicted_question
def determine_question_type(paragraph):
    """Classify the question type for *paragraph* with simple keyword rules.

    Keywords are Indonesian: "benar atau salah" = "true or false",
    "adalah" = "is", "berapa" = "how many", "mana" = "which".

    Returns
    -------
    str
        One of "true_false", "multiple_choice", or "fill_in_the_blank".
    """
    # Hoisted: lowercase once instead of recomputing it per keyword test.
    text = paragraph.lower()
    if "benar atau salah" in text or "adalah" in text:
        return "true_false"
    if "berapa" in text or "mana" in text:
        return "multiple_choice"
    return "fill_in_the_blank"
def extract_possible_answer(paragraph):
    """Heuristically extract a candidate answer from *paragraph*.

    Returns
    -------
    str
        The first two words when the paragraph has more than two words,
        the first word otherwise, and "" for empty/whitespace-only input.
    """
    words = paragraph.split()
    # BUGFIX: empty or whitespace-only input previously raised IndexError
    # at words[0]; return an empty answer instead.
    if not words:
        return ""
    if len(words) > 2:
        # First two words as a crude proper-noun/keyword approximation.
        return words[0] + " " + words[1]
    return words[0]
def generate_options(question_type, answer):
    """Build a placeholder option list for the given question type.

    Multiple-choice questions get the real answer plus three dummy
    distractors; true/false questions get the two fixed choices; any
    other type gets the dataset's fill-in-the-blank placeholder.
    """
    if question_type == "true_false":
        return ["True", "False"]
    if question_type == "multiple_choice":
        return [answer, "Option B", "Option C", "Option D"]
    # Fill-in-the-blank has no real options; keep the dataset placeholder.
    return ["-"]
# --- Script entry: load artifacts, generate one sample question, print JSON ---

# Load the trained model saved by the training script.
model = load_model("lstm_question_generator.keras")

# NOTE(review): pickle.load on an untrusted file can execute arbitrary code —
# confirm tokenizer.pkl always comes from this project's own training run.
with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

max_length = 50  # Must match the padding length used during training.

# Sample paragraph input (Indonesian).
paragraph_input = "Albert Einstein mengembangkan teori relativitas dan membantu mengembangkan fisika kuantum"

# Generate the question text from the paragraph.
generated_question = generate_question(paragraph_input, tokenizer, model, max_length)

# Determine question type, candidate answer, and the option list.
question_type = determine_question_type(paragraph_input)
answer = extract_possible_answer(paragraph_input)
options = generate_options(question_type, answer)

# Construct the JSON output in the same schema as the training dataset.
output = {
    "paragraph": paragraph_input,
    "question_type": question_type,
    "question": generated_question,
    "answer": answer,
    "options": "|".join(options)  # Options are "|"-joined to match the dataset format.
}

# ensure_ascii=False keeps non-ASCII (Indonesian) characters readable.
print(json.dumps(output, indent=4, ensure_ascii=False))