# File: TIF_E41211115_lstm-quiz-gen.../uji.py
# 164 lines, 5.4 KiB, Python

import numpy as np
import pickle
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
import random
import string
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Make sure every NLTK resource this script relies on has been downloaded
for _resource in ("punkt", "stopwords"):
    nltk.download(_resource)
class QuestionGenerator:
    """Generate a question, an answer and a question type from a paragraph.

    The instance wraps a saved Keras model and a pickled tokenizer, plus the
    text clean-up (lowercasing, punctuation removal, slang normalization and
    Indonesian stopword removal) applied before the text reaches the model.
    """

    def __init__(
        self,
        model_path="lstm_multi_output_model.keras",
        tokenizer_path="tokenizer.pkl",
        max_len=100,
    ):
        """Load the trained model and tokenizer and set up the lookup tables.

        Args:
            model_path: Path of the saved Keras model.
            tokenizer_path: Path of the pickled tokenizer.
            max_len: Length the input sequence is padded to before
                prediction. NOTE(review): presumably must match the length
                used during training -- confirm against the training script.
        """
        self.max_len = max_len

        # Load trained model
        self.model = tf.keras.models.load_model(model_path)

        # Load tokenizer.
        # SECURITY: pickle executes arbitrary code while loading; only point
        # tokenizer_path at files that come from a trusted source.
        with open(tokenizer_path, "rb") as handle:
            self.tokenizer = pickle.load(handle)

        # Maps the arg-max index of the question-type output to its label
        self.question_type_dict = {
            0: "fill_in_the_blank",
            1: "true_false",
            2: "multiple_choice",
        }

        # Indonesian stopwords, kept in a set for O(1) membership tests
        self.stop_words = set(stopwords.words("indonesian"))

        # Custom word normalization dictionary (abbreviation/slang -> word)
        self.normalization_dict = {
            "yg": "yang",
            "gokil": "kocak",
            "kalo": "kalau",
            "gue": "saya",
            "elo": "kamu",
            "nih": "ini",
            "trs": "terus",
            "tdk": "tidak",
            "gmna": "bagaimana",
            "tp": "tapi",
            "jd": "jadi",
            "aja": "saja",
            "krn": "karena",
            "blm": "belum",
            "dgn": "dengan",
            "skrg": "sekarang",
            "msh": "masih",
            "lg": "lagi",
            "sy": "saya",
            "sm": "sama",
            "bgt": "banget",
            "dr": "dari",
            "kpn": "kapan",
            "hrs": "harus",
            "cm": "cuma",
            "sbnrnya": "sebenarnya",
        }

    def preprocess_text(self, text):
        """Return ``text`` cleaned up for the tokenizer.

        Steps, in order: lowercase, strip ASCII punctuation, collapse
        whitespace, tokenize, replace tokens found in
        ``normalization_dict``, and drop tokens found in ``stop_words``.
        The surviving tokens are joined with single spaces.
        """
        text = text.lower()
        # Remove punctuation in a single pass
        text = text.translate(str.maketrans("", "", string.punctuation))
        # Collapse runs of whitespace and trim the ends
        text = re.sub(r"\s+", " ", text).strip()
        tokens = word_tokenize(text)
        # Normalize first so that a normalized word can still be a stopword
        tokens = [self.normalization_dict.get(word, word) for word in tokens]
        tokens = [word for word in tokens if word not in self.stop_words]
        return " ".join(tokens)

    def sequence_to_text(self, sequence):
        """Convert a sequence of token ids back into a space-joined string.

        Id ``0`` is skipped (it is the value produced by padding); ids that
        are missing from the tokenizer's ``index_word`` become ``"<OOV>"``.
        """
        index_word = self.tokenizer.index_word
        return " ".join(
            index_word.get(idx, "<OOV>") for idx in sequence if idx != 0
        )

    def _build_options(self, processed_paragraph, answer, num_distractors=3):
        """Return the shuffled answer options for a multiple-choice question.

        Distractors are distinct words of ``processed_paragraph`` that differ
        (case-insensitively) from ``answer``; at most ``num_distractors`` of
        them are used, so short paragraphs yield fewer options.
        """
        answer_key = answer.lower()
        # dict.fromkeys de-duplicates while keeping first-seen order, so a
        # word repeated in the paragraph cannot occupy several option slots.
        candidates = [
            word
            for word in dict.fromkeys(processed_paragraph.split())
            if word.lower() != answer_key
        ]
        random.shuffle(candidates)
        options = [answer] + candidates[:num_distractors]
        # Shuffle again so the correct answer is not always listed first
        random.shuffle(options)
        return options

    def generate_qa_from_paragraph(self, paragraph):
        """Generate a question, answer and question type for ``paragraph``.

        Returns:
            A dict with the keys ``"generated_question"``,
            ``"generated_answer"``, ``"question_type"`` and ``"options"``.
            ``"options"`` is a shuffled list of choices (the answer plus up
            to three distractor words from the paragraph) for
            multiple-choice questions and ``None`` otherwise.

        Raises:
            KeyError: If the predicted question-type index is not a key of
                ``question_type_dict``.
        """
        # Preprocess the input paragraph
        processed_paragraph = self.preprocess_text(paragraph)

        # Convert text to a fixed-length id sequence (zeros appended)
        input_seq = self.tokenizer.texts_to_sequences([processed_paragraph])
        input_seq = pad_sequences(input_seq, maxlen=self.max_len, padding="post")

        # The same padded sequence is supplied for both model inputs.
        # NOTE(review): presumably encoder and decoder inputs -- confirm.
        pred_question, pred_answer, pred_qtype = self.model.predict(
            [input_seq, input_seq]
        )

        # Decode predictions: arg-max over the last axis gives one token id
        # per position. Assumes the first two outputs are
        # (batch, steps, vocab) -- TODO confirm against the model definition.
        generated_question = self.sequence_to_text(
            np.argmax(pred_question[0], axis=-1)
        )
        generated_answer = self.sequence_to_text(
            np.argmax(pred_answer[0], axis=-1)
        )
        # Cast to a plain int so the dict lookup does not rely on NumPy
        # scalar hashing.
        question_type_index = int(np.argmax(pred_qtype[0]))
        generated_qtype = self.question_type_dict[question_type_index]

        # Only multiple-choice questions carry answer options
        options = None
        if generated_qtype == "multiple_choice":
            options = self._build_options(processed_paragraph, generated_answer)

        return {
            "generated_question": generated_question,
            "generated_answer": generated_answer,
            "question_type": generated_qtype,
            "options": options,
        }
# Build the generator with its default model and tokenizer files
qg = QuestionGenerator()
# Paragraph used as the demo input
sample_paragraph = "Samudra Pasifik adalah yang terbesar dan terdalam di antara divisi samudra di Bumi. Samudra ini membentang dari Samudra Arktik di utara hingga Samudra Selatan di selatan dan berbatasan dengan Asia dan Australia di barat serta Amerika di timur."
# Run the generation on the demo paragraph
generated_result = qg.generate_qa_from_paragraph(sample_paragraph)
# Report the fields that are always present, then the optional choices
for _label, _field in (
    ("Generated Question:", "generated_question"),
    ("Generated Answer:", "generated_answer"),
    ("Question Type:", "question_type"),
):
    print(_label, generated_result[_field])
_choices = generated_result["options"]
if _choices:
    print("Options:", _choices)