164 lines
5.7 KiB
Python
164 lines
5.7 KiB
Python
# import numpy as np
|
|
# import pickle
|
|
# import tensorflow as tf
|
|
# from tensorflow.keras.preprocessing.sequence import pad_sequences
|
|
# import nltk
|
|
# import random
|
|
# import string
|
|
# import re
|
|
# from nltk.tokenize import word_tokenize
|
|
# from nltk.corpus import stopwords
|
|
|
|
# Ensure the NLTK resources used below are available: "punkt" backs
# word_tokenize() and "stopwords" backs the Indonesian stopword list.
# Downloads are no-ops if the corpora are already cached locally.
nltk.download("punkt")
nltk.download("stopwords")
|
|
|
|
|
|
class QuestionGenerator:
    """Generate a question, answer, and question type from an Indonesian paragraph.

    Wraps a trained multi-output LSTM model and its fitted tokenizer, and
    applies the same text preprocessing presumably used at training time
    (lowercasing, punctuation removal, tokenization, slang normalization,
    stopword removal) before running inference.
    """

    def __init__(
        self,
        model_path="lstm_multi_output_model.keras",
        tokenizer_path="tokenizer.pkl",
        max_sequence_length=100,
    ):
        """Initialize the QuestionGenerator by loading the trained model and tokenizer.

        Args:
            model_path: Path to the saved Keras model file.
            tokenizer_path: Path to the pickled Keras tokenizer.
            max_sequence_length: Padding length for encoded paragraphs; must
                match the length the model was trained with (default 100,
                the value previously hard-coded in generate_qa_from_paragraph).
        """
        # Load trained model.
        self.model = tf.keras.models.load_model(model_path)

        # Padding length used when encoding paragraphs for the model.
        self.max_sequence_length = max_sequence_length

        # Load tokenizer.
        # NOTE(review): pickle.load executes arbitrary code if the file is
        # untrusted — only load tokenizer files from trusted sources.
        with open(tokenizer_path, "rb") as handle:
            self.tokenizer = pickle.load(handle)

        # Map the model's predicted class index to a question-type label.
        self.question_type_dict = {
            0: "fill_in_the_blank",
            1: "true_false",
            2: "multiple_choice",
        }

        # Indonesian stopwords (requires NLTK's "stopwords" corpus).
        self.stop_words = set(stopwords.words("indonesian"))

        # Informal / abbreviated Indonesian words mapped to their standard form.
        self.normalization_dict = {
            "yg": "yang",
            "gokil": "kocak",
            "kalo": "kalau",
            "gue": "saya",
            "elo": "kamu",
            "nih": "ini",
            "trs": "terus",
            "tdk": "tidak",
            "gmna": "bagaimana",
            "tp": "tapi",
            "jd": "jadi",
            "aja": "saja",
            "krn": "karena",
            "blm": "belum",
            "dgn": "dengan",
            "skrg": "sekarang",
            "msh": "masih",
            "lg": "lagi",
            "sy": "saya",
            "sm": "sama",
            "bgt": "banget",
            "dr": "dari",
            "kpn": "kapan",
            "hrs": "harus",
            "cm": "cuma",
            "sbnrnya": "sebenarnya",
        }

    def preprocess_text(self, text):
        """Normalize raw text for the model.

        Steps: lowercase, strip punctuation, collapse whitespace, tokenize,
        map informal words to standard forms, and drop Indonesian stopwords.

        Args:
            text: Raw input string.

        Returns:
            The cleaned tokens re-joined into a single space-separated string.
        """
        text = text.lower()
        text = text.translate(
            str.maketrans("", "", string.punctuation)
        )  # Remove punctuation
        text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
        tokens = word_tokenize(text)  # Tokenization
        tokens = [
            self.normalization_dict.get(word, word) for word in tokens
        ]  # Normalize words
        tokens = [
            word for word in tokens if word not in self.stop_words
        ]  # Remove stopwords
        return " ".join(tokens)

    def sequence_to_text(self, sequence):
        """Convert a sequence of token indices back into readable text.

        Index 0 (padding) is skipped; indices absent from the tokenizer's
        vocabulary render as "<OOV>".
        """
        return " ".join(
            self.tokenizer.index_word.get(idx, "<OOV>")
            for idx in sequence
            if idx != 0
        )

    def generate_qa_from_paragraph(self, paragraph):
        """Generate a question, answer, and question type from *paragraph*.

        For multiple-choice questions, up to three distractor options drawn
        from the paragraph are returned alongside the answer.

        Args:
            paragraph: Source paragraph (Indonesian text).

        Returns:
            Dict with keys "generated_question", "generated_answer",
            "question_type", and "options" (a shuffled list for
            multiple-choice questions, otherwise None).
        """
        # Preprocess and encode the paragraph for the model.
        processed_paragraph = self.preprocess_text(paragraph)
        input_seq = self.tokenizer.texts_to_sequences([processed_paragraph])
        input_seq = pad_sequences(
            input_seq, maxlen=self.max_sequence_length, padding="post"
        )

        # Predict question tokens, answer tokens, and question type.
        # NOTE(review): the same sequence is fed to both model inputs —
        # confirm this matches the training architecture.
        pred_question, pred_answer, pred_qtype = self.model.predict(
            [input_seq, input_seq]
        )

        # Decode predictions back into text / a type label.
        generated_question = self.sequence_to_text(np.argmax(pred_question[0], axis=-1))
        generated_answer = self.sequence_to_text(np.argmax(pred_answer[0], axis=-1))
        question_type_index = int(np.argmax(pred_qtype[0]))
        generated_qtype = self.question_type_dict[question_type_index]

        # Build shuffled answer options for multiple-choice questions.
        options = None
        if generated_qtype == "multiple_choice":
            words = processed_paragraph.split()
            random.shuffle(words)
            # Collect case-insensitively unique distractors; previously a
            # word repeated in the paragraph could appear as several
            # identical options.
            answer_lower = generated_answer.lower()
            seen = set()
            distractors = []
            for word in words:
                key = word.lower()
                if key != answer_lower and key not in seen:
                    seen.add(key)
                    distractors.append(word)
            options = [generated_answer] + distractors[:3]
            random.shuffle(options)  # Shuffle options

        # Return the generated data ("options" is only set for multiple choice).
        return {
            "generated_question": generated_question,
            "generated_answer": generated_answer,
            "question_type": generated_qtype,
            "options": options,
        }
|
|
|
|
|
|
# Demo entry point. Guarded so importing this module no longer loads the
# model and runs inference as a side effect (the original ran at import time).
if __name__ == "__main__":
    # Initialize the question generator.
    qg = QuestionGenerator()

    # Example input paragraph.
    sample_paragraph = "Samudra Pasifik adalah yang terbesar dan terdalam di antara divisi samudra di Bumi. Samudra ini membentang dari Samudra Arktik di utara hingga Samudra Selatan di selatan dan berbatasan dengan Asia dan Australia di barat serta Amerika di timur."

    # Generate question, answer, and type.
    generated_result = qg.generate_qa_from_paragraph(sample_paragraph)

    # Print output.
    print("Generated Question:", generated_result["generated_question"])
    print("Generated Answer:", generated_result["generated_answer"])
    print("Question Type:", generated_result["question_type"])
    if generated_result["options"]:
        print("Options:", generated_result["options"])
|