import json
import re

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import tokenizer_from_json


class QuestionPredictionModel:
    def __init__(self, model_path, tokenizer_path):
        """
        Initialize question prediction model with pre-trained model and tokenizers
        """
        # Load model
        self.model = load_model(model_path)

        # Load tokenizers
        with open(tokenizer_path, "r") as f:
            tokenizer_data = json.load(f)

        # Reconstruct tokenizers
        self.word_tokenizer = tokenizer_from_json(tokenizer_data["word_tokenizer"])
        self.ner_tokenizer = tokenizer_from_json(tokenizer_data["ner_tokenizer"])
        self.srl_tokenizer = tokenizer_from_json(tokenizer_data["srl_tokenizer"])
        self.q_type_tokenizer = tokenizer_from_json(tokenizer_data["q_type_tokenizer"])

        # Get max lengths
        self.max_context_len = tokenizer_data["max_context_len"]
        self.max_question_len = tokenizer_data["max_question_len"]
        self.max_token_len = tokenizer_data["max_token_len"]

        # Get vocabulary sizes (+1 for the reserved padding index 0)
        self.vocab_size = len(self.word_tokenizer.word_index) + 1
        self.q_type_vocab_size = len(self.q_type_tokenizer.word_index) + 1

    def preprocess_text(self, text):
        """Basic text preprocessing: lowercase and collapse whitespace."""
        text = text.lower()
        text = re.sub(r"\s+", " ", text).strip()
        return text

    def predict_question(self, context, tokens, ner, srl, q_type):
        """Predict a question from the context and its linguistic features."""
        # Preprocess
        context = self.preprocess_text(context)

        # Convert to sequences
        context_seq = self.word_tokenizer.texts_to_sequences([context])[0]
        token_seq = self.word_tokenizer.texts_to_sequences([" ".join(tokens)])[0]
        ner_seq = self.ner_tokenizer.texts_to_sequences([" ".join(ner)])[0]
        srl_seq = self.srl_tokenizer.texts_to_sequences([" ".join(srl)])[0]

        # Pad sequences
        context_padded = pad_sequences(
            [context_seq], maxlen=self.max_context_len, padding="post"
        )
        token_padded = pad_sequences(
            [token_seq], maxlen=self.max_token_len, padding="post"
        )
        ner_padded = pad_sequences([ner_seq], maxlen=self.max_token_len, padding="post")
        srl_padded = pad_sequences([srl_seq], maxlen=self.max_token_len, padding="post")

        # Q-type one-hot encoding
        q_type_idx = self.q_type_tokenizer.word_index.get(q_type, 0)
        q_type_one_hot = tf.keras.utils.to_categorical(
            [q_type_idx], num_classes=self.q_type_vocab_size
        )

        # Predict
        pred = self.model.predict(
            [context_padded, token_padded, ner_padded, srl_padded, q_type_one_hot]
        )

        # Take the most likely word index at each timestep
        pred_seq = np.argmax(pred[0], axis=1)

        # Convert indices back to words, skipping the padding index 0
        reverse_word_map = {v: k for k, v in self.word_tokenizer.word_index.items()}
        pred_words = [reverse_word_map.get(i, "") for i in pred_seq if i != 0]

        return " ".join(pred_words)

    def batch_predict_questions(self, data):
        """
        Predict questions for a batch of data

        Args:
            data (list): List of dictionaries with context, tokens, ner, srl,
                and optionally Q&A pairs

        Returns:
            list: One result dict per generated question (prediction plus metadata)
        """
        results = []
        for item in data:
            context = item["context"]
            tokens = item["tokens"]
            ner = item["ner"]
            srl = item["srl"]

            # If there are Q&A pairs, use them for evaluation
            if "qas" in item:
                for qa in item["qas"]:
                    q_type = qa["type"]
                    ground_truth = qa["question"]
                    predicted_question = self.predict_question(
                        context, tokens, ner, srl, q_type
                    )
                    results.append(
                        {
                            "context": context,
                            "predicted_question": predicted_question,
                            "ground_truth": ground_truth,
                            "question_type": q_type,
                        }
                    )
            else:
                # No Q&A pairs: generate a question for each supported type.
                # predict_question does not condition on an answer; in
                # practice you might extract candidate answers from the
                # context and use an answer-aware model instead.
                for q_type in ["isian", "true_false", "opsi"]:
                    predicted_question = self.predict_question(
                        context, tokens, ner, srl, q_type
                    )
                    results.append(
                        {
                            "context": context,
                            "predicted_question": predicted_question,
                            "question_type": q_type,
                        }
                    )

        return results


# Example usage
if __name__ == "__main__":
    # Load test data
    with open("../dataset/conteks_question.json", "r") as f:
        test_data = json.load(f)

    # Initialize model
    question_predictor = QuestionPredictionModel(
        model_path="question_prediction_model_final.h5",
        tokenizer_path="question_prediction_tokenizers.json",
    )

    # Example single prediction
    sample = test_data[1]
    context = sample["context"]
    tokens = sample["tokens"]
    ner = sample["ner"]
    srl = sample["srl"]
    answer = sample["qas"][0]["answer"]
    q_type = sample["qas"][0]["type"]

    predicted_question = question_predictor.predict_question(
        context, tokens, ner, srl, q_type
    )

    print(f"Context: {context}")
    print(f"Answer: {answer}")
    print(f"Question Type: {q_type}")
    print(f"Predicted Question: {predicted_question}")
    print(f"Ground Truth: {sample['qas'][0]['question']}")

    # Batch prediction
    # results = question_predictor.batch_predict_questions(test_data[:3])
    # print("\nBatch Results:")
    # for i, result in enumerate(results):
    #     print(f"\nResult {i+1}:")
    #     print(f"Context: {result['context']}")
    #     print(f"Answer: {result.get('answer', 'N/A')}")
    #     print(f"Question Type: {result['question_type']}")
    #     print(f"Predicted Question: {result['predicted_question']}")
    #     if "ground_truth" in result:
    #         print(f"Ground Truth: {result['ground_truth']}")
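
# --- Illustrative evaluation helpers (a minimal sketch, not from the
# original script) ---
# One way to score batch predictions against their ground-truth questions,
# assuming `results` comes from batch_predict_questions() above. SQuAD-style
# token-overlap F1 is used here; the helper names token_f1 and
# evaluate_results are hypothetical.
from collections import Counter


def token_f1(prediction, ground_truth):
    """SQuAD-style token F1 between two whitespace-tokenized strings."""
    pred_tokens = prediction.lower().split()
    true_tokens = ground_truth.lower().split()
    # Multiset intersection counts each shared token at most as often
    # as it appears in both strings
    common = Counter(pred_tokens) & Counter(true_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(true_tokens)
    return 2 * precision * recall / (precision + recall)


def evaluate_results(results):
    """Average token F1 over results that carry a ground-truth question."""
    scores = [
        token_f1(r["predicted_question"], r["ground_truth"])
        for r in results
        if "ground_truth" in r
    ]
    return sum(scores) / len(scores) if scores else 0.0


# Example usage (commented, mirroring the batch example above):
# results = question_predictor.batch_predict_questions(test_data[:3])
# print(f"Mean token F1: {evaluate_results(results):.3f}")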