import json
import re

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import tokenizer_from_json


class QuestionPredictionModel:
    def __init__(self, model_path, tokenizer_path):
        """
        Initialize the question prediction model with a pre-trained model
        and its tokenizers.
        """
        # Load model
        self.model = load_model(model_path)

        # Load tokenizer data
        with open(tokenizer_path, "r") as f:
            tokenizer_data = json.load(f)

        # Reconstruct tokenizers
        self.word_tokenizer = tokenizer_from_json(tokenizer_data["word_tokenizer"])
        self.ner_tokenizer = tokenizer_from_json(tokenizer_data["ner_tokenizer"])
        self.srl_tokenizer = tokenizer_from_json(tokenizer_data["srl_tokenizer"])
        self.q_type_tokenizer = tokenizer_from_json(tokenizer_data["q_type_tokenizer"])

        # Get max lengths
        self.max_context_len = tokenizer_data["max_context_len"]
        self.max_answer_len = tokenizer_data["max_answer_len"]
        self.max_question_len = tokenizer_data["max_question_len"]
        self.max_token_len = tokenizer_data["max_token_len"]

        # Get vocabulary sizes (+1 for the reserved padding index 0)
        self.vocab_size = len(self.word_tokenizer.word_index) + 1
        self.q_type_vocab_size = len(self.q_type_tokenizer.word_index) + 1

    def preprocess_text(self, text):
        """Basic text preprocessing: lowercase and collapse whitespace."""
        text = text.lower()
        text = re.sub(r"\s+", " ", text).strip()
        return text

    def predict_question(self, context, answer, tokens, ner, srl, q_type):
        """
        Predict a question from the given context, answer, tokens, NER tags,
        SRL tags, and question type.

        Args:
            context (str): The context text.
            answer (str): The answer to generate a question for.
            tokens (list): List of tokens.
            ner (list): List of NER tags corresponding to the tokens.
            srl (list): List of SRL tags corresponding to the tokens.
            q_type (str): Question type ('isian', 'opsi', or 'true_false').

        Returns:
            str: The predicted question.
        """
        # Preprocess inputs
        context = self.preprocess_text(context)
        answer = self.preprocess_text(answer)

        # Convert to sequences
        context_seq = self.word_tokenizer.texts_to_sequences([context])[0]
        answer_seq = self.word_tokenizer.texts_to_sequences([answer])[0]
        tokens_seq = self.word_tokenizer.texts_to_sequences([" ".join(tokens)])[0]
        ner_seq = self.ner_tokenizer.texts_to_sequences([" ".join(ner)])[0]
        srl_seq = self.srl_tokenizer.texts_to_sequences([" ".join(srl)])[0]

        # Pad sequences
        context_padded = pad_sequences([context_seq], maxlen=self.max_context_len, padding="post")
        answer_padded = pad_sequences([answer_seq], maxlen=self.max_answer_len, padding="post")
        tokens_padded = pad_sequences([tokens_seq], maxlen=self.max_token_len, padding="post")
        ner_padded = pad_sequences([ner_seq], maxlen=self.max_token_len, padding="post")
        srl_padded = pad_sequences([srl_seq], maxlen=self.max_token_len, padding="post")

        # One-hot encode the question type
        q_type_idx = self.q_type_tokenizer.word_index.get(q_type, 0)
        q_type_categorical = tf.keras.utils.to_categorical(
            [q_type_idx], num_classes=self.q_type_vocab_size
        )

        # Make prediction
        predicted_seq = self.model.predict(
            [context_padded, answer_padded, tokens_padded, ner_padded,
             srl_padded, q_type_categorical]
        )
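        # Assumed model contract, inferred from how the output is consumed
        # below (not stated in the original): the network returns a tensor of
        # shape (batch, question_length, vocab_size) holding per-position
        # probabilities over the word vocabulary.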
        # Greedy decode: take the highest-probability token at each position
        predicted_indices = np.argmax(predicted_seq[0], axis=-1)

        # Reversed word index for mapping indices back to words
        reverse_word_index = {v: k for k, v in self.word_tokenizer.word_index.items()}

        # Convert indices to words, skipping the padding index 0
        predicted_words = []
        for idx in predicted_indices:
            if idx != 0:
                predicted_words.append(reverse_word_index.get(idx, ""))

        # Form the question
        predicted_question = " ".join(predicted_words)

        # Dataset convention: questions end with a "___" blank, so append one
        # if the decoder did not produce it (this applies to every question
        # type, not just fill-in-the-blank)
        if "___" not in predicted_question:
            predicted_question += " ___"

        return predicted_question

    def batch_predict_questions(self, data):
        """
        Predict questions for a batch of data.

        Args:
            data (list): List of dicts with context, tokens, ner, srl, and
                optionally qas entries.

        Returns:
            list: List of prediction result dicts.
        """
        results = []

        # Placeholder answers for items without Q&A pairs. For demo purposes
        # only; in practice you would extract candidate answers from the
        # context instead.
        placeholders = {
            "isian": "placeholder",
            "true_false": "true",
            "opsi": "placeholder",
        }

        for item in data:
            context = item["context"]
            tokens = item["tokens"]
            ner = item["ner"]
            srl = item["srl"]

            # If there are Q&A pairs, use them for evaluation
            if "qas" in item:
                for qa in item["qas"]:
                    answer = qa["answer"]
                    q_type = qa["type"]
                    ground_truth = qa["question"]

                    predicted_question = self.predict_question(
                        context, answer, tokens, ner, srl, q_type
                    )

                    results.append({
                        "context": context,
                        "answer": answer,
                        "predicted_question": predicted_question,
                        "ground_truth": ground_truth,
                        "question_type": q_type,
                    })
            else:
                # Otherwise generate one question per question type
                for q_type in ["isian", "true_false", "opsi"]:
                    predicted_question = self.predict_question(
                        context, placeholders[q_type], tokens, ner, srl, q_type
                    )

                    results.append({
                        "context": context,
                        "predicted_question": predicted_question,
                        "question_type": q_type,
                    })

        return results


# Example usage
if __name__ == "__main__":
    # Load test data
    with open("data_converted.json", "r") as f:
        test_data = json.load(f)

    # Initialize model
    question_predictor = QuestionPredictionModel(
        model_path="question_prediction_model_final.h5",
        tokenizer_path="question_prediction_tokenizers.json",
    )

    # Example single prediction
    sample = test_data[0]
    context = sample["context"]
    tokens = sample["tokens"]
    ner = sample["ner"]
    srl = sample["srl"]
    answer = sample["qas"][0]["answer"]
    q_type = sample["qas"][0]["type"]

    predicted_question = question_predictor.predict_question(
        context, answer, tokens, ner, srl, q_type
    )

    print(f"Context: {context}")
    print(f"Answer: {answer}")
    print(f"Question Type: {q_type}")
    print(f"Predicted Question: {predicted_question}")
    print(f"Ground Truth: {sample['qas'][0]['question']}")

    # Batch prediction
    results = question_predictor.batch_predict_questions(test_data[:3])

    print("\nBatch Results:")
    for i, result in enumerate(results):
        print(f"\nResult {i + 1}:")
        print(f"Context: {result['context']}")
        print(f"Answer: {result.get('answer', 'N/A')}")
        print(f"Question Type: {result['question_type']}")
        print(f"Predicted Question: {result['predicted_question']}")
        if "ground_truth" in result:
            print(f"Ground Truth: {result['ground_truth']}")
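# ---------------------------------------------------------------------------
# Reference sketch of the on-disk formats this script expects, inferred from
# the loading code above. These are assumptions, not a documented schema;
# adjust them to match however the training script actually serialized the
# files.
#
# question_prediction_tokenizers.json: each tokenizer entry is assumed to be
# the string returned by Tokenizer.to_json(), stored alongside the lengths:
#
#   {
#       "word_tokenizer":   "<word_tokenizer.to_json() string>",
#       "ner_tokenizer":    "<ner_tokenizer.to_json() string>",
#       "srl_tokenizer":    "<srl_tokenizer.to_json() string>",
#       "q_type_tokenizer": "<q_type_tokenizer.to_json() string>",
#       "max_context_len":  <int>,
#       "max_answer_len":   <int>,
#       "max_question_len": <int>,
#       "max_token_len":    <int>
#   }
#
# data_converted.json: assumed to be a list of items shaped like
#
#   {
#       "context": "...",
#       "tokens":  ["...", ...],
#       "ner":     ["...", ...],
#       "srl":     ["...", ...],
#       "qas":     [{"question": "...", "answer": "...", "type": "isian"}]
#   }
#
# where "qas" is optional (see batch_predict_questions).
# ---------------------------------------------------------------------------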