feat: add question generation and answer generation

This commit is contained in:
akhdanre 2025-05-18 15:06:42 +07:00
parent 9378e4d145
commit 748a5b868f
8 changed files with 72 additions and 135 deletions

View File

@@ -10,6 +10,8 @@ from app.repositories (
QuizMemoryRepository,
AnswerMemoryRepository,
ScoreMemoryRepository,
QuestionGenerationRepository,
AnswerGenerationRepository,
)
from app.services import (
@@ -48,6 +50,8 @@ class Container(containers.DeclarativeContainer):
subject_repository = providers.Factory(SubjectRepository, mongo.provided.db)
session_repository = providers.Factory(SessionRepository, mongo.provided.db)
ner_srl_repository = providers.Factory(NERSRLRepository)
question_generation_repository = providers.Factory(QuestionGenerationRepository)
answer_generator_repository = providers.Factory(AnswerGenerationRepository)
session_memory_repository = providers.Factory(SessionMemoryRepository, redis)
quiz_memory_repository = providers.Factory(QuizMemoryRepository, redis)
answer_memory_repository = providers.Factory(AnswerMemoryRepository, redis)
@@ -101,7 +105,10 @@
)
question_generation_service = providers.Factory(
QuestionGenerationService, ner_srl_repository
QuestionGenerationService,
ner_srl_repository,
question_generation_repository,
answer_generator_repository,
)
# controllers
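For reference, a minimal sketch of how the rewired service could be resolved from the container (the app.container import path and the sample sentence are assumptions, not part of this commit):

from app.container import Container  # assumed location of the Container class above

container = Container()
# providers.Factory builds a fresh QuestionGenerationService on each call,
# injecting the three repositories positionally, as declared above.
service = container.question_generation_service()
# Requires the LSTM model and tokenizer files to be present on disk.
quiz = service.createQuizAutomate("Budi membaca buku di perpustakaan.")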

File diff suppressed because one or more lines are too long

View File

@@ -8,6 +8,8 @@ from .session_memory_repository import SessionMemoryRepository
from .quiz_memory_repository import QuizMemoryRepository
from .answer_memory_repository import AnswerMemoryRepository
from .score_memory_repository import ScoreMemoryRepository
from .question_generation_repository import QuestionGenerationRepository
from .answer_generation_repository import AnswerGenerationRepository
__all__ = [
"UserRepository",
@@ -20,4 +22,6 @@ __all__ = [
"QuizMemoryRepository",
"AnswerMemoryRepository",
"ScoreMemoryRepository",
"QuestionGenerationRepository",
"AnswerGenerationRepository",
]

View File

@@ -8,8 +8,8 @@ import re
class AnswerGenerationRepository:
MODEL_PATH = "app/lstm_model/question_generation/qa_lstm_model_final.h5"
TOKENIZER_PATH = "app/lstm_model/question_generation/qa_tokenizers.json"
MODEL_PATH = "app/lstm_model/question_generation/qa_lstm_model_final_v2.keras"
TOKENIZER_PATH = "app/lstm_model/question_generation/qa_tokenizers_v2.json"
def __init__(self):
with open(self.TOKENIZER_PATH, "r") as f:
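This hunk swaps the legacy HDF5 (.h5) artifacts for TensorFlow's native .keras format plus v2 tokenizers. A minimal sketch of the loading pattern the constructor implies (variable names are assumptions):

import json
import tensorflow as tf

# load_model reads both the native .keras format and legacy .h5 files
model = tf.keras.models.load_model(
    "app/lstm_model/question_generation/qa_lstm_model_final_v2.keras"
)
with open("app/lstm_model/question_generation/qa_tokenizers_v2.json", "r") as f:
    tokenizer_data = json.load(f)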

View File

@@ -40,8 +40,8 @@ class NERSRLRepository:
return {
"tokens": tokens,
"labels_ner": [self.idx2tag_ner[int(i)] for i in pred_ner],
"labels_srl": [self.idx2tag_srl[int(i)] for i in pred_srl],
"ner": [self.idx2tag_ner[int(i)] for i in pred_ner],
"srl": [self.idx2tag_srl[int(i)] for i in pred_srl],
}
def labeling_token(self, tokens: list[str]) -> dict:
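The keys labels_ner and labels_srl are shortened to ner and srl, matching what QuestionGenerationService now reads from the prediction result. Illustrative shape of the returned dict (tokens and tags are invented for the example):

# Example of the dict predict_sentence now returns (values illustrative):
example = {
    "tokens": ["budi", "membaca", "buku"],
    "ner": ["B-PER", "O", "O"],
    "srl": ["ARG0", "V", "ARG1"],
}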

View File

@@ -9,8 +9,8 @@ import re
class QuestionGenerationRepository:
# Static paths for model and tokenizer
MODEL_PATH = "app/lstm_model/question_generation/question_prediction_model_final.h5"
TOKENIZER_PATH = "app/lstm_model/question_generation/question_prediction_tokenizers.json"
MODEL_PATH = "app/lstm_model/question_generation/new_model/question_prediction_model_final.h5"
TOKENIZER_PATH = "app/lstm_model/question_generation/new_model/question_prediction_tokenizers.json"
def __init__(self):
"""
@@ -32,7 +32,6 @@ class QuestionGenerationRepository:
# Get max lengths
self.max_context_len = tokenizer_data["max_context_len"]
self.max_answer_len = tokenizer_data["max_answer_len"]
self.max_question_len = tokenizer_data["max_question_len"]
self.max_token_len = tokenizer_data["max_token_len"]
@@ -46,29 +45,14 @@
text = re.sub(r"\s+", " ", text).strip()
return text
def predict_question(self, context, answer, tokens, ner, srl, q_type):
"""
Predict a question based on given context, answer, tokens, NER, SRL, and question type
Args:
context (str): The context text
answer (str): The answer to generate a question for
tokens (list): List of tokens
ner (list): List of NER tags corresponding to tokens
srl (list): List of SRL tags corresponding to tokens
q_type (str): Question type ('isian', 'opsi', or 'true_false')
Returns:
str: The predicted question
"""
# Preprocess inputs
def predict_question(self, context, tokens, ner, srl, q_type):
"""Predict a question from the context and its supporting features"""
# Preprocess
context = self.preprocess_text(context)
answer = self.preprocess_text(answer)
# Convert to sequences
context_seq = self.word_tokenizer.texts_to_sequences([context])[0]
answer_seq = self.word_tokenizer.texts_to_sequences([answer])[0]
tokens_seq = self.word_tokenizer.texts_to_sequences([" ".join(tokens)])[0]
token_seq = self.word_tokenizer.texts_to_sequences([" ".join(tokens)])[0]
ner_seq = self.ner_tokenizer.texts_to_sequences([" ".join(ner)])[0]
srl_seq = self.srl_tokenizer.texts_to_sequences([" ".join(srl)])[0]
@@ -76,113 +60,28 @@
context_padded = pad_sequences(
[context_seq], maxlen=self.max_context_len, padding="post"
)
answer_padded = pad_sequences(
[answer_seq], maxlen=self.max_answer_len, padding="post"
)
tokens_padded = pad_sequences(
[tokens_seq], maxlen=self.max_token_len, padding="post"
token_padded = pad_sequences(
[token_seq], maxlen=self.max_token_len, padding="post"
)
ner_padded = pad_sequences([ner_seq], maxlen=self.max_token_len, padding="post")
srl_padded = pad_sequences([srl_seq], maxlen=self.max_token_len, padding="post")
# One-hot encode question type
# Q-type one-hot encoding
q_type_idx = self.q_type_tokenizer.word_index.get(q_type, 0)
q_type_categorical = tf.keras.utils.to_categorical(
q_type_one_hot = tf.keras.utils.to_categorical(
[q_type_idx], num_classes=self.q_type_vocab_size
)
# Make prediction
predicted_seq = self.model.predict(
[
context_padded,
answer_padded,
tokens_padded,
ner_padded,
srl_padded,
q_type_categorical,
]
# Predict
pred = self.model.predict(
[context_padded, token_padded, ner_padded, srl_padded, q_type_one_hot]
)
# Convert predictions to tokens (taking the highest probability token at each position)
predicted_indices = np.argmax(predicted_seq[0], axis=1)
# Create reversed word index for converting indices back to words
reverse_word_index = {v: k for k, v in self.word_tokenizer.word_index.items()}
# Convert prediction to words
pred_seq = np.argmax(pred[0], axis=1)
# Convert indices to words
predicted_words = []
for idx in predicted_indices:
if idx != 0: # Skip padding tokens
predicted_words.append(reverse_word_index.get(idx, ""))
reverse_word_map = {v: k for k, v in self.word_tokenizer.word_index.items()}
pred_words = [reverse_word_map.get(i, "") for i in pred_seq if i != 0]
# Form the question
predicted_question = " ".join(predicted_words)
# Add "___" to the end based on question type convention
if "___" not in predicted_question:
predicted_question += " ___"
return predicted_question
def batch_predict_questions(self, data):
"""
Predict questions for a batch of data
Args:
data (list): List of dictionaries with context, tokens, ner, srl, and answers
Returns:
list: List of predicted questions
"""
results = []
for item in data:
context = item["context"]
tokens = item["tokens"]
ner = item["ner"]
srl = item["srl"]
# If there are Q&A pairs, use them for evaluation
if "qas" in item:
for qa in item["qas"]:
answer = qa["answer"]
q_type = qa["type"]
ground_truth = qa["question"]
predicted_question = self.predict_question(
context, answer, tokens, ner, srl, q_type
)
results.append(
{
"context": context,
"answer": answer,
"predicted_question": predicted_question,
"ground_truth": ground_truth,
"question_type": q_type,
}
)
else:
# If no Q&A pairs, generate questions for all question types
for q_type in ["isian", "true_false", "opsi"]:
# For demo purposes, use a placeholder answer (would need actual answers in real use)
# In practice, you might extract potential answers from the context
placeholders = {
"isian": "placeholder",
"true_false": "true",
"opsi": "placeholder",
}
predicted_question = self.predict_question(
context, placeholders[q_type], tokens, ner, srl, q_type
)
results.append(
{
"context": context,
"predicted_question": predicted_question,
"question_type": q_type,
}
)
return results
return " ".join(pred_words)
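The rewritten predict_question drops the answer input, leaving five model inputs, and returns the greedily decoded question string. A hedged usage sketch; the sentence and tags are sample values:

repo = QuestionGenerationRepository()
question = repo.predict_question(
    context="budi membaca buku di perpustakaan",
    tokens=["budi", "membaca", "buku", "di", "perpustakaan"],
    ner=["B-PER", "O", "O", "O", "B-LOC"],
    srl=["ARG0", "V", "ARG1", "O", "ARGM-LOC"],
    q_type="isian",  # must be a string key known to q_type_tokenizer
)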

View File

@@ -1,25 +1,51 @@
from app.repositories import NERSRLRepository
from app.repositories import (
NERSRLRepository,
QuestionGenerationRepository,
AnswerGenerationRepository,
)
import re
class QuestionGenerationService:
def __init__(self, ner_srl_repository: NERSRLRepository):
def __init__(
self,
ner_srl_repository: NERSRLRepository,
question_generate_repository: QuestionGenerationRepository,
answer_generate_repository: AnswerGenerationRepository,
):
self._ner_srl_repository = ner_srl_repository
self._question_generation_repository = question_generate_repository
self._answer_generation_repository = answer_generate_repository
def createQuizAutomate(self, sentence: str):
# Use regex to split only on periods followed by a space or end of string,
# and not when the period is part of a number (e.g. 19.00 is not split)
split_pattern = r"\.(?=\s|$)(?!\d)"
# Split the sentence using regex
# split the sentence using the regex above
sentences = [s.strip() for s in re.split(split_pattern, sentence) if s.strip()]
results = []
for s in sentences:
result = self._ner_srl_repository.predict_sentence(s)
results.append(result)
question = self._question_generation_repository.predict_question(
context=s,
ner=result["ner"],
tokens=result["tokens"],
srl=result["srl"],
q_type="isian",  # question types are string keys such as "isian"
)
answer = self._answer_generation_repository.predict_answer(
context=s,
question=question,
ner=result["ner"],
tokens=result["tokens"],
srl=result["srl"],
q_type="isian",
)
results.append({"question": question, "answer": answer})
return results
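A quick check of the split pattern's behavior, confirming that the period inside a time such as 19.00 does not trigger a split (sample text is illustrative):

import re

split_pattern = r"\.(?=\s|$)(?!\d)"
text = "Kereta berangkat pukul 19.00. Budi tiba lebih awal."
print([s.strip() for s in re.split(split_pattern, text) if s.strip()])
# ['Kereta berangkat pukul 19.00', 'Budi tiba lebih awal']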