feat: add question generation and answer generation
This commit is contained in:
parent
9378e4d145
commit
748a5b868f
|
@ -10,6 +10,8 @@ from app.repositories import (
|
|||
QuizMemoryRepository,
|
||||
AnswerMemoryRepository,
|
||||
ScoreMemoryRepository,
|
||||
QuestionGenerationRepository,
|
||||
AnswerGenerationRepository,
|
||||
)
|
||||
|
||||
from app.services import (
|
||||
|
@ -48,6 +50,8 @@ class Container(containers.DeclarativeContainer):
|
|||
subject_repository = providers.Factory(SubjectRepository, mongo.provided.db)
|
||||
session_repository = providers.Factory(SessionRepository, mongo.provided.db)
|
||||
ner_srl_repository = providers.Factory(NERSRLRepository)
|
||||
question_generation_repository = providers.Factory(QuestionGenerationRepository)
|
||||
answer_generator_repository = providers.Factory(AnswerGenerationRepository)
|
||||
session_memory_repository = providers.Factory(SessionMemoryRepository, redis)
|
||||
quiz_memory_repository = providers.Factory(QuizMemoryRepository, redis)
|
||||
answer_memory_repository = providers.Factory(AnswerMemoryRepository, redis)
|
||||
|
@ -101,7 +105,10 @@ class Container(containers.DeclarativeContainer):
|
|||
)
|
||||
|
||||
question_generation_service = providers.Factory(
|
||||
QuestionGenerationService, ner_srl_repository
|
||||
QuestionGenerationService,
|
||||
ner_srl_repository,
|
||||
question_generation_repository,
|
||||
answer_generator_repository,
|
||||
)
|
||||
|
||||
# controllers
|
||||
|
|
Binary file not shown.
File diff suppressed because one or more lines are too long
|
@ -8,6 +8,8 @@ from .session_memory_repository import SessionMemoryRepository
|
|||
from .quiz_memory_repository import QuizMemoryRepository
|
||||
from .answer_memory_repository import AnswerMemoryRepository
|
||||
from .score_memory_repository import ScoreMemoryRepository
|
||||
from .question_generation_repository import QuestionGenerationRepository
|
||||
from .answer_generation_repository import AnswerGenerationRepository
|
||||
|
||||
__all__ = [
|
||||
"UserRepository",
|
||||
|
@ -20,4 +22,6 @@ __all__ = [
|
|||
"QuizMemoryRepository",
|
||||
"AnswerMemoryRepository",
|
||||
"ScoreMemoryRepository",
|
||||
"QuestionGenerationRepository",
|
||||
"AnswerGenerationRepository",
|
||||
]
|
||||
|
|
|
@ -8,8 +8,8 @@ import re
|
|||
|
||||
|
||||
class AnswerGenerationRepository:
|
||||
MODEL_PATH = "app/lstm_model/question_generation/qa_lstm_model_final.h5"
|
||||
TOKENIZER_PATH = "app/lstm_model/question_generation/qa_tokenizers.json"
|
||||
MODEL_PATH = "app/lstm_model/question_generation/qa_lstm_model_final_v2.keras"
|
||||
TOKENIZER_PATH = "app/lstm_model/question_generation/qa_tokenizers_v2.json"
|
||||
|
||||
def __init__(self):
|
||||
with open(self.TOKENIZER_PATH, "r") as f:
|
||||
|
|
|
@ -40,8 +40,8 @@ class NERSRLRepository:
|
|||
|
||||
return {
|
||||
"tokens": tokens,
|
||||
"labels_ner": [self.idx2tag_ner[int(i)] for i in pred_ner],
|
||||
"labels_srl": [self.idx2tag_srl[int(i)] for i in pred_srl],
|
||||
"ner": [self.idx2tag_ner[int(i)] for i in pred_ner],
|
||||
"srl": [self.idx2tag_srl[int(i)] for i in pred_srl],
|
||||
}
|
||||
|
||||
def labeling_token(self, tokens: list[str]) -> dict:
|
||||
|
|
|
@ -9,8 +9,8 @@ import re
|
|||
|
||||
class QuestionGenerationRepository:
|
||||
# Static paths for model and tokenizer
|
||||
MODEL_PATH = "app/lstm_model/question_generation/question_prediction_model_final.h5"
|
||||
TOKENIZER_PATH = "app/lstm_model/question_generation/question_prediction_tokenizers.json"
|
||||
MODEL_PATH = "app/lstm_model/question_generation/new_model/question_prediction_model_final.h5"
|
||||
TOKENIZER_PATH = "app/lstm_model/question_generation/new_model/question_prediction_tokenizers.json"
|
||||
|
||||
def __init__(self):
|
||||
"""
|
||||
|
@ -32,7 +32,6 @@ class QuestionGenerationRepository:
|
|||
|
||||
# Get max lengths
|
||||
self.max_context_len = tokenizer_data["max_context_len"]
|
||||
self.max_answer_len = tokenizer_data["max_answer_len"]
|
||||
self.max_question_len = tokenizer_data["max_question_len"]
|
||||
self.max_token_len = tokenizer_data["max_token_len"]
|
||||
|
||||
|
@ -46,29 +45,14 @@ class QuestionGenerationRepository:
|
|||
text = re.sub(r"\s+", " ", text).strip()
|
||||
return text
|
||||
|
||||
def predict_question(self, context, answer, tokens, ner, srl, q_type):
|
||||
"""
|
||||
Predict a question based on given context, answer, tokens, NER, SRL, and question type
|
||||
|
||||
Args:
|
||||
context (str): The context text
|
||||
answer (str): The answer to generate a question for
|
||||
tokens (list): List of tokens
|
||||
ner (list): List of NER tags corresponding to tokens
|
||||
srl (list): List of SRL tags corresponding to tokens
|
||||
q_type (str): Question type ('isian', 'opsi', or 'true_false')
|
||||
|
||||
Returns:
|
||||
str: The predicted question
|
||||
"""
|
||||
# Preprocess inputs
|
||||
def predict_question(self, context, tokens, ner, srl, q_type):
|
||||
"""Prediksi pertanyaan berdasarkan konteks dan fitur lainnya"""
|
||||
# Preprocess
|
||||
context = self.preprocess_text(context)
|
||||
answer = self.preprocess_text(answer)
|
||||
|
||||
# Convert to sequences
|
||||
context_seq = self.word_tokenizer.texts_to_sequences([context])[0]
|
||||
answer_seq = self.word_tokenizer.texts_to_sequences([answer])[0]
|
||||
tokens_seq = self.word_tokenizer.texts_to_sequences([" ".join(tokens)])[0]
|
||||
token_seq = self.word_tokenizer.texts_to_sequences([" ".join(tokens)])[0]
|
||||
ner_seq = self.ner_tokenizer.texts_to_sequences([" ".join(ner)])[0]
|
||||
srl_seq = self.srl_tokenizer.texts_to_sequences([" ".join(srl)])[0]
|
||||
|
||||
|
@ -76,113 +60,28 @@ class QuestionGenerationRepository:
|
|||
context_padded = pad_sequences(
|
||||
[context_seq], maxlen=self.max_context_len, padding="post"
|
||||
)
|
||||
answer_padded = pad_sequences(
|
||||
[answer_seq], maxlen=self.max_answer_len, padding="post"
|
||||
)
|
||||
tokens_padded = pad_sequences(
|
||||
[tokens_seq], maxlen=self.max_token_len, padding="post"
|
||||
token_padded = pad_sequences(
|
||||
[token_seq], maxlen=self.max_token_len, padding="post"
|
||||
)
|
||||
ner_padded = pad_sequences([ner_seq], maxlen=self.max_token_len, padding="post")
|
||||
srl_padded = pad_sequences([srl_seq], maxlen=self.max_token_len, padding="post")
|
||||
|
||||
# One-hot encode question type
|
||||
# Q-type one-hot encoding
|
||||
q_type_idx = self.q_type_tokenizer.word_index.get(q_type, 0)
|
||||
q_type_categorical = tf.keras.utils.to_categorical(
|
||||
q_type_one_hot = tf.keras.utils.to_categorical(
|
||||
[q_type_idx], num_classes=self.q_type_vocab_size
|
||||
)
|
||||
|
||||
# Make prediction
|
||||
predicted_seq = self.model.predict(
|
||||
[
|
||||
context_padded,
|
||||
answer_padded,
|
||||
tokens_padded,
|
||||
ner_padded,
|
||||
srl_padded,
|
||||
q_type_categorical,
|
||||
]
|
||||
# Predict
|
||||
pred = self.model.predict(
|
||||
[context_padded, token_padded, ner_padded, srl_padded, q_type_one_hot]
|
||||
)
|
||||
|
||||
# Convert predictions to tokens (taking the highest probability token at each position)
|
||||
predicted_indices = np.argmax(predicted_seq[0], axis=1)
|
||||
|
||||
# Create reversed word index for converting indices back to words
|
||||
reverse_word_index = {v: k for k, v in self.word_tokenizer.word_index.items()}
|
||||
# Convert prediction to words
|
||||
pred_seq = np.argmax(pred[0], axis=1)
|
||||
|
||||
# Convert indices to words
|
||||
predicted_words = []
|
||||
for idx in predicted_indices:
|
||||
if idx != 0: # Skip padding tokens
|
||||
predicted_words.append(reverse_word_index.get(idx, ""))
|
||||
reverse_word_map = {v: k for k, v in self.word_tokenizer.word_index.items()}
|
||||
pred_words = [reverse_word_map.get(i, "") for i in pred_seq if i != 0]
|
||||
|
||||
# Form the question
|
||||
predicted_question = " ".join(predicted_words)
|
||||
|
||||
# Add "___" to the end based on question type convention
|
||||
if "___" not in predicted_question:
|
||||
predicted_question += " ___"
|
||||
|
||||
return predicted_question
|
||||
|
||||
def batch_predict_questions(self, data):
|
||||
"""
|
||||
Predict questions for a batch of data
|
||||
|
||||
Args:
|
||||
data (list): List of dictionaries with context, tokens, ner, srl, and answers
|
||||
|
||||
Returns:
|
||||
list: List of predicted questions
|
||||
"""
|
||||
results = []
|
||||
|
||||
for item in data:
|
||||
context = item["context"]
|
||||
tokens = item["tokens"]
|
||||
ner = item["ner"]
|
||||
srl = item["srl"]
|
||||
|
||||
# If there are Q&A pairs, use them for evaluation
|
||||
if "qas" in item:
|
||||
for qa in item["qas"]:
|
||||
answer = qa["answer"]
|
||||
q_type = qa["type"]
|
||||
ground_truth = qa["question"]
|
||||
|
||||
predicted_question = self.predict_question(
|
||||
context, answer, tokens, ner, srl, q_type
|
||||
)
|
||||
|
||||
results.append(
|
||||
{
|
||||
"context": context,
|
||||
"answer": answer,
|
||||
"predicted_question": predicted_question,
|
||||
"ground_truth": ground_truth,
|
||||
"question_type": q_type,
|
||||
}
|
||||
)
|
||||
else:
|
||||
# If no Q&A pairs, generate questions for all question types
|
||||
for q_type in ["isian", "true_false", "opsi"]:
|
||||
# For demo purposes, use a placeholder answer (would need actual answers in real use)
|
||||
# In practice, you might extract potential answers from the context
|
||||
placeholders = {
|
||||
"isian": "placeholder",
|
||||
"true_false": "true",
|
||||
"opsi": "placeholder",
|
||||
}
|
||||
|
||||
predicted_question = self.predict_question(
|
||||
context, placeholders[q_type], tokens, ner, srl, q_type
|
||||
)
|
||||
|
||||
results.append(
|
||||
{
|
||||
"context": context,
|
||||
"predicted_question": predicted_question,
|
||||
"question_type": q_type,
|
||||
}
|
||||
)
|
||||
|
||||
return results
|
||||
return " ".join(pred_words)
|
||||
|
|
|
@ -1,25 +1,51 @@
|
|||
from app.repositories import NERSRLRepository
|
||||
from app.repositories import (
|
||||
NERSRLRepository,
|
||||
QuestionGenerationRepository,
|
||||
AnswerGenerationRepository,
|
||||
)
|
||||
import re
|
||||
|
||||
|
||||
class QuestionGenerationService:
|
||||
|
||||
def __init__(self, ner_srl_repository: NERSRLRepository):
|
||||
def __init__(
|
||||
self,
|
||||
ner_srl_repository: NERSRLRepository,
|
||||
question_generate_repository: QuestionGenerationRepository,
|
||||
answer_generate_repository: AnswerGenerationRepository,
|
||||
):
|
||||
self._ner_srl_repository = ner_srl_repository
|
||||
self._question_generation_repository = question_generate_repository
|
||||
self._answer_generation_repository = answer_generate_repository
|
||||
|
||||
def createQuizAutomate(self, sentence: str):
|
||||
# Use a regex to split only on a period that is followed by whitespace or the end of the text,
|
||||
# and that is not part of a number (e.g. 19.00 is not split)
|
||||
split_pattern = r"\.(?=\s|$)(?!\d)"
|
||||
|
||||
# Pisahkan kalimat menggunakan regex
|
||||
# split sentence using regex
|
||||
sentences = [s.strip() for s in re.split(split_pattern, sentence) if s.strip()]
|
||||
|
||||
results = []
|
||||
for s in sentences:
|
||||
result = self._ner_srl_repository.predict_sentence(s)
|
||||
results.append(result)
|
||||
|
||||
question = self._question_generation_repository.predict_question(
|
||||
context=s,
|
||||
ner=result["ner"],
|
||||
tokens=result["tokens"],
|
||||
srl=result["srl"],
|
||||
q_type=1,
|
||||
)
|
||||
|
||||
answer = self._answer_generation_repository.predict_answer(
|
||||
context=s,
|
||||
question=question,
|
||||
ner=result["ner"],
|
||||
tokens=result["tokens"],
|
||||
srl=result["srl"],
|
||||
q_type=1,
|
||||
)
|
||||
results.append({"qustion": question, "answer": answer})
|
||||
|
||||
return results
|
||||
|
|
Loading…
Reference in New Issue