feat: adding new model training

parent ad4b6d6137
commit f0f6f412bb

@@ -0,0 +1,424 @@
import numpy as np
import pandas as pd
import json
import random
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
    Input,
    LSTM,
    Dense,
    Embedding,
    Bidirectional,
    Concatenate,
    Attention,
    Dropout,
)
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re
import string
from collections import Counter

# Sample of the provided data, kept for reference:
# data = [
#     {
#         "context": "raden ajeng kartini lahir pada 21 april 1879 di jepara",
#         "tokens": [
#             "raden", "ajeng", "kartini", "lahir", "pada", "21", "april", "1879", "di", "jepara"
#         ],
#         "ner": [
#             "PER", "PER", "PER", "O", "O", "DATE", "DATE", "DATE", "O", "LOC"
#         ],
#         "srl": [
#             "ARG0", "ARG0", "ARG0", "V", "O", "ARGM-TMP", "ARGM-TMP", "ARGM-TMP", "O", "ARGM-LOC"
#         ],
#         "qas": [
#             {
#                 "type": "isian",
#                 "question": "Dimana kartini lahir ___",
#                 "answer": "jepara",
#                 "id": "qa_0_q1"
#             },
#             {
#                 "type": "true_false",
#                 "question": "Kartini lahir pada tanggal 21 mei 1879 ___",
#                 "options": ["true", "false"],
#                 "answer": "false",
#                 "id": "qa_0_q2"
#             }
#         ]
#     },
#     {
#         "context": "kerajaan majapahit berdiri pada tahun 1293 di trowulan",
#         "tokens": [
#             "kerajaan", "majapahit", "berdiri", "pada", "tahun", "1293", "di", "trowulan"
#         ],
#         "ner": [
#             "O", "ORG", "O", "O", "O", "DATE", "O", "LOC"
#         ],
#         "srl": [
#             "ARG1", "ARG1", "V", "O", "O", "ARGM-TMP", "O", "ARGM-LOC"
#         ],
#         "qas": [
#             {
#                 "type": "opsi",
#                 "question": "Dimana kerajaan majapahit berdiri ___",
#                 "options": ["trowulan", "singasari", "kuta", "banten"],
#                 "answer": "trowulan",
#                 "id": "qa_1_q1"
#             },
#             {
#                 "type": "true_false",
#                 "question": "Kerajaan majapahit berdiri pada tahun 1300 ___",
#                 "options": ["true", "false"],
#                 "answer": "false",
#                 "id": "qa_1_q2"
#             }
#         ]
#     },
#     {
#         "context": "soekarno dan mohammad hatta memproklamasikan kemerdekaan indonesia pada 17 agustus 1945",
#         "tokens": [
#             "soekarno", "dan", "mohammad", "hatta", "memproklamasikan", "kemerdekaan", "indonesia", "pada", "17", "agustus", "1945"
#         ],
#         "ner": [
#             "PER", "O", "PER", "PER", "O", "O", "LOC", "O", "DATE", "DATE", "DATE"
#         ],
#         "srl": [
#             "ARG0", "O", "ARG0", "ARG0", "V", "ARG1", "ARGM-LOC", "O", "ARGM-TMP", "ARGM-TMP", "ARGM-TMP"
#         ],
#         "qas": [
#             {
#                 "type": "isian",
#                 "question": "Pada tanggal berapa kemerdekaan indonesia diproklamasikan ___",
#                 "answer": "17 agustus 1945",
#                 "id": "qa_2_q1"
#             },
#             {
#                 "type": "opsi",
#                 "question": "Siapa yang memproklamasikan kemerdekaan indonesia ___",
#                 "options": ["soekarno", "mohammad hatta", "sudirman", "ahmad yani"],
#                 "answer": "soekarno mohammad hatta",
#                 "id": "qa_2_q2"
#             }
#         ]
#     }
# ]

with open("data_converted.json", "r") as f:
    data = json.load(f)


# # Save to a JSON file for later use
# with open('qa_dataset.json', 'w', encoding='utf-8') as f:
#     json.dump(data, f, ensure_ascii=False, indent=2)


# Preprocessing function
def preprocess_text(text):
    """Apply basic text preprocessing."""
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    return text


# Prepare the data for the model
def prepare_data(data):
    """Flatten the dataset into parallel lists, one entry per QA pair."""
    contexts = []
    tokens_list = []
    ner_list = []
    srl_list = []
    questions = []
    answers = []
    q_types = []

    for item in data:
        for qa in item["qas"]:
            contexts.append(preprocess_text(item["context"]))
            tokens_list.append(item["tokens"])
            ner_list.append(item["ner"])
            srl_list.append(item["srl"])
            questions.append(preprocess_text(qa["question"]))
            answers.append(qa["answer"])
            q_types.append(qa["type"])

    return contexts, tokens_list, ner_list, srl_list, questions, answers, q_types


# Prepare the data
contexts, tokens_list, ner_list, srl_list, questions, answers, q_types = prepare_data(
    data
)

# Tokenizer for text (context and question)
max_vocab_size = 10000
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(contexts + questions + [" ".join(item) for item in tokens_list])
vocab_size = len(tokenizer.word_index) + 1

# Encoding for NER tags
ner_tokenizer = Tokenizer(oov_token="<OOV>")
ner_tokenizer.fit_on_texts([" ".join(ner) for ner in ner_list])
ner_vocab_size = len(ner_tokenizer.word_index) + 1

# Encoding for SRL tags.
# Note: the Keras Tokenizer's default filters split on "-", which would break
# hyphenated tags such as "ARGM-TMP" into two tokens and misalign them with
# the word tokens, so "-" is removed from the filter set here.
srl_tokenizer = Tokenizer(
    oov_token="<OOV>", filters='!"#$%&()*+,./:;<=>?@[\\]^_`{|}~\t\n'
)
srl_tokenizer.fit_on_texts([" ".join(srl) for srl in srl_list])
srl_vocab_size = len(srl_tokenizer.word_index) + 1

# Encoding for question types
q_type_tokenizer = Tokenizer()
q_type_tokenizer.fit_on_texts(q_types)
q_type_vocab_size = len(q_type_tokenizer.word_index) + 1


# Convert tokens, NER, and SRL tags to sequences
def tokens_to_sequences(tokens, ner, srl):
    """Convert token, NER, and SRL lists to integer sequences."""
    token_seqs = [tokenizer.texts_to_sequences([" ".join(t)])[0] for t in tokens]
    ner_seqs = [ner_tokenizer.texts_to_sequences([" ".join(n)])[0] for n in ner]
    srl_seqs = [srl_tokenizer.texts_to_sequences([" ".join(s)])[0] for s in srl]
    return token_seqs, ner_seqs, srl_seqs


# Determine the maximum lengths for padding
context_seqs = tokenizer.texts_to_sequences(contexts)
question_seqs = tokenizer.texts_to_sequences(questions)
token_seqs, ner_seqs, srl_seqs = tokens_to_sequences(tokens_list, ner_list, srl_list)

max_context_len = max(len(seq) for seq in context_seqs)
max_question_len = max(len(seq) for seq in question_seqs)
max_token_len = max(len(seq) for seq in token_seqs)


# Pad sequences so all inputs have the same length
def pad_all_sequences(context_seqs, question_seqs, token_seqs, ner_seqs, srl_seqs):
    """Pad all sequences to their respective maximum lengths."""
    context_padded = pad_sequences(context_seqs, maxlen=max_context_len, padding="post")
    question_padded = pad_sequences(
        question_seqs, maxlen=max_question_len, padding="post"
    )
    token_padded = pad_sequences(token_seqs, maxlen=max_token_len, padding="post")
    ner_padded = pad_sequences(ner_seqs, maxlen=max_token_len, padding="post")
    srl_padded = pad_sequences(srl_seqs, maxlen=max_token_len, padding="post")
    return context_padded, question_padded, token_padded, ner_padded, srl_padded


# Set up the encoder for answers
answer_tokenizer = Tokenizer(oov_token="<OOV>")
answer_tokenizer.fit_on_texts(answers)
answer_vocab_size = len(answer_tokenizer.word_index) + 1

# Encode question types - FIX - use the index directly instead of a sequence
q_type_indices = []
for q_type in q_types:
    # Look up the question-type index (word_index starts at 1; 0 is kept for
    # unknown types)
    q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
    q_type_indices.append(q_type_idx)

# Convert to a numpy array
q_type_indices = np.array(q_type_indices)

# One-hot encode the question types
q_type_categorical = tf.keras.utils.to_categorical(
    q_type_indices, num_classes=q_type_vocab_size
)

# Pad sequences
context_padded, question_padded, token_padded, ner_padded, srl_padded = (
    pad_all_sequences(context_seqs, question_seqs, token_seqs, ner_seqs, srl_seqs)
)

# Encode answers
answer_seqs = answer_tokenizer.texts_to_sequences(answers)
max_answer_len = max(len(seq) for seq in answer_seqs)
answer_padded = pad_sequences(answer_seqs, maxlen=max_answer_len, padding="post")

# Split the data into train and test sets
indices = list(range(len(context_padded)))
train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)


# Helper to take a subset of the data by indices
def get_subset(data, indices):
    return np.array([data[i] for i in indices])


# Train data
train_context = get_subset(context_padded, train_indices)
train_question = get_subset(question_padded, train_indices)
train_token = get_subset(token_padded, train_indices)
train_ner = get_subset(ner_padded, train_indices)
train_srl = get_subset(srl_padded, train_indices)
train_q_type = get_subset(q_type_categorical, train_indices)
train_answer = get_subset(answer_padded, train_indices)

# Test data
test_context = get_subset(context_padded, test_indices)
test_question = get_subset(question_padded, test_indices)
test_token = get_subset(token_padded, test_indices)
test_ner = get_subset(ner_padded, test_indices)
test_srl = get_subset(srl_padded, test_indices)
test_q_type = get_subset(q_type_categorical, test_indices)
test_answer = get_subset(answer_padded, test_indices)

# Hyperparameters
embedding_dim = 100
lstm_units = 128
ner_embedding_dim = 50
srl_embedding_dim = 50
dropout_rate = 0.3


# Function that builds the model
def create_qa_model():
    # Input layers
    context_input = Input(shape=(max_context_len,), name="context_input")
    question_input = Input(shape=(max_question_len,), name="question_input")
    token_input = Input(shape=(max_token_len,), name="token_input")
    ner_input = Input(shape=(max_token_len,), name="ner_input")
    srl_input = Input(shape=(max_token_len,), name="srl_input")
    q_type_input = Input(shape=(q_type_vocab_size,), name="q_type_input")

    # Shared embedding layer for text
    text_embedding = Embedding(vocab_size, embedding_dim, name="text_embedding")

    # Embeddings for NER and SRL tags
    ner_embedding = Embedding(ner_vocab_size, ner_embedding_dim, name="ner_embedding")(
        ner_input
    )
    srl_embedding = Embedding(srl_vocab_size, srl_embedding_dim, name="srl_embedding")(
        srl_input
    )

    # Apply embeddings
    context_embed = text_embedding(context_input)
    question_embed = text_embedding(question_input)
    token_embed = text_embedding(token_input)

    # Bi-directional LSTMs for context and token-level features
    context_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, name="context_lstm")
    )(context_embed)
    question_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, name="question_lstm")
    )(question_embed)

    # Concatenate token features (tokens, NER, SRL)
    token_features = Concatenate(name="token_features")(
        [token_embed, ner_embedding, srl_embedding]
    )
    token_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, name="token_lstm")
    )(token_features)

    # Attention over the context conditioned on the question
    # (Keras Attention takes [query, value]: the context states attend to the
    # question states)
    context_attention = tf.keras.layers.Attention(name="context_attention")(
        [context_lstm, question_lstm]
    )

    # Pool attention outputs
    context_att_pool = tf.keras.layers.GlobalMaxPooling1D(name="context_att_pool")(
        context_attention
    )
    question_pool = tf.keras.layers.GlobalMaxPooling1D(name="question_pool")(
        question_lstm
    )
    token_pool = tf.keras.layers.GlobalMaxPooling1D(name="token_pool")(token_lstm)

    # Concatenate all features
    all_features = Concatenate(name="all_features")(
        [context_att_pool, question_pool, token_pool, q_type_input]
    )

    # Dense layers
    x = Dense(256, activation="relu", name="dense_1")(all_features)
    x = Dropout(dropout_rate)(x)
    x = Dense(128, activation="relu", name="dense_2")(x)
    x = Dropout(dropout_rate)(x)

    # Output layer for the answer
    answer_output = Dense(
        answer_vocab_size, activation="softmax", name="answer_output"
    )(x)

    # Create model
    model = Model(
        inputs=[
            context_input,
            question_input,
            token_input,
            ner_input,
            srl_input,
            q_type_input,
        ],
        outputs=answer_output,
    )

    # Compile model
    model.compile(
        optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
    )

    return model


# Build the model
model = create_qa_model()
model.summary()

# Callback to save the best model
checkpoint = ModelCheckpoint(
    "qa_lstm_model.h5", monitor="val_accuracy", save_best_only=True, verbose=1
)

early_stop = EarlyStopping(monitor="val_accuracy", patience=5, verbose=1)

# Training
batch_size = 8
epochs = 50

# Reshape answers for sparse categorical crossentropy.
# Only the first answer token is used as the label, so the model performs
# single-token classification; multi-word answers such as "17 agustus 1945"
# are truncated to their first token.
train_answer_labels = train_answer[:, 0]
test_answer_labels = test_answer[:, 0]

# Train model
history = model.fit(
    [train_context, train_question, train_token, train_ner, train_srl, train_q_type],
    train_answer_labels,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(
        [test_context, test_question, test_token, test_ner, test_srl, test_q_type],
        test_answer_labels,
    ),
    callbacks=[checkpoint, early_stop],
)


# Save the model and tokenizers
model.save("qa_lstm_model_final.h5")

# Save the tokenizers
tokenizer_data = {
    "word_tokenizer": tokenizer.to_json(),
    "ner_tokenizer": ner_tokenizer.to_json(),
    "srl_tokenizer": srl_tokenizer.to_json(),
    "answer_tokenizer": answer_tokenizer.to_json(),
    "q_type_tokenizer": q_type_tokenizer.to_json(),
    "max_context_len": max_context_len,
    "max_question_len": max_question_len,
    "max_token_len": max_token_len,
}

with open("qa_tokenizers.json", "w") as f:
    json.dump(tokenizer_data, f)

print("Model and tokenizers saved successfully!")

@@ -0,0 +1,151 @@
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import tokenizer_from_json
import re
import random

# Load tokenizers and model configuration
with open("qa_tokenizers.json", "r") as f:
    tokenizer_data = json.load(f)

tokenizer = tokenizer_from_json(tokenizer_data["word_tokenizer"])
ner_tokenizer = tokenizer_from_json(tokenizer_data["ner_tokenizer"])
srl_tokenizer = tokenizer_from_json(tokenizer_data["srl_tokenizer"])
answer_tokenizer = tokenizer_from_json(tokenizer_data["answer_tokenizer"])
q_type_tokenizer = tokenizer_from_json(tokenizer_data["q_type_tokenizer"])

max_context_len = tokenizer_data["max_context_len"]
max_question_len = tokenizer_data["max_question_len"]
max_token_len = tokenizer_data["max_token_len"]
q_type_vocab_size = len(q_type_tokenizer.word_index) + 1

# Load the trained model
model = load_model("qa_lstm_model_final.h5")


def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    return text


def predict_answer(context, question, tokens, ner, srl, q_type):
    context_seq = tokenizer.texts_to_sequences([preprocess_text(context)])
    question_seq = tokenizer.texts_to_sequences([preprocess_text(question)])
    token_seq = [tokenizer.texts_to_sequences([" ".join(tokens)])[0]]
    ner_seq = [ner_tokenizer.texts_to_sequences([" ".join(ner)])[0]]
    srl_seq = [srl_tokenizer.texts_to_sequences([" ".join(srl)])[0]]

    q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
    q_type_cat = tf.keras.utils.to_categorical([q_type_idx], num_classes=q_type_vocab_size)

    # Pad sequences
    context_pad = pad_sequences(context_seq, maxlen=max_context_len, padding="post")
    question_pad = pad_sequences(question_seq, maxlen=max_question_len, padding="post")
    token_pad = pad_sequences(token_seq, maxlen=max_token_len, padding="post")
    ner_pad = pad_sequences(ner_seq, maxlen=max_token_len, padding="post")
    srl_pad = pad_sequences(srl_seq, maxlen=max_token_len, padding="post")

    # Predict
    prediction = model.predict(
        [context_pad, question_pad, token_pad, ner_pad, srl_pad, q_type_cat], verbose=0
    )
    answer_idx = np.argmax(prediction[0])

    # Retrieve the predicted answer word
    for word, idx in answer_tokenizer.word_index.items():
        if idx == answer_idx:
            return word

    return "Unknown"


def generate_question_answer(context, tokens, ner, srl, question_type="isian"):
    entities = {}
    predicate = ""

    for i, token in enumerate(tokens):
        if ner[i] != "O":
            entities.setdefault(ner[i], []).append(token)
        if srl[i] == "V":
            predicate = token
        elif srl[i].startswith("ARG"):
            entities.setdefault(srl[i], []).append(token)

    subject = " ".join(entities.get("ARG0", [""]))

    if question_type == "isian":
        if "LOC" in entities:
            location = " ".join(entities["LOC"])
            return f"Dimana {subject} {predicate} ___", location
        elif "DATE" in entities:
            date = " ".join(entities["DATE"])
            return f"Kapan {subject} {predicate} ___", date

    elif question_type == "true_false":
        if "DATE" in entities:
            original_date = " ".join(entities["DATE"])
            try:
                modified_year = str(int(entities["DATE"][-1]) + random.randint(1, 5))
                modified_date = f"{entities['DATE'][0]} {entities['DATE'][1]} {modified_year}"
            except (ValueError, IndexError):
                modified_date = original_date  # Fallback if the date cannot be parsed
            return f"{subject} {predicate} pada {modified_date} ___", "false"

    elif question_type == "opsi":
        if "LOC" in entities:
            correct_location = " ".join(entities["LOC"])
            distractors = ["singasari", "kuta", "banten", "kediri", "makassar"]
            distractors = [d for d in distractors if d != correct_location]
            options = random.sample(distractors, 3) + [correct_location]
            random.shuffle(options)
            return f"Dimana {subject} {predicate} ___", options, correct_location

    return "Apa yang terjadi dalam teks ini ___", context

# ✅ Example usage with random sampling
if __name__ == "__main__":
    with open("data_converted.json", "r") as f:
        data = json.load(f)

    # Randomly select an example for testing
    test_item = random.choice(data)
    test_qa = random.choice(test_item["qas"])

    predicted_answer = predict_answer(
        test_item["context"],
        test_qa["question"],
        test_item["tokens"],
        test_item["ner"],
        test_item["srl"],
        test_qa["type"],
    )

    print(f"Context: {test_item['context']}")
    print(f"Question: {test_qa['question']}")
    print(f"True Answer: {test_qa['answer']}")
    print(f"Predicted Answer: {predicted_answer}")

    # Generate a random question example
    example_context = test_item["context"]
    example_tokens = test_item["tokens"]
    example_ner = test_item["ner"]
    example_srl = test_item["srl"]

    random_question_type = random.choice(["isian", "true_false", "opsi"])

    result = generate_question_answer(
        example_context, example_tokens, example_ner, example_srl, random_question_type
    )

    print("\nGenerated Question Example:")
    print(f"Context: {example_context}")
    print(f"Question Type: {random_question_type}")

    if random_question_type == "opsi":
        # Note: generate_question_answer falls back to a 2-tuple when the
        # sample has no LOC entity, in which case this unpacking raises
        # ValueError.
        question, options, correct_answer = result
        print(f"Generated Question: {question}")
        print(f"Options: {options}")
        print(f"Correct Answer: {correct_answer}")
    else:
        question, answer = result
        print(f"Generated Question: {question}")
        print(f"Answer: {answer}")

@@ -0,0 +1,54 @@
import json
import re
from collections import OrderedDict


def normalize_question(text):
    text = re.sub(r"\s+([?.!,])", r"\1", text)
    return text.capitalize()


# Load data
with open("../dataset/dev_dataset_qg.json", "r", encoding="utf-8") as file:
    data = json.load(file)

processed_data = []

for idx_entry, entry in enumerate(data):
    if not isinstance(entry, dict):
        continue

    if "context" not in entry:
        entry["context"] = " ".join(entry.get("tokens", []))

    # Update NER tags: replace 'V' with 'O'
    ner_tags = entry.get("ner", [])
    entry["ner"] = ["O" if tag == "V" else tag for tag in ner_tags]

    for idx_qa, qa in enumerate(entry.get("qas", [])):
        if "id" not in qa:
            qa["id"] = f"qa_{idx_entry}_q{idx_qa + 1}"

        answer = qa.get("answer")
        if isinstance(answer, list):
            qa["answer"] = " ".join(answer)

        question = qa.get("question")
        if isinstance(question, list):
            question_str = " ".join(question)
            qa["question"] = normalize_question(question_str)

    # Reorder fields: context first, then the rest
    ordered_entry = OrderedDict()
    if "context" in entry:
        ordered_entry["context"] = entry.pop("context")
    # Add the remaining fields in their original order
    for key, value in entry.items():
        ordered_entry[key] = value

    processed_data.append(ordered_entry)

# Save the result
with open("data_converted.json", "w", encoding="utf-8") as file:
    json.dump(processed_data, file, indent=2, ensure_ascii=False)

# Optional: print the first 2 entries for quick verification
print(json.dumps(processed_data[:2], indent=2, ensure_ascii=False))
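
# Worked example, traced by hand: a tokenized question such as
# ["dimana", "kartini", "lahir", "?"] is joined to "dimana kartini lahir ?";
# normalize_question() then pulls the "?" against the preceding word and
# capitalizes the first character, yielding "Dimana kartini lahir?".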

@@ -0,0 +1,53 @@
[
  {
    "context": "Raden Ajeng Kartini lahir pada 21 April 1879 di Jepara.",
    "tokens": [
      "raden",
      "ajeng",
      "kartini",
      "lahir",
      "pada",
      "21",
      "april",
      "1879",
      "di",
      "jepara"
    ],
    "ner_tags": [
      "PER",
      "PER",
      "PER",
      "V",
      "O",
      "DATE",
      "DATE",
      "DATE",
      "O",
      "LOC"
    ],
    "srl_tags": [
      "ARG0",
      "ARG0",
      "ARG0",
      "V",
      "O",
      "ARGM-TMP",
      "ARGM-TMP",
      "ARGM-TMP",
      "O",
      "ARGM-LOC"
    ],
    "qas": [
      {
        "id": "kartini_001_q1",
        "question": "Dimana Kartini lahir?",
        "answers": [{ "text": "Jepara", "answer_start": 10 }]
      },
      {
        "id": "kartini_001_q2",
        "question": "Kartini lahir pada tanggal ___?",
        "answers": [{ "text": "21 April 1879", "answer_start": 6 }]
      }
    ]
  }
]

File diff suppressed because it is too large

@@ -0,0 +1,3 @@
BLEU Score: 0.0585
Validation Accuracy: 0.6740
Validation Loss: 1.8080
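
For reference, a BLEU score like the one above is typically computed at the corpus level; the evaluation notebook further down already imports NLTK's corpus_bleu and SmoothingFunction for this. A minimal sketch of such a computation, with placeholder token lists rather than the project's actual predictions:

# Minimal sketch of a corpus BLEU computation with NLTK; the token lists here
# are illustrative placeholders, not this project's model outputs.
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# One list of reference token lists per sample, one hypothesis token list per sample.
references = [[["dimana", "kartini", "lahir", "___"]]]
hypotheses = [["dimana", "kartini", "lahir", "___"]]

smooth = SmoothingFunction().method1  # smoothing avoids zero n-gram counts on short texts
bleu = corpus_bleu(references, hypotheses, smoothing_function=smooth)
print(f"BLEU Score: {bleu:.4f}")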

@@ -0,0 +1,178 @@
[
  {
    "context": "raden ajeng kartini lahir pada 21 april 1879 di jepara",
    "tokens": [
      "raden",
      "ajeng",
      "kartini",
      "lahir",
      "pada",
      "21",
      "april",
      "1879",
      "di",
      "jepara"
    ],
    "ner": [
      "PER",
      "PER",
      "PER",
      "O",
      "O",
      "DATE",
      "DATE",
      "DATE",
      "O",
      "LOC"
    ],
    "srl": [
      "ARG0",
      "ARG0",
      "ARG0",
      "V",
      "O",
      "ARGM-TMP",
      "ARGM-TMP",
      "ARGM-TMP",
      "O",
      "ARGM-LOC"
    ],
    "qas": [
      {
        "type": "isian",
        "question": "Dimana kartini lahir ___",
        "answer": "jepara",
        "id": "qa_0_q1"
      },
      {
        "type": "true_false",
        "question": "Kartini lahir pada tanggal 21 mei 1879 ___",
        "options": [
          "true",
          "false"
        ],
        "answer": "false",
        "id": "qa_0_q2"
      }
    ]
  },
  {
    "context": "kerajaan majapahit berdiri pada tahun 1293 di trowulan",
    "tokens": [
      "kerajaan",
      "majapahit",
      "berdiri",
      "pada",
      "tahun",
      "1293",
      "di",
      "trowulan"
    ],
    "ner": [
      "O",
      "ORG",
      "O",
      "O",
      "O",
      "DATE",
      "O",
      "LOC"
    ],
    "srl": [
      "ARG1",
      "ARG1",
      "V",
      "O",
      "O",
      "ARGM-TMP",
      "O",
      "ARGM-LOC"
    ],
    "qas": [
      {
        "type": "opsi",
        "question": "Dimana kerajaan majapahit berdiri ___",
        "options": [
          "trowulan",
          "singasari",
          "kuta",
          "banten"
        ],
        "answer": "trowulan",
        "id": "qa_1_q1"
      },
      {
        "type": "true_false",
        "question": "Kerajaan majapahit berdiri pada tahun 1300 ___",
        "options": [
          "true",
          "false"
        ],
        "answer": "false",
        "id": "qa_1_q2"
      }
    ]
  },
  {
    "context": "soekarno dan mohammad hatta memproklamasikan kemerdekaan indonesia pada 17 agustus 1945",
    "tokens": [
      "soekarno",
      "dan",
      "mohammad",
      "hatta",
      "memproklamasikan",
      "kemerdekaan",
      "indonesia",
      "pada",
      "17",
      "agustus",
      "1945"
    ],
    "ner": [
      "PER",
      "O",
      "PER",
      "PER",
      "O",
      "O",
      "LOC",
      "O",
      "DATE",
      "DATE",
      "DATE"
    ],
    "srl": [
      "ARG0",
      "O",
      "ARG0",
      "ARG0",
      "V",
      "ARG1",
      "ARGM-LOC",
      "O",
      "ARGM-TMP",
      "ARGM-TMP",
      "ARGM-TMP"
    ],
    "qas": [
      {
        "type": "isian",
        "question": "Pada tanggal berapa kemerdekaan indonesia diproklamasikan ___",
        "answer": "17 agustus 1945",
        "id": "qa_2_q1"
      },
      {
        "type": "opsi",
        "question": "Siapa yang memproklamasikan kemerdekaan indonesia ___",
        "options": [
          "soekarno",
          "mohammad hatta",
          "sudirman",
          "ahmad yani"
        ],
        "answer": "soekarno mohammad hatta",
        "id": "qa_2_q2"
      }
    ]
  }
]

File diff suppressed because one or more lines are too long

@@ -0,0 +1,490 @@
import numpy as np
import pandas as pd
import json
import random
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
    Input,
    LSTM,
    Dense,
    Embedding,
    Bidirectional,
    Concatenate,
    Attention,
    Dropout,
)
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re


with open("data_converted.json", "r") as f:
    data = json.load(f)


# Preprocessing function
def preprocess_text(text):
    """Apply basic text preprocessing."""
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    return text


# Prepare the data for the model
def prepare_data(data):
    """Flatten the dataset into parallel lists, one entry per QA pair."""
    contexts = []
    tokens_list = []
    ner_list = []
    srl_list = []
    questions = []
    answers = []
    q_types = []

    for item in data:
        for qa in item["qas"]:
            contexts.append(preprocess_text(item["context"]))
            tokens_list.append(item["tokens"])
            ner_list.append(item["ner"])
            srl_list.append(item["srl"])
            questions.append(preprocess_text(qa["question"]))
            answers.append(qa["answer"])
            q_types.append(qa["type"])

    return contexts, tokens_list, ner_list, srl_list, questions, answers, q_types


contexts, tokens_list, ner_list, srl_list, questions, answers, q_types = prepare_data(
    data
)

max_vocab_size = 10000
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(contexts + questions + [" ".join(item) for item in tokens_list])
vocab_size = len(tokenizer.word_index) + 1

# Encoding for NER tags
ner_tokenizer = Tokenizer(oov_token="<OOV>")
ner_tokenizer.fit_on_texts([" ".join(ner) for ner in ner_list])
ner_vocab_size = len(ner_tokenizer.word_index) + 1

# Encoding for SRL tags.
# Note: the default filters would split hyphenated tags like "ARGM-TMP" in
# two and misalign them with the word tokens, so "-" is removed here.
srl_tokenizer = Tokenizer(
    oov_token="<OOV>", filters='!"#$%&()*+,./:;<=>?@[\\]^_`{|}~\t\n'
)
srl_tokenizer.fit_on_texts([" ".join(srl) for srl in srl_list])
srl_vocab_size = len(srl_tokenizer.word_index) + 1

# Encoding for question types
q_type_tokenizer = Tokenizer()
q_type_tokenizer.fit_on_texts(q_types)
q_type_vocab_size = len(q_type_tokenizer.word_index) + 1


# Convert tokens, NER, and SRL tags to sequences
def tokens_to_sequences(tokens, ner, srl):
    """Convert token, NER, and SRL lists to integer sequences."""
    token_seqs = [tokenizer.texts_to_sequences([" ".join(t)])[0] for t in tokens]
    ner_seqs = [ner_tokenizer.texts_to_sequences([" ".join(n)])[0] for n in ner]
    srl_seqs = [srl_tokenizer.texts_to_sequences([" ".join(s)])[0] for s in srl]
    return token_seqs, ner_seqs, srl_seqs


# Determine the maximum lengths for padding
context_seqs = tokenizer.texts_to_sequences(contexts)
question_seqs = tokenizer.texts_to_sequences(questions)
token_seqs, ner_seqs, srl_seqs = tokens_to_sequences(tokens_list, ner_list, srl_list)

max_context_len = max(len(seq) for seq in context_seqs)
max_question_len = max(len(seq) for seq in question_seqs)
max_token_len = max(len(seq) for seq in token_seqs)


# Pad sequences so all inputs have the same length
def pad_all_sequences(context_seqs, question_seqs, token_seqs, ner_seqs, srl_seqs):
    """Pad all sequences to their respective maximum lengths."""
    context_padded = pad_sequences(context_seqs, maxlen=max_context_len, padding="post")
    question_padded = pad_sequences(
        question_seqs, maxlen=max_question_len, padding="post"
    )
    token_padded = pad_sequences(token_seqs, maxlen=max_token_len, padding="post")
    ner_padded = pad_sequences(ner_seqs, maxlen=max_token_len, padding="post")
    srl_padded = pad_sequences(srl_seqs, maxlen=max_token_len, padding="post")
    return context_padded, question_padded, token_padded, ner_padded, srl_padded


# Set up the encoder for answers
answer_tokenizer = Tokenizer(oov_token="<OOV>")
answer_tokenizer.fit_on_texts(answers)
answer_vocab_size = len(answer_tokenizer.word_index) + 1

# Encode question types - FIX - use the index directly instead of a sequence
q_type_indices = []
for q_type in q_types:
    # Look up the question-type index (word_index starts at 1; 0 is kept for
    # unknown types)
    q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
    q_type_indices.append(q_type_idx)

# Convert to a numpy array
q_type_indices = np.array(q_type_indices)

# One-hot encode the question types
q_type_categorical = tf.keras.utils.to_categorical(
    q_type_indices, num_classes=q_type_vocab_size
)

# Pad sequences
context_padded, question_padded, token_padded, ner_padded, srl_padded = (
    pad_all_sequences(context_seqs, question_seqs, token_seqs, ner_seqs, srl_seqs)
)

# Encode answers
answer_seqs = answer_tokenizer.texts_to_sequences(answers)
max_answer_len = max(len(seq) for seq in answer_seqs)
answer_padded = pad_sequences(answer_seqs, maxlen=max_answer_len, padding="post")

# Split the data into train and test sets
indices = list(range(len(context_padded)))
train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)


# Helper to take a subset of the data by indices
def get_subset(data, indices):
    return np.array([data[i] for i in indices])


# Train data
train_context = get_subset(context_padded, train_indices)
train_question = get_subset(question_padded, train_indices)
train_token = get_subset(token_padded, train_indices)
train_ner = get_subset(ner_padded, train_indices)
train_srl = get_subset(srl_padded, train_indices)
train_q_type = get_subset(q_type_categorical, train_indices)
train_answer = get_subset(answer_padded, train_indices)

# Test data
test_context = get_subset(context_padded, test_indices)
test_question = get_subset(question_padded, test_indices)
test_token = get_subset(token_padded, test_indices)
test_ner = get_subset(ner_padded, test_indices)
test_srl = get_subset(srl_padded, test_indices)
test_q_type = get_subset(q_type_categorical, test_indices)
test_answer = get_subset(answer_padded, test_indices)

# Hyperparameters
embedding_dim = 100
lstm_units = 128
ner_embedding_dim = 50
srl_embedding_dim = 50
dropout_rate = 0.3


# Function that builds a model with two outputs: a question and an answer
def create_qa_generator_model():
    # Input layers
    context_input = Input(shape=(max_context_len,), name="context_input")
    token_input = Input(shape=(max_token_len,), name="token_input")
    ner_input = Input(shape=(max_token_len,), name="ner_input")
    srl_input = Input(shape=(max_token_len,), name="srl_input")

    # No question_input or q_type_input is needed for generation,
    # since these become the outputs.

    # Shared embedding layer for text
    text_embedding = Embedding(vocab_size, embedding_dim, name="text_embedding")

    # Embeddings for NER and SRL tags
    ner_embedding = Embedding(ner_vocab_size, ner_embedding_dim, name="ner_embedding")(
        ner_input
    )
    srl_embedding = Embedding(srl_vocab_size, srl_embedding_dim, name="srl_embedding")(
        srl_input
    )

    # Apply embeddings
    context_embed = text_embedding(context_input)
    token_embed = text_embedding(token_input)

    # Bi-directional LSTMs for context and token-level features
    context_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, name="context_lstm")
    )(context_embed)

    # Concatenate token features (tokens, NER, SRL)
    token_features = Concatenate(name="token_features")(
        [token_embed, ner_embedding, srl_embedding]
    )
    token_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, name="token_lstm")
    )(token_features)

    # Pool outputs
    context_pool = tf.keras.layers.GlobalMaxPooling1D(name="context_pool")(context_lstm)
    token_pool = tf.keras.layers.GlobalMaxPooling1D(name="token_pool")(token_lstm)

    # Concatenate all features
    all_features = Concatenate(name="all_features")([context_pool, token_pool])

    # Shared layers
    shared = Dense(256, activation="relu", name="shared_dense_1")(all_features)
    shared = Dropout(dropout_rate)(shared)
    shared = Dense(128, activation="relu", name="shared_dense_2")(shared)
    shared = Dropout(dropout_rate)(shared)

    # Question branch
    question_branch = Dense(256, activation="relu", name="question_dense")(shared)
    question_branch = Dropout(dropout_rate)(question_branch)

    # Answer branch
    answer_branch = Dense(256, activation="relu", name="answer_dense")(shared)
    answer_branch = Dropout(dropout_rate)(answer_branch)

    # Output layers
    # For the question, an LSTM decoder generates a sequence of words: the
    # question vector is repeated max_question_len times and decoded.
    question_decoder = LSTM(lstm_units, return_sequences=True, name="question_decoder")(
        tf.keras.layers.RepeatVector(max_question_len)(question_branch)
    )
    question_output = Dense(vocab_size, activation="softmax", name="question_output")(
        question_decoder
    )

    # Output layer for the answer
    answer_output = Dense(
        answer_vocab_size, activation="softmax", name="answer_output"
    )(answer_branch)

    # Create model
    model = Model(
        inputs=[
            context_input,
            token_input,
            ner_input,
            srl_input,
        ],
        outputs=[question_output, answer_output],
    )

    # Compile the model with a loss function and metrics for each output
    model.compile(
        optimizer="adam",
        loss={
            "question_output": "categorical_crossentropy",
            "answer_output": "sparse_categorical_crossentropy",
        },
        metrics={"question_output": "accuracy", "answer_output": "accuracy"},
        loss_weights={"question_output": 1.0, "answer_output": 1.0},
    )

    return model


# Prepare targets for the question output (one-hot encoded).
# The question target must be categorical because every word in the sequence
# is predicted at once.
def prepare_question_target(question_padded):
    question_target = []
    for question in question_padded:
        # One-hot encode every token in the sequence
        sequence_target = []
        for token in question:
            # Build the one-hot vector for this token
            token_target = tf.keras.utils.to_categorical(token, num_classes=vocab_size)
            sequence_target.append(token_target)
        question_target.append(sequence_target)
    return np.array(question_target)
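
# Alternative sketch (an assumption, not the approach used in this script):
# the one-hot targets built above materialise an array of shape
# (num_samples, max_question_len, vocab_size), which grows quickly with the
# vocabulary. Keras also accepts integer targets of shape
# (num_samples, max_question_len) for a (None, max_question_len, vocab_size)
# softmax output if the question loss is switched to
# "sparse_categorical_crossentropy", e.g.:
#
#     model.compile(
#         optimizer="adam",
#         loss={
#             "question_output": "sparse_categorical_crossentropy",
#             "answer_output": "sparse_categorical_crossentropy",
#         },
#         metrics={"question_output": "accuracy", "answer_output": "accuracy"},
#     )
#     history = model.fit(
#         [train_context, train_token, train_ner, train_srl],
#         {"question_output": train_question, "answer_output": train_answer_labels},
#         validation_data=(
#             [test_context, test_token, test_ner, test_srl],
#             {"question_output": test_question, "answer_output": test_answer_labels},
#         ),
#     )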


# Prepare targets for the question output
train_question_target = prepare_question_target(train_question)
test_question_target = prepare_question_target(test_question)

# Reshape answers for sparse categorical crossentropy
train_answer_labels = train_answer[:, 0]  # take the first answer token as the label
test_answer_labels = test_answer[:, 0]

# Build the model
model = create_qa_generator_model()
model.summary()

# Callback to save the best model
checkpoint = ModelCheckpoint(
    "qa_generator_model.h5",
    monitor="val_question_output_accuracy",
    save_best_only=True,
    verbose=1,
    mode="max",
)

early_stop = EarlyStopping(
    monitor="val_question_output_accuracy", patience=5, verbose=1, mode="max"
)

# Training
batch_size = 8
epochs = 50

# Train model
history = model.fit(
    [train_context, train_token, train_ner, train_srl],
    {"question_output": train_question_target, "answer_output": train_answer_labels},
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(
        [test_context, test_token, test_ner, test_srl],
        {"question_output": test_question_target, "answer_output": test_answer_labels},
    ),
    callbacks=[checkpoint, early_stop],
)

model.save("qa_generator_model_final.keras")

# Save the tokenizers
tokenizer_data = {
    "word_tokenizer": tokenizer.to_json(),
    "ner_tokenizer": ner_tokenizer.to_json(),
    "srl_tokenizer": srl_tokenizer.to_json(),
    "answer_tokenizer": answer_tokenizer.to_json(),
    "q_type_tokenizer": q_type_tokenizer.to_json(),
    "max_context_len": max_context_len,
    "max_question_len": max_question_len,
    "max_token_len": max_token_len,
}

with open("qa_generator_tokenizers.json", "w") as f:
    json.dump(tokenizer_data, f)


# Prediction helper
def predict_question_and_answer(model, context, tokens, ner, srl):
    """
    Predict a question and an answer from the context, tokens, NER, and SRL tags.
    """
    # Preprocess the inputs
    context_seq = tokenizer.texts_to_sequences([preprocess_text(context)])
    context_padded = pad_sequences(context_seq, maxlen=max_context_len, padding="post")

    token_seq = tokenizer.texts_to_sequences([" ".join(tokens)])
    token_padded = pad_sequences(token_seq, maxlen=max_token_len, padding="post")

    ner_seq = ner_tokenizer.texts_to_sequences([" ".join(ner)])
    ner_padded = pad_sequences(ner_seq, maxlen=max_token_len, padding="post")

    srl_seq = srl_tokenizer.texts_to_sequences([" ".join(srl)])
    srl_padded = pad_sequences(srl_seq, maxlen=max_token_len, padding="post")

    # Predict
    question_pred, answer_pred = model.predict(
        [context_padded, token_padded, ner_padded, srl_padded]
    )

    # Decode the question (take the highest-probability index at each position)
    question_indices = np.argmax(question_pred[0], axis=1)
    question_words = []

    # Reverse the word index to map indices back to words
    word_index = tokenizer.word_index
    index_word = {v: k for k, v in word_index.items()}

    # Decode the question
    for idx in question_indices:
        if idx != 0:  # skip padding (index 0)
            word = index_word.get(idx, "<UNK>")
            question_words.append(word)
        else:
            break  # stop at padding

    # Decode the answer
    answer_idx = np.argmax(answer_pred[0])

    # Reverse the word index for answers
    answer_word_index = answer_tokenizer.word_index
    answer_index_word = {v: k for k, v in answer_word_index.items()}

    answer = answer_index_word.get(answer_idx, "<UNK>")

    # Assemble the question string
    question = " ".join(question_words)

    return question, answer


# Example usage
# Note: this is only an example; actual data is needed in a real run
"""
sample_context = "Selamat pagi, sekarang adalah hari Senin."
sample_tokens = ["selamat", "pagi", "sekarang", "adalah", "hari", "senin"]
sample_ner = ["O", "O", "O", "O", "O", "B-TIME"]
sample_srl = ["B-V", "B-ARG1", "B-ARGM-TMP", "B-ARGM-PRD", "I-ARGM-PRD", "I-ARGM-PRD"]

# Load the trained model
loaded_model = load_model("qa_generator_model_final.keras")

# Predict
question, answer = predict_question_and_answer(
    loaded_model, sample_context, sample_tokens, sample_ner, sample_srl
)

print("Context:", sample_context)
print("Generated question:", question)
print("Generated answer:", answer)
"""

sample = {
    "context": "kerajaan majapahit berdiri pada tahun 1293 di trowulan",
    "tokens": [
        "kerajaan",
        "majapahit",
        "berdiri",
        "pada",
        "tahun",
        "1293",
        "di",
        "trowulan",
    ],
    "ner": ["O", "ORG", "O", "O", "O", "DATE", "O", "LOC"],
    "srl": ["ARG1", "ARG1", "V", "O", "O", "ARGM-TMP", "O", "ARGM-LOC"],
}
question, answer = predict_question_and_answer(
    model, sample["context"], sample["tokens"], sample["ner"], sample["srl"]
)

print("Context:", sample["context"])
print("Generated question:", question)
print("Generated answer:", answer)

# Plot the training history
# plt.figure(figsize=(12, 8))

# # Plot loss
# plt.subplot(2, 2, 1)
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('Model Loss')
# plt.ylabel('Loss')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='upper right')

# # Plot question output accuracy
# plt.subplot(2, 2, 2)
# plt.plot(history.history['question_output_accuracy'])
# plt.plot(history.history['val_question_output_accuracy'])
# plt.title('Question Output Accuracy')
# plt.ylabel('Accuracy')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='lower right')

# # Plot answer output accuracy
# plt.subplot(2, 2, 3)
# plt.plot(history.history['answer_output_accuracy'])
# plt.plot(history.history['val_answer_output_accuracy'])
# plt.title('Answer Output Accuracy')
# plt.ylabel('Accuracy')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='lower right')

# plt.tight_layout()
# plt.savefig("training_history.png")
# plt.show()
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long

@@ -1,615 +0,0 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "58e41ccb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json, pickle, random\n",
    "from pathlib import Path\n",
    "from itertools import chain\n",
    "\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "from tensorflow.keras.layers import (\n",
    "    Input, Embedding, LSTM, Concatenate,\n",
    "    Dense, TimeDistributed\n",
    ")\n",
    "from tensorflow.keras.models import Model\n",
    "from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction\n",
    "from rouge_score import rouge_scorer, scoring\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "a94dd46a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "flattened samples : 8\n"
     ]
    }
   ],
   "source": [
    "RAW = json.loads(Path(\"../dataset/dev_dataset_qg.json\").read_text())\n",
    "\n",
    "samples = []\n",
    "for item in RAW:\n",
    "    for qp in item[\"quiz_posibility\"]:\n",
    "        samples.append({\n",
    "            \"tokens\" : item[\"tokens\"],\n",
    "            \"ner\" : item[\"ner\"],\n",
    "            \"srl\" : item[\"srl\"],\n",
    "            \"q_type\" : qp[\"type\"],  # isian / opsi / benar_salah\n",
    "            \"q_toks\" : qp[\"question\"] + [\"<eos>\"],\n",
    "            \"a_toks\" : (qp[\"answer\"] if isinstance(qp[\"answer\"], list)\n",
    "                        else [qp[\"answer\"]]) + [\"<eos>\"]\n",
    "        })\n",
    "\n",
    "print(\"flattened samples :\", len(samples))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "852fb9a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_vocab(seq_iter, reserved=(\"<pad>\", \"<unk>\", \"<sos>\", \"<eos>\")):\n",
    "    vocab = {tok: idx for idx, tok in enumerate(reserved)}\n",
    "    for tok in chain.from_iterable(seq_iter):\n",
    "        vocab.setdefault(tok, len(vocab))\n",
    "    return vocab\n",
    "\n",
    "vocab_tok = build_vocab((s[\"tokens\"] for s in samples))\n",
    "vocab_ner = build_vocab((s[\"ner\"] for s in samples), reserved=(\"<pad>\",\"<unk>\"))\n",
    "vocab_srl = build_vocab((s[\"srl\"] for s in samples), reserved=(\"<pad>\",\"<unk>\"))\n",
    "vocab_q = build_vocab((s[\"q_toks\"] for s in samples))\n",
    "vocab_a = build_vocab((s[\"a_toks\"] for s in samples))\n",
    "vocab_typ = {\"isian\":0, \"opsi\":1, \"benar_salah\":2}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "fdf696cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "def enc(seq, v): return [v.get(t, v[\"<unk>\"]) for t in seq]\n",
    "\n",
    "MAX_SENT = max(len(s[\"tokens\"]) for s in samples)\n",
    "MAX_Q = max(len(s[\"q_toks\"]) for s in samples)\n",
    "MAX_A = max(len(s[\"a_toks\"]) for s in samples)\n",
    "\n",
    "def pad_batch(seqs, vmap, maxlen):\n",
    "    return tf.keras.preprocessing.sequence.pad_sequences(\n",
    "        [enc(s, vmap) for s in seqs], maxlen=maxlen, padding=\"post\"\n",
    "    )\n",
    "\n",
    "X_tok = pad_batch((s[\"tokens\"] for s in samples), vocab_tok, MAX_SENT)\n",
    "X_ner = pad_batch((s[\"ner\"] for s in samples), vocab_ner, MAX_SENT)\n",
    "X_srl = pad_batch((s[\"srl\"] for s in samples), vocab_srl, MAX_SENT)\n",
    "\n",
    "dec_q_in = pad_batch(\n",
    "    ([[\"<sos>\"]+s[\"q_toks\"][:-1] for s in samples]), vocab_q, MAX_Q)\n",
    "dec_q_out = pad_batch((s[\"q_toks\"] for s in samples), vocab_q, MAX_Q)\n",
    "\n",
    "dec_a_in = pad_batch(\n",
    "    ([[\"<sos>\"]+s[\"a_toks\"][:-1] for s in samples]), vocab_a, MAX_A)\n",
    "dec_a_out = pad_batch((s[\"a_toks\"] for s in samples), vocab_a, MAX_A)\n",
    "\n",
    "y_type = np.array([vocab_typ[s[\"q_type\"]] for s in samples])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "33074619",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Model: \"functional_2\"\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "[model.summary() table, recovered from HTML residue: inputs tok_in/ner_in/srl_in (None, 11); emb_tok Embedding (None, 11, 128), 4,992 params; emb_ner Embedding (None, 11, 32), 352 params; emb_srl Embedding (None, 11, 32), 288 params; mask/broadcast helper layers (not_equal_8, expand_dims_4, broadcast_to_4, ones_like_2, ones_like_3); concatenate_4 and concatenate_5 (None, 11, 192); decoder input dec_q_in (None, 9); remainder of the diff truncated]\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
|
||||
"│ │ │ │ emb_srl[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ any_2 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Any</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">11</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ concatenate_5[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>]… │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ dec_a_in │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">4</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ - │\n",
|
||||
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">InputLayer</span>) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ emb_q (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">9</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">128</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">3,968</span> │ dec_q_in[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ enc_lstm (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">LSTM</span>) │ [(<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>), │ <span style=\"color: #00af00; text-decoration-color: #00af00\">459,776</span> │ concatenate_4[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>]… │\n",
|
||||
"│ │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>), │ │ any_2[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
|
||||
"│ │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>)] │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ emb_a (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">4</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">128</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">1,792</span> │ dec_a_in[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ lstm_q (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">LSTM</span>) │ [(<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">9</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>), │ <span style=\"color: #00af00; text-decoration-color: #00af00\">394,240</span> │ emb_q[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>], │\n",
|
||||
"│ │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>), │ │ enc_lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">1</span>], │\n",
|
||||
"│ │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>)] │ │ enc_lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">2</span>] │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ not_equal_9 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">9</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ dec_q_in[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
|
||||
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">NotEqual</span>) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ lstm_a (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">LSTM</span>) │ [(<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">4</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>), │ <span style=\"color: #00af00; text-decoration-color: #00af00\">394,240</span> │ emb_a[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>], │\n",
|
||||
"│ │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>), │ │ enc_lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">1</span>], │\n",
|
||||
"│ │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>)] │ │ enc_lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">2</span>] │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ not_equal_10 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">4</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ dec_a_in[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
|
||||
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">NotEqual</span>) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ q_out │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">9</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">31</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">7,967</span> │ lstm_q[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>], │\n",
|
||||
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">TimeDistributed</span>) │ │ │ not_equal_9[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ a_out │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">4</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">14</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">3,598</span> │ lstm_a[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>], │\n",
|
||||
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">TimeDistributed</span>) │ │ │ not_equal_10[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">…</span> │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ type_out (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">3</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">771</span> │ enc_lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
|
||||
"└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n",
|
||||
"</pre>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
|
||||
"┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mConnected to \u001b[0m\u001b[1m \u001b[0m┃\n",
|
||||
"┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
|
||||
"│ tok_in (\u001b[38;5;33mInputLayer\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ ner_in (\u001b[38;5;33mInputLayer\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ srl_in (\u001b[38;5;33mInputLayer\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ not_equal_8 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ tok_in[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
|
||||
"│ (\u001b[38;5;33mNotEqual\u001b[0m) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ emb_ner (\u001b[38;5;33mEmbedding\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m, \u001b[38;5;34m32\u001b[0m) │ \u001b[38;5;34m352\u001b[0m │ ner_in[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ emb_srl (\u001b[38;5;33mEmbedding\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m, \u001b[38;5;34m32\u001b[0m) │ \u001b[38;5;34m288\u001b[0m │ srl_in[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ expand_dims_4 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m, \u001b[38;5;34m1\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ not_equal_8[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
|
||||
"│ (\u001b[38;5;33mExpandDims\u001b[0m) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ broadcast_to_4 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ expand_dims_4[\u001b[38;5;34m0\u001b[0m]… │\n",
|
||||
"│ (\u001b[38;5;33mBroadcastTo\u001b[0m) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ ones_like_2 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m, \u001b[38;5;34m32\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ emb_ner[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
|
||||
"│ (\u001b[38;5;33mOnesLike\u001b[0m) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ ones_like_3 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m, \u001b[38;5;34m32\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ emb_srl[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
|
||||
"│ (\u001b[38;5;33mOnesLike\u001b[0m) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ emb_tok (\u001b[38;5;33mEmbedding\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m4,992\u001b[0m │ tok_in[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ concatenate_5 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m, \u001b[38;5;34m192\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ broadcast_to_4[\u001b[38;5;34m0\u001b[0m… │\n",
|
||||
"│ (\u001b[38;5;33mConcatenate\u001b[0m) │ │ │ ones_like_2[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m… │\n",
|
||||
"│ │ │ │ ones_like_3[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ dec_q_in │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m9\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
|
||||
"│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ concatenate_4 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m, \u001b[38;5;34m192\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ emb_tok[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m], │\n",
|
||||
"│ (\u001b[38;5;33mConcatenate\u001b[0m) │ │ │ emb_ner[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m], │\n",
|
||||
"│ │ │ │ emb_srl[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ any_2 (\u001b[38;5;33mAny\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ concatenate_5[\u001b[38;5;34m0\u001b[0m]… │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ dec_a_in │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m4\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
|
||||
"│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ emb_q (\u001b[38;5;33mEmbedding\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m9\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m3,968\u001b[0m │ dec_q_in[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ enc_lstm (\u001b[38;5;33mLSTM\u001b[0m) │ [(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m), │ \u001b[38;5;34m459,776\u001b[0m │ concatenate_4[\u001b[38;5;34m0\u001b[0m]… │\n",
|
||||
"│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m), │ │ any_2[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
|
||||
"│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m)] │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ emb_a (\u001b[38;5;33mEmbedding\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m4\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m1,792\u001b[0m │ dec_a_in[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ lstm_q (\u001b[38;5;33mLSTM\u001b[0m) │ [(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m9\u001b[0m, \u001b[38;5;34m256\u001b[0m), │ \u001b[38;5;34m394,240\u001b[0m │ emb_q[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m], │\n",
|
||||
"│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m), │ │ enc_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m1\u001b[0m], │\n",
|
||||
"│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m)] │ │ enc_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m2\u001b[0m] │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ not_equal_9 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m9\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ dec_q_in[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
|
||||
"│ (\u001b[38;5;33mNotEqual\u001b[0m) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ lstm_a (\u001b[38;5;33mLSTM\u001b[0m) │ [(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m4\u001b[0m, \u001b[38;5;34m256\u001b[0m), │ \u001b[38;5;34m394,240\u001b[0m │ emb_a[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m], │\n",
|
||||
"│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m), │ │ enc_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m1\u001b[0m], │\n",
|
||||
"│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m)] │ │ enc_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m2\u001b[0m] │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ not_equal_10 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m4\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ dec_a_in[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
|
||||
"│ (\u001b[38;5;33mNotEqual\u001b[0m) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ q_out │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m9\u001b[0m, \u001b[38;5;34m31\u001b[0m) │ \u001b[38;5;34m7,967\u001b[0m │ lstm_q[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m], │\n",
|
||||
"│ (\u001b[38;5;33mTimeDistributed\u001b[0m) │ │ │ not_equal_9[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ a_out │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m4\u001b[0m, \u001b[38;5;34m14\u001b[0m) │ \u001b[38;5;34m3,598\u001b[0m │ lstm_a[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m], │\n",
|
||||
"│ (\u001b[38;5;33mTimeDistributed\u001b[0m) │ │ │ not_equal_10[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ type_out (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m3\u001b[0m) │ \u001b[38;5;34m771\u001b[0m │ enc_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
|
||||
"└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Total params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">1,271,984</span> (4.85 MB)\n",
|
||||
"</pre>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"\u001b[1m Total params: \u001b[0m\u001b[38;5;34m1,271,984\u001b[0m (4.85 MB)\n"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">1,271,984</span> (4.85 MB)\n",
|
||||
"</pre>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m1,271,984\u001b[0m (4.85 MB)\n"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Non-trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> (0.00 B)\n",
|
||||
"</pre>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"d_tok, d_tag, units = 128, 32, 256\n",
|
||||
"pad_tok, pad_q, pad_a = vocab_tok[\"<pad>\"], vocab_q[\"<pad>\"], vocab_a[\"<pad>\"]\n",
|
||||
"\n",
|
||||
"# ---- Encoder ----------------------------------------------------\n",
|
||||
"inp_tok = Input((MAX_SENT,), name=\"tok_in\")\n",
|
||||
"inp_ner = Input((MAX_SENT,), name=\"ner_in\")\n",
|
||||
"inp_srl = Input((MAX_SENT,), name=\"srl_in\")\n",
|
||||
"\n",
|
||||
"emb_tok = Embedding(len(vocab_tok), d_tok, mask_zero=True, name=\"emb_tok\")(inp_tok)\n",
|
||||
"emb_ner = Embedding(len(vocab_ner), d_tag, mask_zero=False, name=\"emb_ner\")(inp_ner)\n",
|
||||
"emb_srl = Embedding(len(vocab_srl), d_tag, mask_zero=False, name=\"emb_srl\")(inp_srl)\n",
|
||||
"\n",
|
||||
"enc_concat = Concatenate()([emb_tok, emb_ner, emb_srl])\n",
|
||||
"enc_out, state_h, state_c = LSTM(units, return_state=True, name=\"enc_lstm\")(enc_concat)\n",
|
||||
"\n",
|
||||
"# ---- Decoder : Question ----------------------------------------\n",
|
||||
"dec_q_inp = Input((MAX_Q,), name=\"dec_q_in\")\n",
|
||||
"dec_emb_q = Embedding(len(vocab_q), d_tok, mask_zero=True, name=\"emb_q\")(dec_q_inp)\n",
|
||||
"dec_q_seq, _, _ = LSTM(units, return_sequences=True, return_state=True,\n",
|
||||
" name=\"lstm_q\")(dec_emb_q, initial_state=[state_h, state_c])\n",
|
||||
"q_out = TimeDistributed(Dense(len(vocab_q), activation=\"softmax\"), name=\"q_out\")(dec_q_seq)\n",
|
||||
"\n",
|
||||
"# ---- Decoder : Answer ------------------------------------------\n",
|
||||
"dec_a_inp = Input((MAX_A,), name=\"dec_a_in\")\n",
|
||||
"dec_emb_a = Embedding(len(vocab_a), d_tok, mask_zero=True, name=\"emb_a\")(dec_a_inp)\n",
|
||||
"dec_a_seq, _, _ = LSTM(units, return_sequences=True, return_state=True,\n",
|
||||
" name=\"lstm_a\")(dec_emb_a, initial_state=[state_h, state_c])\n",
|
||||
"a_out = TimeDistributed(Dense(len(vocab_a), activation=\"softmax\"), name=\"a_out\")(dec_a_seq)\n",
|
||||
"\n",
|
||||
"# ---- Classifier -------------------------------------------------\n",
|
||||
"type_out = Dense(len(vocab_typ), activation=\"softmax\", name=\"type_out\")(enc_out)\n",
|
||||
"\n",
|
||||
"model = Model(\n",
|
||||
" [inp_tok, inp_ner, inp_srl, dec_q_inp, dec_a_inp],\n",
|
||||
" [q_out, a_out, type_out]\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# ---- Masked loss helpers ---------------------------------------\n",
|
||||
"scce = tf.keras.losses.SparseCategoricalCrossentropy(reduction=\"none\")\n",
|
||||
"def masked_loss_factory(pad_id):\n",
|
||||
" def loss(y_true, y_pred):\n",
|
||||
" l = scce(y_true, y_pred)\n",
|
||||
" mask = tf.cast(tf.not_equal(y_true, pad_id), tf.float32)\n",
|
||||
" return tf.reduce_sum(l*mask) / tf.reduce_sum(mask)\n",
|
||||
" return loss\n",
|
||||
"\n",
|
||||
"model.compile(\n",
|
||||
" optimizer=\"adam\",\n",
|
||||
" loss = {\"q_out\":masked_loss_factory(pad_q),\n",
|
||||
" \"a_out\":masked_loss_factory(pad_a),\n",
|
||||
" \"type_out\":\"sparse_categorical_crossentropy\"},\n",
|
||||
" loss_weights={\"q_out\":1.0, \"a_out\":1.0, \"type_out\":0.3},\n",
|
||||
" metrics={\"q_out\":\"sparse_categorical_accuracy\",\n",
|
||||
" \"a_out\":\"sparse_categorical_accuracy\",\n",
|
||||
" \"type_out\":tf.keras.metrics.SparseCategoricalAccuracy(name=\"type_acc\")}\n",
|
||||
")\n",
|
||||
"model.summary()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"id": "44d36899",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Epoch 1/30\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "TypeError",
|
||||
"evalue": "Exception encountered when calling BroadcastTo.call().\n\n\u001b[1mFailed to convert elements of (None, 11, 128) to Tensor. Consider casting elements to a supported type. See https://www.tensorflow.org/api_docs/python/tf/dtypes for supported TF dtypes.\u001b[0m\n\nArguments received by BroadcastTo.call():\n • x=tf.Tensor(shape=(None, 11, 1), dtype=bool)",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[18], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m history \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43mX_tok\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX_ner\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX_srl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdec_q_in\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdec_a_in\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43mdec_q_out\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdec_a_out\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_type\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalidation_split\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m30\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m64\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mtf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkeras\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcallbacks\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mEarlyStopping\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpatience\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m4\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrestore_best_weights\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m2\u001b[39;49m\n\u001b[1;32m 9\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m 10\u001b[0m model\u001b[38;5;241m.\u001b[39msave(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfull_seq2seq.keras\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# -----------------------------------------------------------------\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;66;03m# 5. SAVE VOCABS (.pkl keeps python dict intact)\u001b[39;00m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;66;03m# -----------------------------------------------------------------\u001b[39;00m\n",
|
||||
"File \u001b[0;32m/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/keras/src/utils/traceback_utils.py:122\u001b[0m, in \u001b[0;36mfilter_traceback.<locals>.error_handler\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 119\u001b[0m filtered_tb \u001b[38;5;241m=\u001b[39m _process_traceback_frames(e\u001b[38;5;241m.\u001b[39m__traceback__)\n\u001b[1;32m 120\u001b[0m \u001b[38;5;66;03m# To get the full stack trace, call:\u001b[39;00m\n\u001b[1;32m 121\u001b[0m \u001b[38;5;66;03m# `keras.config.disable_traceback_filtering()`\u001b[39;00m\n\u001b[0;32m--> 122\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\u001b[38;5;241m.\u001b[39mwith_traceback(filtered_tb) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 123\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m filtered_tb\n",
|
||||
"File \u001b[0;32m/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/keras/src/utils/traceback_utils.py:122\u001b[0m, in \u001b[0;36mfilter_traceback.<locals>.error_handler\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 119\u001b[0m filtered_tb \u001b[38;5;241m=\u001b[39m _process_traceback_frames(e\u001b[38;5;241m.\u001b[39m__traceback__)\n\u001b[1;32m 120\u001b[0m \u001b[38;5;66;03m# To get the full stack trace, call:\u001b[39;00m\n\u001b[1;32m 121\u001b[0m \u001b[38;5;66;03m# `keras.config.disable_traceback_filtering()`\u001b[39;00m\n\u001b[0;32m--> 122\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\u001b[38;5;241m.\u001b[39mwith_traceback(filtered_tb) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 123\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m filtered_tb\n",
|
||||
"\u001b[0;31mTypeError\u001b[0m: Exception encountered when calling BroadcastTo.call().\n\n\u001b[1mFailed to convert elements of (None, 11, 128) to Tensor. Consider casting elements to a supported type. See https://www.tensorflow.org/api_docs/python/tf/dtypes for supported TF dtypes.\u001b[0m\n\nArguments received by BroadcastTo.call():\n • x=tf.Tensor(shape=(None, 11, 1), dtype=bool)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"history = model.fit(\n",
|
||||
" [X_tok, X_ner, X_srl, dec_q_in, dec_a_in],\n",
|
||||
" [dec_q_out, dec_a_out, y_type],\n",
|
||||
" validation_split=0.1,\n",
|
||||
" epochs=30,\n",
|
||||
" batch_size=64,\n",
|
||||
" callbacks=[tf.keras.callbacks.EarlyStopping(patience=4, restore_best_weights=True)],\n",
|
||||
" verbose=2\n",
|
||||
")\n",
|
||||
"model.save(\"full_seq2seq.keras\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# -----------------------------------------------------------------\n",
|
||||
"# 5. SAVE VOCABS (.pkl keeps python dict intact)\n",
|
||||
"# -----------------------------------------------------------------\n",
|
||||
"def save_vocab(v, name): pickle.dump(v, open(name,\"wb\"))\n",
|
||||
"save_vocab(vocab_tok,\"vocab_tok.pkl\"); save_vocab(vocab_ner,\"vocab_ner.pkl\")\n",
|
||||
"save_vocab(vocab_srl,\"vocab_srl.pkl\"); save_vocab(vocab_q, \"vocab_q.pkl\")\n",
|
||||
"save_vocab(vocab_a, \"vocab_a.pkl\"); save_vocab(vocab_typ,\"vocab_typ.pkl\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "61003de5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def build_inference_models(trained):\n",
|
||||
" # encoder\n",
|
||||
" t_in = Input((MAX_SENT,), name=\"t_in\")\n",
|
||||
" n_in = Input((MAX_SENT,), name=\"n_in\")\n",
|
||||
" s_in = Input((MAX_SENT,), name=\"s_in\")\n",
|
||||
" e_t = trained.get_layer(\"emb_tok\")(t_in)\n",
|
||||
" e_n = trained.get_layer(\"emb_ner\")(n_in)\n",
|
||||
" e_s = trained.get_layer(\"emb_srl\")(s_in)\n",
|
||||
" concat = Concatenate()([e_t,e_n,e_s])\n",
|
||||
" _, h, c = trained.get_layer(\"enc_lstm\")(concat)\n",
|
||||
" enc_model = Model([t_in,n_in,s_in],[h,c])\n",
|
||||
"\n",
|
||||
" # question‑decoder\n",
|
||||
" dq_in = Input((1,), name=\"dq_tok\")\n",
|
||||
" dh = Input((units,), name=\"dh\"); dc = Input((units,), name=\"dc\")\n",
|
||||
" dq_emb = trained.get_layer(\"emb_q\")(dq_in)\n",
|
||||
" dq_lstm, nh, nc = trained.get_layer(\"lstm_q\")(dq_emb, initial_state=[dh,dc])\n",
|
||||
" dq_out = trained.get_layer(\"q_out\").layer(dq_lstm)\n",
|
||||
" dec_q_model = Model([dq_in, dh, dc], [dq_out, nh, nc])\n",
|
||||
"\n",
|
||||
" # answer‑decoder\n",
|
||||
" da_in = Input((1,), name=\"da_tok\")\n",
|
||||
" ah = Input((units,), name=\"ah\"); ac = Input((units,), name=\"ac\")\n",
|
||||
" da_emb = trained.get_layer(\"emb_a\")(da_in)\n",
|
||||
" da_lstm, nh2, nc2 = trained.get_layer(\"lstm_a\")(da_emb, initial_state=[ah,ac])\n",
|
||||
" da_out = trained.get_layer(\"a_out\").layer(da_lstm)\n",
|
||||
" dec_a_model = Model([da_in, ah, ac], [da_out, nh2, nc2])\n",
|
||||
"\n",
|
||||
" # type classifier\n",
|
||||
" type_dense = trained.get_layer(\"type_out\")\n",
|
||||
" type_model = Model([t_in,n_in,s_in], type_dense(_)) # use _ = enc_lstm output\n",
|
||||
"\n",
|
||||
" return enc_model, dec_q_model, dec_a_model, type_model\n",
|
||||
"\n",
|
||||
"encoder_model, decoder_q, decoder_a, classifier_model = build_inference_models(model)\n",
|
||||
"\n",
|
||||
"inv_q = {v:k for k,v in vocab_q.items()}\n",
|
||||
"inv_a = {v:k for k,v in vocab_a.items()}\n",
|
||||
"\n",
|
||||
"def enc_pad(seq, vmap, maxlen):\n",
|
||||
" x = [vmap.get(t, vmap[\"<unk>\"]) for t in seq]\n",
|
||||
" return x + [vmap[\"<pad>\"]] * (maxlen-len(x))\n",
|
||||
"\n",
|
||||
"def greedy_decode(tokens, ner, srl, max_q=20, max_a=10):\n",
|
||||
" et = np.array([enc_pad(tokens, vocab_tok, MAX_SENT)])\n",
|
||||
" en = np.array([enc_pad(ner, vocab_ner, MAX_SENT)])\n",
|
||||
" es = np.array([enc_pad(srl, vocab_srl, MAX_SENT)])\n",
|
||||
"\n",
|
||||
" h,c = encoder_model.predict([et,en,es], verbose=0)\n",
|
||||
"\n",
|
||||
" # --- question\n",
|
||||
" q_ids = []\n",
|
||||
" tgt = np.array([[vocab_q[\"<sos>\"]]])\n",
|
||||
" for _ in range(max_q):\n",
|
||||
" logits,h,c = decoder_q.predict([tgt,h,c], verbose=0)\n",
|
||||
" nxt = int(logits[0,-1].argmax())\n",
|
||||
" if nxt==vocab_q[\"<eos>\"]: break\n",
|
||||
" q_ids.append(nxt)\n",
|
||||
" tgt = np.array([[nxt]])\n",
|
||||
"\n",
|
||||
" # --- answer (re‑use fresh h,c)\n",
|
||||
" h,c = encoder_model.predict([et,en,es], verbose=0)\n",
|
||||
" a_ids = []\n",
|
||||
" tgt = np.array([[vocab_a[\"<sos>\"]]])\n",
|
||||
" for _ in range(max_a):\n",
|
||||
" logits,h,c = decoder_a.predict([tgt,h,c], verbose=0)\n",
|
||||
" nxt = int(logits[0,-1].argmax())\n",
|
||||
" if nxt==vocab_a[\"<eos>\"]: break\n",
|
||||
" a_ids.append(nxt)\n",
|
||||
" tgt = np.array([[nxt]])\n",
|
||||
"\n",
|
||||
" # --- type\n",
|
||||
" t_id = int(classifier_model.predict([et,en,es], verbose=0).argmax())\n",
|
||||
"\n",
|
||||
" return [inv_q[i] for i in q_ids], [inv_a[i] for i in a_ids], \\\n",
|
||||
" [k for k,v in vocab_typ.items() if v==t_id][0]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5279b631",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_tokens = [\"soekarno\",\"membacakan\",\"teks\",\"proklamasi\",\"pada\",\n",
|
||||
" \"17\",\"agustus\",\"1945\"]\n",
|
||||
"test_ner = [\"B-PER\",\"O\",\"O\",\"O\",\"O\",\"B-DATE\",\"I-DATE\",\"I-DATE\"]\n",
|
||||
"test_srl = [\"ARG0\",\"V\",\"ARG1\",\"ARG1\",\"O\",\"ARGM-TMP\",\"ARGM-TMP\",\"ARGM-TMP\"]\n",
|
||||
"\n",
|
||||
"q,a,t = greedy_decode(test_tokens,test_ner,test_srl,max_q=MAX_Q,max_a=MAX_A)\n",
|
||||
"print(\"\\nDEMO\\n----\")\n",
|
||||
"print(\"Q :\", \" \".join(q))\n",
|
||||
"print(\"A :\", \" \".join(a))\n",
|
||||
"print(\"T :\", t)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "850d4905",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"smooth = SmoothingFunction().method4\n",
|
||||
"r_scorer = rouge_scorer.RougeScorer([\"rouge1\",\"rougeL\"], use_stemmer=True)\n",
|
||||
"\n",
|
||||
"def strip_special(seq, pad_id, eos_id):\n",
|
||||
" return [x for x in seq if x not in (pad_id, eos_id)]\n",
|
||||
"\n",
|
||||
"def ids_to_text(ids, inv):\n",
|
||||
" return \" \".join(inv[i] for i in ids)\n",
|
||||
"\n",
|
||||
"def evaluate(n=200):\n",
|
||||
" idxs = random.sample(range(len(samples)), n)\n",
|
||||
" refs, hyps = [], []\n",
|
||||
" agg = scoring.BootstrapAggregator()\n",
|
||||
"\n",
|
||||
" for i in idxs:\n",
|
||||
" gt_ids = strip_special(dec_q_out[i], pad_q, vocab_q[\"<eos>\"])\n",
|
||||
" ref = ids_to_text(gt_ids, inv_q)\n",
|
||||
" pred = \" \".join(greedy_decode(\n",
|
||||
" samples[i][\"tokens\"],\n",
|
||||
" samples[i][\"ner\"],\n",
|
||||
" samples[i][\"srl\"]\n",
|
||||
" )[0])\n",
|
||||
" refs.append([ref.split()])\n",
|
||||
" hyps.append(pred.split())\n",
|
||||
" agg.add_scores(r_scorer.score(ref, pred))\n",
|
||||
"\n",
|
||||
" bleu = corpus_bleu(refs, hyps, smoothing_function=smooth)\n",
|
||||
" r1 = agg.aggregate()[\"rouge1\"].mid\n",
|
||||
" rL = agg.aggregate()[\"rougeL\"].mid\n",
|
||||
"\n",
|
||||
" print(f\"\\nEVAL (n={n})\")\n",
|
||||
" print(f\"BLEU‑4 : {bleu:.4f}\")\n",
|
||||
" print(f\"ROUGE‑1 : P={r1.precision:.3f} R={r1.recall:.3f} F1={r1.fmeasure:.3f}\")\n",
|
||||
" print(f\"ROUGE‑L : P={rL.precision:.3f} R={rL.recall:.3f} F1={rL.fmeasure:.3f}\")\n",
|
||||
"\n",
|
||||
"evaluate(2) "
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "myenv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
|
@@ -0,0 +1,357 @@
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
import json
|
||||
import random
|
||||
import tensorflow as tf
|
||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||||
from tensorflow.keras.models import Model, load_model
|
||||
from tensorflow.keras.layers import (
|
||||
Input,
|
||||
LSTM,
|
||||
Dense,
|
||||
Embedding,
|
||||
Bidirectional,
|
||||
Concatenate,
|
||||
Attention,
|
||||
Dropout,
|
||||
)
|
||||
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
|
||||
from sklearn.model_selection import train_test_split
|
||||
import matplotlib.pyplot as plt
|
||||
import re
|
||||
import string
|
||||
from collections import Counter
|
||||
|
||||
# Load data
|
||||
with open("data_converted.json", "r") as f:
|
||||
data = json.load(f)
|
||||
|
||||
|
||||
# Preprocessing function
|
||||
def preprocess_text(text):
|
||||
"""Melakukan preprocessing teks dasar"""
|
||||
text = text.lower()
|
||||
text = re.sub(r"\s+", " ", text).strip()
|
||||
return text
|
||||
|
||||
|
||||
# Prepare data for the question prediction model
|
||||
def prepare_question_prediction_data(data):
|
||||
"""Siapkan data untuk model prediksi pertanyaan"""
|
||||
contexts = []
|
||||
tokens_list = []
|
||||
ner_list = []
|
||||
srl_list = []
|
||||
questions = []
|
||||
answers = []
|
||||
q_types = []
|
||||
|
||||
for item in data:
|
||||
for qa in item["qas"]:
|
||||
contexts.append(preprocess_text(item["context"]))
|
||||
tokens_list.append(item["tokens"])
|
||||
ner_list.append(item["ner"])
|
||||
srl_list.append(item["srl"])
|
||||
questions.append(preprocess_text(qa["question"]))
|
||||
q_types.append(qa["type"])
|
||||
|
||||
return contexts, tokens_list, ner_list, srl_list, questions, q_types
|
||||
|
||||
|
||||
# Prepare the data
|
||||
contexts, tokens_list, ner_list, srl_list, questions, q_types = (
|
||||
prepare_question_prediction_data(data)
|
||||
)
|
||||
|
||||
# Tokenizer for text (context, question, answer)
|
||||
max_vocab_size = 10000
|
||||
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
|
||||
all_texts = contexts + questions + [" ".join(item) for item in tokens_list]
|
||||
tokenizer.fit_on_texts(all_texts)
|
||||
vocab_size = len(tokenizer.word_index) + 1
|
||||
|
||||
# Encoding for NER tags
|
||||
ner_tokenizer = Tokenizer(oov_token="<OOV>")
|
||||
ner_tokenizer.fit_on_texts([" ".join(ner) for ner in ner_list])
|
||||
ner_vocab_size = len(ner_tokenizer.word_index) + 1
|
||||
|
||||
# Encoding for SRL tags
|
||||
srl_tokenizer = Tokenizer(oov_token="<OOV>")
|
||||
srl_tokenizer.fit_on_texts([" ".join(srl) for srl in srl_list])
|
||||
srl_vocab_size = len(srl_tokenizer.word_index) + 1
|
||||
|
||||
# Encoding for question types
|
||||
q_type_tokenizer = Tokenizer()
|
||||
q_type_tokenizer.fit_on_texts(q_types)
|
||||
q_type_vocab_size = len(q_type_tokenizer.word_index) + 1
|
||||
|
||||
|
||||
# Convert tokens, NER, and SRL tags to sequences
|
||||
def tokens_to_sequences(tokens, ner, srl):
|
||||
"""Konversi token, ner, dan srl ke sequences"""
|
||||
token_seqs = [tokenizer.texts_to_sequences([" ".join(t)])[0] for t in tokens]
|
||||
ner_seqs = [ner_tokenizer.texts_to_sequences([" ".join(n)])[0] for n in ner]
|
||||
srl_seqs = [srl_tokenizer.texts_to_sequences([" ".join(s)])[0] for s in srl]
|
||||
return token_seqs, ner_seqs, srl_seqs
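# Illustrative only (hypothetical ids): after fitting, ["kerajaan", "majapahit"]
# becomes a single id sequence such as [[17, 52]]; unseen words map to the <OOV> id.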
|
||||
|
||||
|
||||
# Sequences
|
||||
context_seqs = tokenizer.texts_to_sequences(contexts)
|
||||
question_seqs = tokenizer.texts_to_sequences(questions)
|
||||
token_seqs, ner_seqs, srl_seqs = tokens_to_sequences(tokens_list, ner_list, srl_list)
|
||||
|
||||
# Determine the maximum lengths for padding
|
||||
max_context_len = max([len(seq) for seq in context_seqs])
|
||||
max_question_len = max([len(seq) for seq in question_seqs])
|
||||
max_token_len = max([len(seq) for seq in token_seqs])
|
||||
|
||||
|
||||
# Pad sequences so that all inputs have the same length
|
||||
def pad_all_sequences(context_seqs, token_seqs, ner_seqs, srl_seqs, question_seqs):
|
||||
"""Padding semua sequences"""
|
||||
context_padded = pad_sequences(context_seqs, maxlen=max_context_len, padding="post")
|
||||
token_padded = pad_sequences(token_seqs, maxlen=max_token_len, padding="post")
|
||||
ner_padded = pad_sequences(ner_seqs, maxlen=max_token_len, padding="post")
|
||||
srl_padded = pad_sequences(srl_seqs, maxlen=max_token_len, padding="post")
|
||||
question_padded = pad_sequences(
|
||||
question_seqs, maxlen=max_question_len, padding="post"
|
||||
)
|
||||
return (
|
||||
context_padded,
|
||||
token_padded,
|
||||
ner_padded,
|
||||
srl_padded,
|
||||
question_padded,
|
||||
)
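# e.g. pad_sequences([[4, 9]], maxlen=5, padding="post") -> [[4, 9, 0, 0, 0]]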
|
||||
|
||||
|
||||
# Encode the question types
|
||||
q_type_indices = []
|
||||
for q_type in q_types:
|
||||
q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
|
||||
q_type_indices.append(q_type_idx)
|
||||
|
||||
# Convert to a numpy array
|
||||
q_type_indices = np.array(q_type_indices)
|
||||
|
||||
# One-hot encode the question types
|
||||
q_type_categorical = tf.keras.utils.to_categorical(
|
||||
q_type_indices, num_classes=q_type_vocab_size
|
||||
)
|
||||
|
||||
# Pad sequences
|
||||
context_padded, token_padded, ner_padded, srl_padded, question_padded = (
|
||||
pad_all_sequences(context_seqs, token_seqs, ner_seqs, srl_seqs, question_seqs)
|
||||
)
|
||||
|
||||
# Split the data into train and test sets
|
||||
indices = list(range(len(context_padded)))
|
||||
train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)
|
||||
|
||||
|
||||
# Helper to take a subset of the data by indices
|
||||
def get_subset(data, indices):
|
||||
return np.array([data[i] for i in indices])
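# e.g. get_subset(np.arange(5), [0, 3]) -> array([0, 3])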
|
||||
|
||||
|
||||
# Train data
|
||||
train_context = get_subset(context_padded, train_indices)
|
||||
train_token = get_subset(token_padded, train_indices)
|
||||
train_ner = get_subset(ner_padded, train_indices)
|
||||
train_srl = get_subset(srl_padded, train_indices)
|
||||
train_q_type = get_subset(q_type_categorical, train_indices)
|
||||
train_question = get_subset(question_padded, train_indices)
|
||||
|
||||
# Test data
|
||||
test_context = get_subset(context_padded, test_indices)
|
||||
test_token = get_subset(token_padded, test_indices)
|
||||
test_ner = get_subset(ner_padded, test_indices)
|
||||
test_srl = get_subset(srl_padded, test_indices)
|
||||
test_q_type = get_subset(q_type_categorical, test_indices)
|
||||
test_question = get_subset(question_padded, test_indices)
|
||||
|
||||
# Hyperparameters
|
||||
embedding_dim = 100
|
||||
lstm_units = 128
|
||||
ner_embedding_dim = 50
|
||||
srl_embedding_dim = 50
|
||||
dropout_rate = 0.3
|
||||
|
||||
|
||||
# Function that builds the question prediction model
|
||||
def create_question_prediction_model():
|
||||
# Input layers
|
||||
context_input = Input(shape=(max_context_len,), name="context_input")
|
||||
token_input = Input(shape=(max_token_len,), name="token_input")
|
||||
ner_input = Input(shape=(max_token_len,), name="ner_input")
|
||||
srl_input = Input(shape=(max_token_len,), name="srl_input")
|
||||
q_type_input = Input(shape=(q_type_vocab_size,), name="q_type_input")
|
||||
|
||||
# Shared embedding layer for text
|
||||
text_embedding = Embedding(vocab_size, embedding_dim, name="text_embedding")
|
||||
|
||||
# Embeddings for NER and SRL
|
||||
ner_embedding = Embedding(ner_vocab_size, ner_embedding_dim, name="ner_embedding")(
|
||||
ner_input
|
||||
)
|
||||
srl_embedding = Embedding(srl_vocab_size, srl_embedding_dim, name="srl_embedding")(
|
||||
srl_input
|
||||
)
|
||||
|
||||
# Apply embeddings
|
||||
context_embed = text_embedding(context_input)
|
||||
token_embed = text_embedding(token_input)
|
||||
|
||||
# Bidirectional LSTM for context and token-level features
|
||||
context_lstm = Bidirectional(
|
||||
LSTM(lstm_units, return_sequences=True, name="context_lstm")
|
||||
)(context_embed)
|
||||
|
||||
# Concat token features (tokens, NER, SRL)
|
||||
token_features = Concatenate(name="token_features")(
|
||||
[token_embed, ner_embedding, srl_embedding]
|
||||
)
|
||||
token_lstm = Bidirectional(
|
||||
LSTM(lstm_units, return_sequences=True, name="token_lstm")
|
||||
)(token_features)
|
||||
|
||||
# Keras Attention expects a list [query, value]; use self-attention over the context
context_attention = tf.keras.layers.Attention(name="context_attention")(
[context_lstm, context_lstm]
)
|
||||
|
||||
# Pool attention outputs
|
||||
context_att_pool = tf.keras.layers.GlobalMaxPooling1D(name="context_att_pool")(
|
||||
context_attention
|
||||
)
|
||||
token_pool = tf.keras.layers.GlobalMaxPooling1D(name="token_pool")(token_lstm)
|
||||
|
||||
# Concat all features
|
||||
all_features = Concatenate(name="all_features")(
|
||||
[context_att_pool, token_pool, q_type_input]
|
||||
)
|
||||
|
||||
# Dense layers with expanded capacity for sequence generation
|
||||
x = Dense(512, activation="relu", name="dense_1")(all_features)
|
||||
x = Dropout(dropout_rate)(x)
|
||||
x = Dense(256, activation="relu", name="dense_2")(x)
|
||||
x = Dropout(dropout_rate)(x)
|
||||
|
||||
# Output projection for the sequence decoder
|
||||
decoder_dense = Dense(vocab_size, activation="softmax", name="decoder_dense")
|
||||
|
||||
# Many-to-many architecture for sequence generation
|
||||
# Decoder LSTM
|
||||
decoder_lstm = LSTM(lstm_units * 2, return_sequences=True, name="decoder_lstm")
|
||||
|
||||
# Project the dense features to the decoder input size
decoder_input = Dense(lstm_units * 2, activation="relu", name="decoder_input")(x)
|
||||
|
||||
# Decoder sequence (note: no teacher forcing; the repeated context drives each step)
|
||||
# Expand dimensionality to match expected sequence length
|
||||
repeated_vector = tf.keras.layers.RepeatVector(max_question_len)(decoder_input)
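# RepeatVector copies the (batch, 2*lstm_units) summary max_question_len times,
# producing the (batch, max_question_len, 2*lstm_units) sequence the decoder LSTM
# unrolls over; every timestep therefore sees the same context vector.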
|
||||
|
||||
# Process through decoder LSTM
|
||||
decoder_outputs = decoder_lstm(repeated_vector)
|
||||
|
||||
# Apply dense layer to each timestep
|
||||
question_output_seq = tf.keras.layers.TimeDistributed(decoder_dense)(
|
||||
decoder_outputs
|
||||
)
|
||||
|
||||
# Create model
|
||||
model = Model(
|
||||
inputs=[
|
||||
context_input,
|
||||
token_input,
|
||||
ner_input,
|
||||
srl_input,
|
||||
q_type_input,
|
||||
],
|
||||
outputs=question_output_seq,
|
||||
)
|
||||
|
||||
# Compile the model with sparse categorical crossentropy for sequence prediction
|
||||
model.compile(
|
||||
optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
|
||||
)
|
||||
|
||||
return model
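# The model maps the five padded inputs to a (batch, max_question_len, vocab_size)
# softmax sequence, so the sparse targets prepared below must have the shape
# (batch, max_question_len, 1).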
|
||||
|
||||
|
||||
# Build the model
|
||||
model = create_question_prediction_model()
|
||||
model.summary()
|
||||
|
||||
# Callback to save the best model
|
||||
checkpoint = ModelCheckpoint(
|
||||
"question_prediction_model.h5",
|
||||
monitor="val_accuracy",
|
||||
save_best_only=True,
|
||||
verbose=1,
|
||||
)
|
||||
|
||||
early_stop = EarlyStopping(monitor="val_accuracy", patience=10, verbose=1)
|
||||
|
||||
# Reshaping question data for sequence-to-sequence training
|
||||
# We need to reshape to (samples, max_question_len, 1) for sparse categorical crossentropy
|
||||
train_question_target = np.expand_dims(train_question, -1)
|
||||
test_question_target = np.expand_dims(test_question, -1)
|
||||
|
||||
# Training parameters
|
||||
batch_size = 8
|
||||
epochs = 50
|
||||
|
||||
# Train model
|
||||
history = model.fit(
|
||||
[train_context, train_token, train_ner, train_srl, train_q_type],
|
||||
train_question_target,
|
||||
batch_size=batch_size,
|
||||
epochs=epochs,
|
||||
validation_data=(
|
||||
[test_context, test_token, test_ner, test_srl, test_q_type],
|
||||
test_question_target,
|
||||
),
|
||||
callbacks=[checkpoint, early_stop],
|
||||
)
|
||||
|
||||
# # Plot training history
|
||||
# plt.figure(figsize=(12, 4))
|
||||
# plt.subplot(1, 2, 1)
|
||||
# plt.plot(history.history['accuracy'])
|
||||
# plt.plot(history.history['val_accuracy'])
|
||||
# plt.title('Model Accuracy')
|
||||
# plt.ylabel('Accuracy')
|
||||
# plt.xlabel('Epoch')
|
||||
# plt.legend(['Train', 'Validation'], loc='upper left')
|
||||
|
||||
# plt.subplot(1, 2, 2)
|
||||
# plt.plot(history.history['loss'])
|
||||
# plt.plot(history.history['val_loss'])
|
||||
# plt.title('Model Loss')
|
||||
# plt.ylabel('Loss')
|
||||
# plt.xlabel('Epoch')
|
||||
# plt.legend(['Train', 'Validation'], loc='upper left')
|
||||
# plt.tight_layout()
|
||||
# plt.savefig('question_prediction_training_history.png')
|
||||
# plt.show()
|
||||
|
||||
# Save the model and the tokenizers
|
||||
model.save("question_prediction_model_final.h5")
|
||||
|
||||
# Save the tokenizers
|
||||
tokenizer_data = {
|
||||
"word_tokenizer": tokenizer.to_json(),
|
||||
"ner_tokenizer": ner_tokenizer.to_json(),
|
||||
"srl_tokenizer": srl_tokenizer.to_json(),
|
||||
"q_type_tokenizer": q_type_tokenizer.to_json(),
|
||||
"max_context_len": max_context_len,
|
||||
"max_question_len": max_question_len,
|
||||
"max_token_len": max_token_len,
|
||||
}
|
||||
|
||||
with open("question_prediction_tokenizers.json", "w") as f:
|
||||
json.dump(tokenizer_data, f)
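# Hedged reload sketch (not executed here): tokenizer_from_json is the standard
# Keras counterpart of to_json(), so the artifacts saved above could be restored
# roughly like this:
#
#   from tensorflow.keras.preprocessing.text import tokenizer_from_json
#   with open("question_prediction_tokenizers.json") as f:
#       td = json.load(f)
#   word_tokenizer = tokenizer_from_json(td["word_tokenizer"])
#   model = load_model("question_prediction_model_final.h5")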
|
||||
|
||||
print("Model dan tokenizer untuk prediksi pertanyaan berhasil disimpan!")
|
|
@@ -0,0 +1,473 @@
|
|||
import numpy as np
|
||||
import json
|
||||
import random
|
||||
import tensorflow as tf
|
||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||||
from tensorflow.keras.models import Model, load_model
|
||||
from tensorflow.keras.layers import (
|
||||
Input,
|
||||
LSTM,
|
||||
Dense,
|
||||
Embedding,
|
||||
Bidirectional,
|
||||
Concatenate,
|
||||
Dropout,
|
||||
)
|
||||
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
|
||||
from sklearn.model_selection import train_test_split
|
||||
import matplotlib.pyplot as plt
|
||||
import re
|
||||
from rouge_score import rouge_scorer
|
||||
from nltk.translate.bleu_score import sentence_bleu
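# Hedged usage sketch for these metrics (standard rouge_score / NLTK APIs):
#   scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
#   scores = scorer.score(reference_text, predicted_text)  # dict of Score tuples
#   bleu = sentence_bleu([reference_tokens], predicted_tokens)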
|
||||
|
||||
# Load data
|
||||
with open("data_converted.json", "r") as f:
|
||||
data = json.load(f)
|
||||
|
||||
|
||||
# Preprocessing function
|
||||
def preprocess_text(text):
|
||||
"""Melakukan preprocessing teks dasar"""
|
||||
text = text.lower()
|
||||
text = re.sub(r"\s+", " ", text).strip()
|
||||
return text
|
||||
|
||||
|
||||
# Prepare data for the question prediction model
|
||||
def prepare_question_prediction_data(data):
|
||||
"""Siapkan data untuk model prediksi pertanyaan"""
|
||||
contexts = []
|
||||
tokens_list = []
|
||||
ner_list = []
|
||||
srl_list = []
|
||||
questions = []
|
||||
q_types = []
|
||||
|
||||
for item in data:
|
||||
for qa in item["qas"]:
|
||||
contexts.append(preprocess_text(item["context"]))
|
||||
tokens_list.append(item["tokens"])
|
||||
ner_list.append(item["ner"])
|
||||
srl_list.append(item["srl"])
|
||||
questions.append(preprocess_text(qa["question"]))
|
||||
q_types.append(qa["type"])
|
||||
# Tidak mengambil jawaban (answer) sebagai input
|
||||
|
||||
return contexts, tokens_list, ner_list, srl_list, questions, q_types
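
# Illustrative check: the loader flattens the data to one row per QA pair,
# so a single context with two questions yields two rows.
_demo = [{"context": "a b", "tokens": ["a", "b"], "ner": ["O", "O"], "srl": ["O", "O"],
          "qas": [{"type": "isian", "question": "q1 ___", "answer": "a"},
                  {"type": "true_false", "question": "q2 ___", "answer": "true"}]}]
assert len(prepare_question_prediction_data(_demo)[0]) == 2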


# Prepare the data
contexts, tokens_list, ner_list, srl_list, questions, q_types = (
    prepare_question_prediction_data(data)
)

# Tokenizer for text (context, question)
max_vocab_size = 10000
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
all_texts = contexts + questions + [" ".join(item) for item in tokens_list]
tokenizer.fit_on_texts(all_texts)
vocab_size = len(tokenizer.word_index) + 1

# Encoding for NER tags
ner_tokenizer = Tokenizer(oov_token="<OOV>")
ner_tokenizer.fit_on_texts([" ".join(ner) for ner in ner_list])
ner_vocab_size = len(ner_tokenizer.word_index) + 1

# Encoding for SRL tags
srl_tokenizer = Tokenizer(oov_token="<OOV>")
srl_tokenizer.fit_on_texts([" ".join(srl) for srl in srl_list])
srl_vocab_size = len(srl_tokenizer.word_index) + 1

# Encoding for question types
q_type_tokenizer = Tokenizer()
q_type_tokenizer.fit_on_texts(q_types)
q_type_vocab_size = len(q_type_tokenizer.word_index) + 1


# Convert tokens, NER, and SRL tags to sequences
def tokens_to_sequences(tokens, ner, srl):
    """Convert tokens, NER tags, and SRL tags to integer sequences"""
    token_seqs = [tokenizer.texts_to_sequences([" ".join(t)])[0] for t in tokens]
    ner_seqs = [ner_tokenizer.texts_to_sequences([" ".join(n)])[0] for n in ner]
    srl_seqs = [srl_tokenizer.texts_to_sequences([" ".join(s)])[0] for s in srl]
    return token_seqs, ner_seqs, srl_seqs


# Sequences
context_seqs = tokenizer.texts_to_sequences(contexts)
question_seqs = tokenizer.texts_to_sequences(questions)
token_seqs, ner_seqs, srl_seqs = tokens_to_sequences(tokens_list, ner_list, srl_list)

# Determine maximum lengths for padding
max_context_len = max(len(seq) for seq in context_seqs)
max_question_len = max(len(seq) for seq in question_seqs)
max_token_len = max(len(seq) for seq in token_seqs)


# Pad sequences so that all inputs have the same length
def pad_all_sequences(context_seqs, token_seqs, ner_seqs, srl_seqs, question_seqs):
    """Pad all sequences"""
    context_padded = pad_sequences(context_seqs, maxlen=max_context_len, padding="post")
    token_padded = pad_sequences(token_seqs, maxlen=max_token_len, padding="post")
    ner_padded = pad_sequences(ner_seqs, maxlen=max_token_len, padding="post")
    srl_padded = pad_sequences(srl_seqs, maxlen=max_token_len, padding="post")
    question_padded = pad_sequences(
        question_seqs, maxlen=max_question_len, padding="post"
    )
    return (
        context_padded,
        token_padded,
        ner_padded,
        srl_padded,
        question_padded,
    )
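
# pad_sequences reference behaviour (illustrative): post-padding appends zeros,
# e.g. pad_sequences([[5, 3]], maxlen=4, padding="post") -> [[5, 3, 0, 0]]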


# Encode question types
q_type_indices = []
for q_type in q_types:
    q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
    q_type_indices.append(q_type_idx)

# Convert to a numpy array
q_type_indices = np.array(q_type_indices)

# One-hot encode the question types
q_type_categorical = tf.keras.utils.to_categorical(
    q_type_indices, num_classes=q_type_vocab_size
)
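
# Note (illustrative): Keras Tokenizer indices start at 1, so with the three
# question types ("isian", "opsi", "true_false") q_type_vocab_size is 4 and
# column 0 of the one-hot matrix is only used for unseen types.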

# Pad sequences
context_padded, token_padded, ner_padded, srl_padded, question_padded = (
    pad_all_sequences(context_seqs, token_seqs, ner_seqs, srl_seqs, question_seqs)
)

# Split the data into train and test sets
indices = list(range(len(context_padded)))
train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)


# Helper to take a subset of the data by indices
def get_subset(data, indices):
    return np.array([data[i] for i in indices])
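
# Equivalent vectorized form (illustrative): the padded arrays are already numpy
# arrays, so fancy indexing would also work, e.g. data[np.asarray(indices)].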


# Train data
train_context = get_subset(context_padded, train_indices)
train_token = get_subset(token_padded, train_indices)
train_ner = get_subset(ner_padded, train_indices)
train_srl = get_subset(srl_padded, train_indices)
train_q_type = get_subset(q_type_categorical, train_indices)
train_question = get_subset(question_padded, train_indices)

# Test data
test_context = get_subset(context_padded, test_indices)
test_token = get_subset(token_padded, test_indices)
test_ner = get_subset(ner_padded, test_indices)
test_srl = get_subset(srl_padded, test_indices)
test_q_type = get_subset(q_type_categorical, test_indices)
test_question = get_subset(question_padded, test_indices)

# Hyperparameters
embedding_dim = 100
lstm_units = 128
ner_embedding_dim = 50
srl_embedding_dim = 50
dropout_rate = 0.3


# Build the question prediction model
def create_question_prediction_model():
    # Input layers
    context_input = Input(shape=(max_context_len,), name="context_input")
    token_input = Input(shape=(max_token_len,), name="token_input")
    ner_input = Input(shape=(max_token_len,), name="ner_input")
    srl_input = Input(shape=(max_token_len,), name="srl_input")
    q_type_input = Input(shape=(q_type_vocab_size,), name="q_type_input")

    # Shared embedding layer for text
    text_embedding = Embedding(vocab_size, embedding_dim, name="text_embedding")

    # Embeddings for NER and SRL tags
    ner_embedding = Embedding(ner_vocab_size, ner_embedding_dim, name="ner_embedding")(
        ner_input
    )
    srl_embedding = Embedding(srl_vocab_size, srl_embedding_dim, name="srl_embedding")(
        srl_input
    )

    # Apply embeddings
    context_embed = text_embedding(context_input)
    token_embed = text_embedding(token_input)

    # Bidirectional LSTMs for context and token-level features
    context_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, name="context_lstm")
    )(context_embed)

    # Concatenate token features (tokens, NER, SRL)
    token_features = Concatenate(name="token_features")(
        [token_embed, ner_embedding, srl_embedding]
    )
    token_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, name="token_lstm")
    )(token_features)

    # Apply self-attention to the context LSTM outputs
    context_attention = tf.keras.layers.Attention(name="context_attention")(
        [context_lstm, context_lstm]
    )

    # Pool the attention outputs
    context_att_pool = tf.keras.layers.GlobalMaxPooling1D(name="context_att_pool")(
        context_attention
    )
    token_pool = tf.keras.layers.GlobalMaxPooling1D(name="token_pool")(token_lstm)

    # Concatenate all features (no answer feature in this model)
    all_features = Concatenate(name="all_features")(
        [context_att_pool, token_pool, q_type_input]
    )

    # Dense layers with expanded capacity for sequence generation
    x = Dense(512, activation="relu", name="dense_1")(all_features)
    x = Dropout(dropout_rate)(x)
    x = Dense(256, activation="relu", name="dense_2")(x)
    x = Dropout(dropout_rate)(x)

    # Per-timestep projection onto the vocabulary for the sequence decoder
    decoder_dense = Dense(vocab_size, activation="softmax", name="decoder_dense")

    # Many-to-many architecture for sequence generation
    # Decoder LSTM
    decoder_lstm = LSTM(lstm_units * 2, return_sequences=True, name="decoder_lstm")

    # Project the pooled features into the decoder's input dimensionality
    decoder_input = Dense(lstm_units * 2, activation="relu", name="decoder_input")(x)

    # Expand dimensionality to match the expected sequence length
    # (no teacher forcing: the decoder sees the same summary vector at every step)
    repeated_vector = tf.keras.layers.RepeatVector(max_question_len)(decoder_input)

    # Process through the decoder LSTM
    decoder_outputs = decoder_lstm(repeated_vector)

    # Apply the dense projection to each timestep
    question_output_seq = tf.keras.layers.TimeDistributed(decoder_dense)(
        decoder_outputs
    )

    # Create the model
    model = Model(
        inputs=[
            context_input,
            token_input,
            ner_input,
            srl_input,
            q_type_input,
        ],
        outputs=question_output_seq,
    )

    # Compile the model with sparse categorical crossentropy for sequence prediction
    model.compile(
        optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
    )

    return model
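
# Shape check (illustrative): the model maps the five inputs to a per-timestep
# distribution over the vocabulary, i.e.
# create_question_prediction_model().output_shape == (None, max_question_len, vocab_size)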


# Build the model
model = create_question_prediction_model()
model.summary()

# Callback to save the best model
checkpoint = ModelCheckpoint(
    "question_prediction_model.h5",
    monitor="val_accuracy",
    save_best_only=True,
    verbose=1,
)

early_stop = EarlyStopping(monitor="val_accuracy", patience=10, verbose=1)

# Reshaping question data for sequence-to-sequence training
# We need targets of shape (samples, max_question_len, 1) for sparse categorical crossentropy
train_question_target = np.expand_dims(train_question, -1)
test_question_target = np.expand_dims(test_question, -1)

# Training parameters
batch_size = 8
epochs = 50

# Train model
history = model.fit(
    [train_context, train_token, train_ner, train_srl, train_q_type],
    train_question_target,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(
        [test_context, test_token, test_ner, test_srl, test_q_type],
        test_question_target,
    ),
    callbacks=[checkpoint, early_stop],
)

# Plot training history
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history["accuracy"])
plt.plot(history.history["val_accuracy"])
plt.title("Model Accuracy")
plt.ylabel("Accuracy")
plt.xlabel("Epoch")
plt.legend(["Train", "Validation"], loc="upper left")

plt.subplot(1, 2, 2)
plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.title("Model Loss")
plt.ylabel("Loss")
plt.xlabel("Epoch")
plt.legend(["Train", "Validation"], loc="upper left")
plt.tight_layout()
plt.savefig("question_prediction_training_history.png")
plt.show()

# Save the model and tokenizers
model.save("question_prediction_model_final.h5")

# Save the tokenizers
tokenizer_data = {
    "word_tokenizer": tokenizer.to_json(),
    "ner_tokenizer": ner_tokenizer.to_json(),
    "srl_tokenizer": srl_tokenizer.to_json(),
    "q_type_tokenizer": q_type_tokenizer.to_json(),
    "max_context_len": max_context_len,
    "max_question_len": max_question_len,
    "max_token_len": max_token_len,
}

with open("question_prediction_tokenizers.json", "w") as f:
    json.dump(tokenizer_data, f)

print("Question prediction model and tokenizers saved successfully!")


# Predict a question from the context and its features
def predict_question(context, tokens, ner, srl, q_type):
    context = preprocess_text(context)

    context_seq = tokenizer.texts_to_sequences([context])[0]
    token_seq = tokenizer.texts_to_sequences([" ".join(tokens)])[0]
    ner_seq = ner_tokenizer.texts_to_sequences([" ".join(ner)])[0]
    srl_seq = srl_tokenizer.texts_to_sequences([" ".join(srl)])[0]

    context_padded = pad_sequences(
        [context_seq], maxlen=max_context_len, padding="post"
    )
    token_padded = pad_sequences([token_seq], maxlen=max_token_len, padding="post")
    ner_padded = pad_sequences([ner_seq], maxlen=max_token_len, padding="post")
    srl_padded = pad_sequences([srl_seq], maxlen=max_token_len, padding="post")

    # Q-type one-hot encoding
    q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
    q_type_one_hot = tf.keras.utils.to_categorical(
        [q_type_idx], num_classes=q_type_vocab_size
    )

    # Predict
    pred = model.predict(
        [context_padded, token_padded, ner_padded, srl_padded, q_type_one_hot],
        verbose=1,
    )

    # Greedy decoding: take the most likely token at each timestep
    pred_seq = np.argmax(pred[0], axis=1)

    # Convert indices back to words, skipping padding (index 0)
    reverse_word_map = {v: k for k, v in tokenizer.word_index.items()}
    pred_words = [reverse_word_map.get(i, "") for i in pred_seq if i != 0]

    return " ".join(pred_words)


def evaluate_model_performance(test_data):
    """Evaluate predictions on the given test indices with BLEU and ROUGE"""

    # Initialize ROUGE scorer
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

    # Lists to store scores
    bleu_scores = []
    rouge1_scores = []
    rouge2_scores = []
    rougel_scores = []

    # Iterate through test data
    for i in range(len(test_data)):
        # Get test sample
        sample_context = contexts[test_data[i]]
        sample_tokens = tokens_list[test_data[i]]
        sample_ner = ner_list[test_data[i]]
        sample_srl = srl_list[test_data[i]]
        sample_q_type = q_types[test_data[i]]
        actual_question = questions[test_data[i]]

        # Predict question
        pred_question = predict_question(
            sample_context, sample_tokens, sample_ner, sample_srl, sample_q_type
        )

        # Tokenize for BLEU score
        actual_tokens = actual_question.split()
        pred_tokens = pred_question.split()

        # Calculate BLEU score
        # (default weights: unigram, bigram, trigram, and 4-gram)
        print("actual question:", actual_tokens)
        print("predicted question:", pred_tokens)
        bleu_score = sentence_bleu([actual_tokens], pred_tokens)
        bleu_scores.append(bleu_score)
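
        # Note (illustrative): with short hypotheses, sentence_bleu without a
        # smoothing function can return 0 and emit warnings; nltk's
        # SmoothingFunction (nltk.translate.bleu_score) is a common remedy, e.g.
        # sentence_bleu([actual_tokens], pred_tokens,
        #               smoothing_function=SmoothingFunction().method1)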

        try:
            rouge_scores = scorer.score(actual_question, pred_question)

            # Extract F1 scores
            rouge1_scores.append(rouge_scores["rouge1"].fmeasure)
            rouge2_scores.append(rouge_scores["rouge2"].fmeasure)
            rougel_scores.append(rouge_scores["rougeL"].fmeasure)
        except Exception as e:
            print(f"Error calculating ROUGE score: {e}")

    # Calculate average scores
    results = {
        "avg_bleu_score": np.mean(bleu_scores),
        "avg_rouge1": np.mean(rouge1_scores),
        "avg_rouge2": np.mean(rouge2_scores),
        "avg_rougel": np.mean(rougel_scores),
    }

    return results


# Reload the saved artifacts as a smoke test (the evaluation below still uses
# the in-memory model and tokenizers)
loaded_model = load_model("question_prediction_model_final.h5")

with open("question_prediction_tokenizers.json", "r") as f:
    tokenizer_data = json.load(f)

# Draw a random sample from the test data for spot checks
sample_idx = random.randint(0, len(test_indices) - 1)
sample_context = contexts[test_indices[sample_idx]]
sample_tokens = tokens_list[test_indices[sample_idx]]
sample_ner = ner_list[test_indices[sample_idx]]
sample_srl = srl_list[test_indices[sample_idx]]
sample_q_type = q_types[test_indices[sample_idx]]

performance_metrics = evaluate_model_performance(test_indices)

print("\nModel Performance Metrics:")
print(f"Average BLEU Score: {performance_metrics['avg_bleu_score']:.4f}")
print(f"Average ROUGE-1 Score: {performance_metrics['avg_rouge1']:.4f}")
print(f"Average ROUGE-2 Score: {performance_metrics['avg_rouge2']:.4f}")
print(f"Average ROUGE-L Score: {performance_metrics['avg_rougel']:.4f}")

@@ -0,0 +1,210 @@
import numpy as np
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import re


class QuestionPredictionModel:
    def __init__(self, model_path, tokenizer_path):
        """
        Initialize question prediction model with pre-trained model and tokenizers
        """
        # Load model
        self.model = load_model(model_path)

        # Load tokenizers
        with open(tokenizer_path, "r") as f:
            tokenizer_data = json.load(f)

        # Reconstruct tokenizers
        self.word_tokenizer = tokenizer_from_json(tokenizer_data["word_tokenizer"])
        self.ner_tokenizer = tokenizer_from_json(tokenizer_data["ner_tokenizer"])
        self.srl_tokenizer = tokenizer_from_json(tokenizer_data["srl_tokenizer"])
        self.q_type_tokenizer = tokenizer_from_json(tokenizer_data["q_type_tokenizer"])

        # Get max lengths
        # Note: this answer-aware loader expects a "max_answer_len" entry, which
        # the answer-free training script above does not write; it requires a
        # tokenizer file saved by an answer-aware training run.
        self.max_context_len = tokenizer_data["max_context_len"]
        self.max_answer_len = tokenizer_data["max_answer_len"]
        self.max_question_len = tokenizer_data["max_question_len"]
        self.max_token_len = tokenizer_data["max_token_len"]

        # Get vocabulary sizes
        self.vocab_size = len(self.word_tokenizer.word_index) + 1
        self.q_type_vocab_size = len(self.q_type_tokenizer.word_index) + 1

    def preprocess_text(self, text):
        """Basic text preprocessing"""
        text = text.lower()
        text = re.sub(r"\s+", " ", text).strip()
        return text

    def predict_question(self, context, answer, tokens, ner, srl, q_type):
        """
        Predict a question based on given context, answer, tokens, NER, SRL, and question type

        Args:
            context (str): The context text
            answer (str): The answer to generate a question for
            tokens (list): List of tokens
            ner (list): List of NER tags corresponding to tokens
            srl (list): List of SRL tags corresponding to tokens
            q_type (str): Question type ('isian', 'opsi', or 'true_false')

        Returns:
            str: The predicted question
        """
        # Preprocess inputs
        context = self.preprocess_text(context)
        answer = self.preprocess_text(answer)

        # Convert to sequences
        context_seq = self.word_tokenizer.texts_to_sequences([context])[0]
        answer_seq = self.word_tokenizer.texts_to_sequences([answer])[0]
        tokens_seq = self.word_tokenizer.texts_to_sequences([" ".join(tokens)])[0]
        ner_seq = self.ner_tokenizer.texts_to_sequences([" ".join(ner)])[0]
        srl_seq = self.srl_tokenizer.texts_to_sequences([" ".join(srl)])[0]

        # Pad sequences
        context_padded = pad_sequences([context_seq], maxlen=self.max_context_len, padding="post")
        answer_padded = pad_sequences([answer_seq], maxlen=self.max_answer_len, padding="post")
        tokens_padded = pad_sequences([tokens_seq], maxlen=self.max_token_len, padding="post")
        ner_padded = pad_sequences([ner_seq], maxlen=self.max_token_len, padding="post")
        srl_padded = pad_sequences([srl_seq], maxlen=self.max_token_len, padding="post")

        # One-hot encode question type
        q_type_idx = self.q_type_tokenizer.word_index.get(q_type, 0)
        q_type_categorical = tf.keras.utils.to_categorical(
            [q_type_idx], num_classes=self.q_type_vocab_size
        )

        # Make prediction
        predicted_seq = self.model.predict(
            [context_padded, answer_padded, tokens_padded, ner_padded, srl_padded, q_type_categorical]
        )

        # Convert predictions to tokens (taking the highest-probability token at each position)
        predicted_indices = np.argmax(predicted_seq[0], axis=1)

        # Create a reversed word index for converting indices back to words
        reverse_word_index = {v: k for k, v in self.word_tokenizer.word_index.items()}

        # Convert indices to words
        predicted_words = []
        for idx in predicted_indices:
            if idx != 0:  # Skip padding tokens
                predicted_words.append(reverse_word_index.get(idx, ""))

        # Form the question
        predicted_question = " ".join(predicted_words)

        # Add "___" to the end per the dataset's question convention if missing
        if "___" not in predicted_question:
            predicted_question += " ___"

        return predicted_question

    def batch_predict_questions(self, data):
        """
        Predict questions for a batch of data

        Args:
            data (list): List of dictionaries with context, tokens, ner, srl, and answers

        Returns:
            list: List of predicted questions
        """
        results = []

        for item in data:
            context = item["context"]
            tokens = item["tokens"]
            ner = item["ner"]
            srl = item["srl"]

            # If there are Q&A pairs, use them for evaluation
            if "qas" in item:
                for qa in item["qas"]:
                    answer = qa["answer"]
                    q_type = qa["type"]
                    ground_truth = qa["question"]

                    predicted_question = self.predict_question(
                        context, answer, tokens, ner, srl, q_type
                    )

                    results.append({
                        "context": context,
                        "answer": answer,
                        "predicted_question": predicted_question,
                        "ground_truth": ground_truth,
                        "question_type": q_type,
                    })
            else:
                # If no Q&A pairs, generate questions for all question types
                for q_type in ["isian", "true_false", "opsi"]:
                    # For demo purposes, use a placeholder answer (would need actual answers in real use)
                    # In practice, you might extract potential answers from the context
                    placeholders = {
                        "isian": "placeholder",
                        "true_false": "true",
                        "opsi": "placeholder",
                    }

                    predicted_question = self.predict_question(
                        context, placeholders[q_type], tokens, ner, srl, q_type
                    )

                    results.append({
                        "context": context,
                        "predicted_question": predicted_question,
                        "question_type": q_type,
                    })

        return results


# Example usage
if __name__ == "__main__":
    # Load test data
    with open("data_converted.json", "r") as f:
        test_data = json.load(f)

    # Initialize model
    question_predictor = QuestionPredictionModel(
        model_path="question_prediction_model_final.h5",
        tokenizer_path="question_prediction_tokenizers.json",
    )

    # Example single prediction
    sample = test_data[0]
    context = sample["context"]
    tokens = sample["tokens"]
    ner = sample["ner"]
    srl = sample["srl"]
    answer = sample["qas"][0]["answer"]
    q_type = sample["qas"][0]["type"]

    predicted_question = question_predictor.predict_question(
        context, answer, tokens, ner, srl, q_type
    )

    print(f"Context: {context}")
    print(f"Answer: {answer}")
    print(f"Question Type: {q_type}")
    print(f"Predicted Question: {predicted_question}")
    print(f"Ground Truth: {sample['qas'][0]['question']}")

    # Batch prediction
    results = question_predictor.batch_predict_questions(test_data[:3])

    print("\nBatch Results:")
    for i, result in enumerate(results):
        print(f"\nResult {i+1}:")
        print(f"Context: {result['context']}")
        print(f"Answer: {result.get('answer', 'N/A')}")
        print(f"Question Type: {result['question_type']}")
        print(f"Predicted Question: {result['predicted_question']}")
        if "ground_truth" in result:
            print(f"Ground Truth: {result['ground_truth']}")

@@ -0,0 +1,188 @@
import numpy as np
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import re


class QuestionPredictionModel:
    def __init__(self, model_path, tokenizer_path):
        """
        Initialize question prediction model with pre-trained model and tokenizers
        """
        # Load model
        self.model = load_model(model_path)

        # Load tokenizers
        with open(tokenizer_path, "r") as f:
            tokenizer_data = json.load(f)

        # Reconstruct tokenizers
        self.word_tokenizer = tokenizer_from_json(tokenizer_data["word_tokenizer"])
        self.ner_tokenizer = tokenizer_from_json(tokenizer_data["ner_tokenizer"])
        self.srl_tokenizer = tokenizer_from_json(tokenizer_data["srl_tokenizer"])
        self.q_type_tokenizer = tokenizer_from_json(tokenizer_data["q_type_tokenizer"])

        # Get max lengths
        self.max_context_len = tokenizer_data["max_context_len"]
        self.max_question_len = tokenizer_data["max_question_len"]
        self.max_token_len = tokenizer_data["max_token_len"]

        # Get vocabulary sizes
        self.vocab_size = len(self.word_tokenizer.word_index) + 1
        self.q_type_vocab_size = len(self.q_type_tokenizer.word_index) + 1

    def preprocess_text(self, text):
        """Basic text preprocessing"""
        text = text.lower()
        text = re.sub(r"\s+", " ", text).strip()
        return text

    def predict_question(self, context, tokens, ner, srl, q_type):
        """Predict a question from the context and the other features"""
        # Preprocess
        context = self.preprocess_text(context)

        # Convert to sequences
        context_seq = self.word_tokenizer.texts_to_sequences([context])[0]
        token_seq = self.word_tokenizer.texts_to_sequences([" ".join(tokens)])[0]
        ner_seq = self.ner_tokenizer.texts_to_sequences([" ".join(ner)])[0]
        srl_seq = self.srl_tokenizer.texts_to_sequences([" ".join(srl)])[0]

        # Pad sequences
        context_padded = pad_sequences(
            [context_seq], maxlen=self.max_context_len, padding="post"
        )
        token_padded = pad_sequences(
            [token_seq], maxlen=self.max_token_len, padding="post"
        )
        ner_padded = pad_sequences([ner_seq], maxlen=self.max_token_len, padding="post")
        srl_padded = pad_sequences([srl_seq], maxlen=self.max_token_len, padding="post")

        # Q-type one-hot encoding
        q_type_idx = self.q_type_tokenizer.word_index.get(q_type, 0)
        q_type_one_hot = tf.keras.utils.to_categorical(
            [q_type_idx], num_classes=self.q_type_vocab_size
        )

        # Predict
        pred = self.model.predict(
            [context_padded, token_padded, ner_padded, srl_padded, q_type_one_hot]
        )

        # Convert prediction to words
        pred_seq = np.argmax(pred[0], axis=1)

        # Convert indices to words
        reverse_word_map = {v: k for k, v in self.word_tokenizer.word_index.items()}
        pred_words = [reverse_word_map.get(i, "") for i in pred_seq if i != 0]

        return " ".join(pred_words)

    def batch_predict_questions(self, data):
        """
        Predict questions for a batch of data

        Args:
            data (list): List of dictionaries with context, tokens, ner, and srl

        Returns:
            list: List of predicted questions
        """
        results = []

        for item in data:
            context = item["context"]
            tokens = item["tokens"]
            ner = item["ner"]
            srl = item["srl"]

            # If there are Q&A pairs, use them for evaluation
            if "qas" in item:
                for qa in item["qas"]:
                    q_type = qa["type"]
                    ground_truth = qa["question"]

                    predicted_question = self.predict_question(
                        context, tokens, ner, srl, q_type
                    )

                    results.append(
                        {
                            "context": context,
                            "predicted_question": predicted_question,
                            "ground_truth": ground_truth,
                            "question_type": q_type,
                        }
                    )
            else:
                # If no Q&A pairs, generate a question for each question type.
                # This model takes no answer input, so no placeholder answer is needed.
                for q_type in ["isian", "true_false", "opsi"]:
                    predicted_question = self.predict_question(
                        context, tokens, ner, srl, q_type
                    )

                    results.append(
                        {
                            "context": context,
                            "predicted_question": predicted_question,
                            "question_type": q_type,
                        }
                    )

        return results


# Example usage
if __name__ == "__main__":
    # Load test data
    with open("data_converted.json", "r") as f:
        test_data = json.load(f)

    # Initialize model
    question_predictor = QuestionPredictionModel(
        model_path="question_prediction_model_final.h5",
        tokenizer_path="question_prediction_tokenizers.json",
    )

    # Example single prediction
    sample = test_data[0]
    context = sample["context"]
    tokens = sample["tokens"]
    ner = sample["ner"]
    srl = sample["srl"]
    answer = sample["qas"][0]["answer"]
    q_type = sample["qas"][0]["type"]

    predicted_question = question_predictor.predict_question(
        context, tokens, ner, srl, q_type
    )

    print(f"Context: {context}")
    print(f"Answer: {answer}")
    print(f"Question Type: {q_type}")
    print(f"Predicted Question: {predicted_question}")
    print(f"Ground Truth: {sample['qas'][0]['question']}")

    # Batch prediction
    # results = question_predictor.batch_predict_questions(test_data[:3])

    # print("\nBatch Results:")
    # for i, result in enumerate(results):
    #     print(f"\nResult {i+1}:")
    #     print(f"Context: {result['context']}")
    #     print(f"Answer: {result.get('answer', 'N/A')}")
    #     print(f"Question Type: {result['question_type']}")
    #     print(f"Predicted Question: {result['predicted_question']}")
    #     if "ground_truth" in result:
    #         print(f"Ground Truth: {result['ground_truth']}")
(Remaining changes in this commit: one file's diff suppressed because its lines are too long, plus two binary image files added, 50 KiB and 88 KiB.)