feat: add new model training

akhdanre 2025-05-14 23:07:52 +07:00
parent ad4b6d6137
commit f0f6f412bb
23 changed files with 42320 additions and 739 deletions

View File

@ -0,0 +1,424 @@
import numpy as np
import pandas as pd
import json
import random
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
Input,
LSTM,
Dense,
Embedding,
Bidirectional,
Concatenate,
Attention,
Dropout,
)
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re
import string
from collections import Counter
# Example data provided (kept for reference)
# data = [
# {
# "context": "raden ajeng kartini lahir pada 21 april 1879 di jepara",
# "tokens": [
# "raden", "ajeng", "kartini", "lahir", "pada", "21", "april", "1879", "di", "jepara"
# ],
# "ner": [
# "PER", "PER", "PER", "O", "O", "DATE", "DATE", "DATE", "O", "LOC"
# ],
# "srl": [
# "ARG0", "ARG0", "ARG0", "V", "O", "ARGM-TMP", "ARGM-TMP", "ARGM-TMP", "O", "ARGM-LOC"
# ],
# "qas": [
# {
# "type": "isian",
# "question": "Dimana kartini lahir ___",
# "answer": "jepara",
# "id": "qa_0_q1"
# },
# {
# "type": "true_false",
# "question": "Kartini lahir pada tanggal 21 mei 1879 ___",
# "options": ["true", "false"],
# "answer": "false",
# "id": "qa_0_q2"
# }
# ]
# },
# {
# "context": "kerajaan majapahit berdiri pada tahun 1293 di trowulan",
# "tokens": [
# "kerajaan", "majapahit", "berdiri", "pada", "tahun", "1293", "di", "trowulan"
# ],
# "ner": [
# "O", "ORG", "O", "O", "O", "DATE", "O", "LOC"
# ],
# "srl": [
# "ARG1", "ARG1", "V", "O", "O", "ARGM-TMP", "O", "ARGM-LOC"
# ],
# "qas": [
# {
# "type": "opsi",
# "question": "Dimana kerajaan majapahit berdiri ___",
# "options": ["trowulan", "singasari", "kuta", "banten"],
# "answer": "trowulan",
# "id": "qa_1_q1"
# },
# {
# "type": "true_false",
# "question": "Kerajaan majapahit berdiri pada tahun 1300 ___",
# "options": ["true", "false"],
# "answer": "false",
# "id": "qa_1_q2"
# }
# ]
# },
# {
# "context": "soekarno dan mohammad hatta memproklamasikan kemerdekaan indonesia pada 17 agustus 1945",
# "tokens": [
# "soekarno", "dan", "mohammad", "hatta", "memproklamasikan", "kemerdekaan", "indonesia", "pada", "17", "agustus", "1945"
# ],
# "ner": [
# "PER", "O", "PER", "PER", "O", "O", "LOC", "O", "DATE", "DATE", "DATE"
# ],
# "srl": [
# "ARG0", "O", "ARG0", "ARG0", "V", "ARG1", "ARGM-LOC", "O", "ARGM-TMP", "ARGM-TMP", "ARGM-TMP"
# ],
# "qas": [
# {
# "type": "isian",
# "question": "Pada tanggal berapa kemerdekaan indonesia diproklamasikan ___",
# "answer": "17 agustus 1945",
# "id": "qa_2_q1"
# },
# {
# "type": "opsi",
# "question": "Siapa yang memproklamasikan kemerdekaan indonesia ___",
# "options": ["soekarno", "mohammad hatta", "sudirman", "ahmad yani"],
# "answer": "soekarno mohammad hatta",
# "id": "qa_2_q2"
# }
# ]
# }
# ]
with open("data_converted.json", "r") as f:
data = json.load(f)
# # Save to a JSON file for future use
# with open('qa_dataset.json', 'w', encoding='utf-8') as f:
#     json.dump(data, f, ensure_ascii=False, indent=2)
# Preprocessing function
def preprocess_text(text):
"""Melakukan preprocessing teks dasar"""
text = text.lower()
text = re.sub(r"\s+", " ", text).strip()
return text
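# Illustrative sanity check: preprocess_text lowercases and collapses whitespace
assert preprocess_text("Raden  Ajeng   KARTINI ") == "raden ajeng kartini"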
# Prepare the data for the model
def prepare_data(data):
"""Prepare the data for the model"""
contexts = []
tokens_list = []
ner_list = []
srl_list = []
questions = []
answers = []
q_types = []
for item in data:
for qa in item["qas"]:
contexts.append(preprocess_text(item["context"]))
tokens_list.append(item["tokens"])
ner_list.append(item["ner"])
srl_list.append(item["srl"])
questions.append(preprocess_text(qa["question"]))
answers.append(qa["answer"])
q_types.append(qa["type"])
return contexts, tokens_list, ner_list, srl_list, questions, answers, q_types
# Prepare the data
contexts, tokens_list, ner_list, srl_list, questions, answers, q_types = prepare_data(
data
)
# Tokenizer for text (context and question)
max_vocab_size = 10000
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(contexts + questions + [" ".join(item) for item in tokens_list])
vocab_size = len(tokenizer.word_index) + 1
# Encoding for NER
ner_tokenizer = Tokenizer(oov_token="<OOV>")
ner_tokenizer.fit_on_texts([" ".join(ner) for ner in ner_list])
ner_vocab_size = len(ner_tokenizer.word_index) + 1
# Encoding for SRL
srl_tokenizer = Tokenizer(oov_token="<OOV>")
srl_tokenizer.fit_on_texts([" ".join(srl) for srl in srl_list])
srl_vocab_size = len(srl_tokenizer.word_index) + 1
# Encoding for question types
q_type_tokenizer = Tokenizer()
q_type_tokenizer.fit_on_texts(q_types)
q_type_vocab_size = len(q_type_tokenizer.word_index) + 1
# Convert tokens, NER, and SRL to sequences
def tokens_to_sequences(tokens, ner, srl):
"""Convert tokens, NER, and SRL tags to integer sequences"""
token_seqs = [tokenizer.texts_to_sequences([" ".join(t)])[0] for t in tokens]
ner_seqs = [ner_tokenizer.texts_to_sequences([" ".join(n)])[0] for n in ner]
srl_seqs = [srl_tokenizer.texts_to_sequences([" ".join(s)])[0] for s in srl]
return token_seqs, ner_seqs, srl_seqs
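# Note: joining tokens with spaces and re-tokenizing assumes the Keras Tokenizer
# splits them back into the same number of pieces; tokens that are pure
# punctuation are stripped by the default `filters`, which can shift the
# alignment with the NER/SRL tags.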
# Determine maximum lengths for padding
context_seqs = tokenizer.texts_to_sequences(contexts)
question_seqs = tokenizer.texts_to_sequences(questions)
token_seqs, ner_seqs, srl_seqs = tokens_to_sequences(tokens_list, ner_list, srl_list)
max_context_len = max([len(seq) for seq in context_seqs])
max_question_len = max([len(seq) for seq in question_seqs])
max_token_len = max([len(seq) for seq in token_seqs])
# Pad sequences so that all inputs have the same length
def pad_all_sequences(context_seqs, question_seqs, token_seqs, ner_seqs, srl_seqs):
"""Pad all sequences"""
context_padded = pad_sequences(context_seqs, maxlen=max_context_len, padding="post")
question_padded = pad_sequences(
question_seqs, maxlen=max_question_len, padding="post"
)
token_padded = pad_sequences(token_seqs, maxlen=max_token_len, padding="post")
ner_padded = pad_sequences(ner_seqs, maxlen=max_token_len, padding="post")
srl_padded = pad_sequences(srl_seqs, maxlen=max_token_len, padding="post")
return context_padded, question_padded, token_padded, ner_padded, srl_padded
# Prepare the encoder for answers
answer_tokenizer = Tokenizer(oov_token="<OOV>")
answer_tokenizer.fit_on_texts(answers)
answer_vocab_size = len(answer_tokenizer.word_index) + 1
# Encode question types - FIX - use direct indices instead of sequences
q_type_indices = []
for q_type in q_types:
# Look up the question-type index (0 is used for unseen types)
q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
q_type_indices.append(q_type_idx)
# Convert to a numpy array
q_type_indices = np.array(q_type_indices)
# One-hot encode the question types
q_type_categorical = tf.keras.utils.to_categorical(
q_type_indices, num_classes=q_type_vocab_size
)
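# q_type_categorical has shape (num_samples, q_type_vocab_size); e.g. with three
# question types a row is a one-hot vector such as [0., 1., 0., 0.] (the exact
# position depends on the fitted word_index, and index 0 is never used).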
# Pad sequences
context_padded, question_padded, token_padded, ner_padded, srl_padded = (
pad_all_sequences(context_seqs, question_seqs, token_seqs, ner_seqs, srl_seqs)
)
# Encode the answers
answer_seqs = answer_tokenizer.texts_to_sequences(answers)
max_answer_len = max([len(seq) for seq in answer_seqs])
answer_padded = pad_sequences(answer_seqs, maxlen=max_answer_len, padding="post")
# Split the data into train and test sets
indices = list(range(len(context_padded)))
train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)
# Helper to get a subset of the data by indices
def get_subset(data, indices):
return np.array([data[i] for i in indices])
# Train data
train_context = get_subset(context_padded, train_indices)
train_question = get_subset(question_padded, train_indices)
train_token = get_subset(token_padded, train_indices)
train_ner = get_subset(ner_padded, train_indices)
train_srl = get_subset(srl_padded, train_indices)
train_q_type = get_subset(q_type_categorical, train_indices)
train_answer = get_subset(answer_padded, train_indices)
# Test data
test_context = get_subset(context_padded, test_indices)
test_question = get_subset(question_padded, test_indices)
test_token = get_subset(token_padded, test_indices)
test_ner = get_subset(ner_padded, test_indices)
test_srl = get_subset(srl_padded, test_indices)
test_q_type = get_subset(q_type_categorical, test_indices)
test_answer = get_subset(answer_padded, test_indices)
# Hyperparameters
embedding_dim = 100
lstm_units = 128
ner_embedding_dim = 50
srl_embedding_dim = 50
dropout_rate = 0.3
# Function to build the model
def create_qa_model():
# Input layers
context_input = Input(shape=(max_context_len,), name="context_input")
question_input = Input(shape=(max_question_len,), name="question_input")
token_input = Input(shape=(max_token_len,), name="token_input")
ner_input = Input(shape=(max_token_len,), name="ner_input")
srl_input = Input(shape=(max_token_len,), name="srl_input")
q_type_input = Input(shape=(q_type_vocab_size,), name="q_type_input")
# Shared embedding layer for text
text_embedding = Embedding(vocab_size, embedding_dim, name="text_embedding")
# Embeddings for NER and SRL
ner_embedding = Embedding(ner_vocab_size, ner_embedding_dim, name="ner_embedding")(
ner_input
)
srl_embedding = Embedding(srl_vocab_size, srl_embedding_dim, name="srl_embedding")(
srl_input
)
# Apply embeddings
context_embed = text_embedding(context_input)
question_embed = text_embedding(question_input)
token_embed = text_embedding(token_input)
# Bidirectional LSTMs for context and token-level features
context_lstm = Bidirectional(
LSTM(lstm_units, return_sequences=True, name="context_lstm")
)(context_embed)
question_lstm = Bidirectional(
LSTM(lstm_units, return_sequences=True, name="question_lstm")
)(question_embed)
# Concat token features (tokens, NER, SRL)
token_features = Concatenate(name="token_features")(
[token_embed, ner_embedding, srl_embedding]
)
token_lstm = Bidirectional(
LSTM(lstm_units, return_sequences=True, name="token_lstm")
)(token_features)
# Attention mechanism over the context, attending to the question
context_attention = tf.keras.layers.Attention(name="context_attention")(
[context_lstm, question_lstm]
)
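# Keras' Attention layer takes [query, value]: the context states act as the
# query over the question states, so the output keeps the context's time axis.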
# Pool attention outputs
context_att_pool = tf.keras.layers.GlobalMaxPooling1D(name="context_att_pool")(
context_attention
)
question_pool = tf.keras.layers.GlobalMaxPooling1D(name="question_pool")(
question_lstm
)
token_pool = tf.keras.layers.GlobalMaxPooling1D(name="token_pool")(token_lstm)
# Concat all features
all_features = Concatenate(name="all_features")(
[context_att_pool, question_pool, token_pool, q_type_input]
)
# Dense layers
x = Dense(256, activation="relu", name="dense_1")(all_features)
x = Dropout(dropout_rate)(x)
x = Dense(128, activation="relu", name="dense_2")(x)
x = Dropout(dropout_rate)(x)
# Output layer for the answer
answer_output = Dense(
answer_vocab_size, activation="softmax", name="answer_output"
)(x)
# Create model
model = Model(
inputs=[
context_input,
question_input,
token_input,
ner_input,
srl_input,
q_type_input,
],
outputs=answer_output,
)
# Compile model
model.compile(
optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
return model
# Build the model
model = create_qa_model()
model.summary()
# Callback to save the best model
checkpoint = ModelCheckpoint(
"qa_lstm_model.h5", monitor="val_accuracy", save_best_only=True, verbose=1
)
early_stop = EarlyStopping(monitor="val_accuracy", patience=5, verbose=1)
# Training
batch_size = 8
epochs = 50
# Reshape the answers for sparse categorical crossentropy
train_answer_labels = train_answer[:, 0]  # Take the first token index of each answer
test_answer_labels = test_answer[:, 0]
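# Note: only the first answer token is used as the label, so multi-token answers
# such as "17 agustus 1945" are truncated to their first word during training.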
# Train model
history = model.fit(
[train_context, train_question, train_token, train_ner, train_srl, train_q_type],
train_answer_labels,
batch_size=batch_size,
epochs=epochs,
validation_data=(
[test_context, test_question, test_token, test_ner, test_srl, test_q_type],
test_answer_labels,
),
callbacks=[checkpoint, early_stop],
)
# Save the model and tokenizers
model.save("qa_lstm_model_final.h5")
# Save the tokenizers
tokenizer_data = {
"word_tokenizer": tokenizer.to_json(),
"ner_tokenizer": ner_tokenizer.to_json(),
"srl_tokenizer": srl_tokenizer.to_json(),
"answer_tokenizer": answer_tokenizer.to_json(),
"q_type_tokenizer": q_type_tokenizer.to_json(),
"max_context_len": max_context_len,
"max_question_len": max_question_len,
"max_token_len": max_token_len,
}
with open("qa_tokenizers.json", "w") as f:
json.dump(tokenizer_data, f)
print("Model dan tokenizer berhasil disimpan!")

View File

@ -0,0 +1,151 @@
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import tokenizer_from_json
import re
import random
# Load tokenizers and model configurations
with open("qa_tokenizers.json", "r") as f:
tokenizer_data = json.load(f)
tokenizer = tokenizer_from_json(tokenizer_data["word_tokenizer"])
ner_tokenizer = tokenizer_from_json(tokenizer_data["ner_tokenizer"])
srl_tokenizer = tokenizer_from_json(tokenizer_data["srl_tokenizer"])
answer_tokenizer = tokenizer_from_json(tokenizer_data["answer_tokenizer"])
q_type_tokenizer = tokenizer_from_json(tokenizer_data["q_type_tokenizer"])
max_context_len = tokenizer_data["max_context_len"]
max_question_len = tokenizer_data["max_question_len"]
max_token_len = tokenizer_data["max_token_len"]
q_type_vocab_size = len(q_type_tokenizer.word_index) + 1
# Load trained model
model = load_model("qa_lstm_model_final.h5")
def preprocess_text(text):
text = text.lower()
text = re.sub(r"\s+", " ", text).strip()
return text
def predict_answer(context, question, tokens, ner, srl, q_type):
context_seq = tokenizer.texts_to_sequences([preprocess_text(context)])
question_seq = tokenizer.texts_to_sequences([preprocess_text(question)])
token_seq = [tokenizer.texts_to_sequences([" ".join(tokens)])[0]]
ner_seq = [ner_tokenizer.texts_to_sequences([" ".join(ner)])[0]]
srl_seq = [srl_tokenizer.texts_to_sequences([" ".join(srl)])[0]]
q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
q_type_cat = tf.keras.utils.to_categorical([q_type_idx], num_classes=q_type_vocab_size)
# Pad sequences
context_pad = pad_sequences(context_seq, maxlen=max_context_len, padding="post")
question_pad = pad_sequences(question_seq, maxlen=max_question_len, padding="post")
token_pad = pad_sequences(token_seq, maxlen=max_token_len, padding="post")
ner_pad = pad_sequences(ner_seq, maxlen=max_token_len, padding="post")
srl_pad = pad_sequences(srl_seq, maxlen=max_token_len, padding="post")
# Predict
prediction = model.predict([context_pad, question_pad, token_pad, ner_pad, srl_pad, q_type_cat], verbose=0)
answer_idx = np.argmax(prediction[0])
# Retrieve predicted answer word
for word, idx in answer_tokenizer.word_index.items():
if idx == answer_idx:
return word
return "Unknown"
def generate_question_answer(context, tokens, ner, srl, question_type="isian"):
entities = {}
predicate = ""
for i, token in enumerate(tokens):
if ner[i] != "O":
entities.setdefault(ner[i], []).append(token)
if srl[i] == "V":
predicate = token
elif srl[i].startswith("ARG"):
entities.setdefault(srl[i], []).append(token)
subject = " ".join(entities.get("ARG0", [""]))
if question_type == "isian":
if "LOC" in entities:
location = " ".join(entities["LOC"])
return f"Dimana {subject} {predicate} ___", location
elif "DATE" in entities:
date = " ".join(entities["DATE"])
return f"Kapan {subject} {predicate} ___", date
elif question_type == "true_false":
if "DATE" in entities:
original_date = " ".join(entities["DATE"])
try:
# Assumes a 3-token DATE entity (day month year); shift the year to make the statement false
modified_year = str(int(entities['DATE'][-1]) + random.randint(1, 5))
modified_date = f"{entities['DATE'][0]} {entities['DATE'][1]} {modified_year}"
except (ValueError, IndexError):
modified_date = original_date  # Fallback if parsing fails
return f"{subject} {predicate} pada {modified_date} ___", "false"
elif question_type == "opsi":
if "LOC" in entities:
correct_location = " ".join(entities["LOC"])
distractors = ["singasari", "kuta", "banten", "kediri", "makassar"]
distractors = [d for d in distractors if d != correct_location]
options = random.sample(distractors, 3) + [correct_location]
random.shuffle(options)
return f"Dimana {subject} {predicate} ___", options, correct_location
return "Apa yang terjadi dalam teks ini ___", context
# ✅ Example Usage with Random Sampling
if __name__ == "__main__":
with open("data_converted.json", "r") as f:
data = json.load(f)
# Randomly select an example for testing
test_item = random.choice(data)
test_qa = random.choice(test_item["qas"])
predicted_answer = predict_answer(
test_item["context"],
test_qa["question"],
test_item["tokens"],
test_item["ner"],
test_item["srl"],
test_qa["type"]
)
print(f"Context: {test_item['context']}")
print(f"Question: {test_qa['question']}")
print(f"True Answer: {test_qa['answer']}")
print(f"Predicted Answer: {predicted_answer}")
# Generate Random Question Example
example_context = test_item["context"]
example_tokens = test_item["tokens"]
example_ner = test_item["ner"]
example_srl = test_item["srl"]
random_question_type = random.choice(["isian", "true_false", "opsi"])
result = generate_question_answer(
example_context, example_tokens, example_ner, example_srl, random_question_type
)
print("\nGenerated Question Example:")
print(f"Context: {example_context}")
print(f"Question Type: {random_question_type}")
if random_question_type == "opsi":
question, options, correct_answer = result
print(f"Generated Question: {question}")
print(f"Options: {options}")
print(f"Correct Answer: {correct_answer}")
else:
question, answer = result
print(f"Generated Question: {question}")
print(f"Answer: {answer}")

View File

@ -0,0 +1,54 @@
import json
import re
from collections import OrderedDict
def normalize_question(text):
text = re.sub(r'\s+([?.!,])', r'\1', text)
return text.capitalize()
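# e.g. normalize_question("dimana kartini lahir ?") -> "Dimana kartini lahir?"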
# Load data
with open('../dataset/dev_dataset_qg.json', 'r', encoding='utf-8') as file:
data = json.load(file)
processed_data = []
for idx_entry, entry in enumerate(data):
if not isinstance(entry, dict):
continue
if "context" not in entry:
entry["context"] = " ".join(entry.get("tokens", []))
# Update NER tags: change 'V' to 'O'
ner_tags = entry.get("ner", [])
entry["ner"] = ["O" if tag == "V" else tag for tag in ner_tags]
for idx_qa, qa in enumerate(entry.get("qas", [])):
if "id" not in qa:
qa["id"] = f"qa_{idx_entry}_q{idx_qa + 1}"
answer = qa.get("answer")
if isinstance(answer, list):
qa["answer"] = " ".join(answer)
question = qa.get("question")
if isinstance(question, list):
question_str = " ".join(question)
qa["question"] = normalize_question(question_str)
# Reorder fields: context first, then the rest
ordered_entry = OrderedDict()
if "context" in entry:
ordered_entry["context"] = entry.pop("context")
# Add remaining fields in their original order
for key, value in entry.items():
ordered_entry[key] = value
processed_data.append(ordered_entry)
# Save result
with open('data_converted.json', 'w', encoding='utf-8') as file:
json.dump(processed_data, file, indent=2, ensure_ascii=False)
# Optional: Print first 2 entries for quick verification
print(json.dumps(processed_data[:2], indent=2, ensure_ascii=False))

View File

@ -0,0 +1,53 @@
[
{
"context": "Raden Ajeng Kartini lahir pada 21 April 1879 di Jepara.",
"tokens": [
"raden",
"ajeng",
"kartini",
"lahir",
"pada",
"21",
"april",
"1879",
"di",
"jepara"
],
"ner_tags": [
"PER",
"PER",
"PER",
"V",
"O",
"DATE",
"DATE",
"DATE",
"O",
"LOC"
],
"srl_tags": [
"ARG0",
"ARG0",
"ARG0",
"V",
"O",
"ARGM-TMP",
"ARGM-TMP",
"ARGM-TMP",
"O",
"ARGM-LOC"
],
"qas": [
{
"id": "kartini_001_q1",
"question": "Dimana Kartini lahir?",
"answers": [{ "text": "Jepara", "answer_start": 10 }]
},
{
"id": "kartini_001_q2",
"question": "Kartini lahir pada tanggal ___?",
"answers": [{ "text": "21 April 1879", "answer_start": 6 }]
}
]
}
]

File diff suppressed because it is too large

View File

@ -0,0 +1,3 @@
BLEU Score: 0.0585
Validation Accuracy: 0.6740
Validation Loss: 1.8080
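For reference, the BLEU figure above is a corpus-level score; below is a minimal sketch of how such a number might be computed with nltk (the variable names and the smoothing choice are assumptions, not taken from this repository):

# Hypothetical sketch: corpus-level BLEU over predicted vs. reference token lists
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

references = [[["jepara"]], [["trowulan"]]]  # one list of reference token lists per sample
hypotheses = [["jepara"], ["singasari"]]     # predicted token lists
smooth = SmoothingFunction().method1         # smoothing avoids zero scores on short outputs
print(f"BLEU Score: {corpus_bleu(references, hypotheses, smoothing_function=smooth):.4f}")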

View File

@ -0,0 +1,178 @@
[
{
"context": "raden ajeng kartini lahir pada 21 april 1879 di jepara",
"tokens": [
"raden",
"ajeng",
"kartini",
"lahir",
"pada",
"21",
"april",
"1879",
"di",
"jepara"
],
"ner": [
"PER",
"PER",
"PER",
"O",
"O",
"DATE",
"DATE",
"DATE",
"O",
"LOC"
],
"srl": [
"ARG0",
"ARG0",
"ARG0",
"V",
"O",
"ARGM-TMP",
"ARGM-TMP",
"ARGM-TMP",
"O",
"ARGM-LOC"
],
"qas": [
{
"type": "isian",
"question": "Dimana kartini lahir ___",
"answer": "jepara",
"id": "qa_0_q1"
},
{
"type": "true_false",
"question": "Kartini lahir pada tanggal 21 mei 1879 ___",
"options": [
"true",
"false"
],
"answer": "false",
"id": "qa_0_q2"
}
]
},
{
"context": "kerajaan majapahit berdiri pada tahun 1293 di trowulan",
"tokens": [
"kerajaan",
"majapahit",
"berdiri",
"pada",
"tahun",
"1293",
"di",
"trowulan"
],
"ner": [
"O",
"ORG",
"O",
"O",
"O",
"DATE",
"O",
"LOC"
],
"srl": [
"ARG1",
"ARG1",
"V",
"O",
"O",
"ARGM-TMP",
"O",
"ARGM-LOC"
],
"qas": [
{
"type": "opsi",
"question": "Dimana kerajaan majapahit berdiri ___",
"options": [
"trowulan",
"singasari",
"kuta",
"banten"
],
"answer": "trowulan",
"id": "qa_1_q1"
},
{
"type": "true_false",
"question": "Kerajaan majapahit berdiri pada tahun 1300 ___",
"options": [
"true",
"false"
],
"answer": "false",
"id": "qa_1_q2"
}
]
},
{
"context": "soekarno dan mohammad hatta memproklamasikan kemerdekaan indonesia pada 17 agustus 1945",
"tokens": [
"soekarno",
"dan",
"mohammad",
"hatta",
"memproklamasikan",
"kemerdekaan",
"indonesia",
"pada",
"17",
"agustus",
"1945"
],
"ner": [
"PER",
"O",
"PER",
"PER",
"O",
"O",
"LOC",
"O",
"DATE",
"DATE",
"DATE"
],
"srl": [
"ARG0",
"O",
"ARG0",
"ARG0",
"V",
"ARG1",
"ARGM-LOC",
"O",
"ARGM-TMP",
"ARGM-TMP",
"ARGM-TMP"
],
"qas": [
{
"type": "isian",
"question": "Pada tanggal berapa kemerdekaan indonesia diproklamasikan ___",
"answer": "17 agustus 1945",
"id": "qa_2_q1"
},
{
"type": "opsi",
"question": "Siapa yang memproklamasikan kemerdekaan indonesia ___",
"options": [
"soekarno",
"mohammad hatta",
"sudirman",
"ahmad yani"
],
"answer": "soekarno mohammad hatta",
"id": "qa_2_q2"
}
]
}
]

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,490 @@
import numpy as np
import pandas as pd
import json
import random
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
Input,
LSTM,
Dense,
Embedding,
Bidirectional,
Concatenate,
Attention,
Dropout,
)
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re
with open("data_converted.json", "r") as f:
data = json.load(f)
# Preprocessing function
def preprocess_text(text):
"""Melakukan preprocessing teks dasar"""
text = text.lower()
text = re.sub(r"\s+", " ", text).strip()
return text
# Prepare the data for the model
def prepare_data(data):
"""Prepare the data for the model"""
contexts = []
tokens_list = []
ner_list = []
srl_list = []
questions = []
answers = []
q_types = []
for item in data:
for qa in item["qas"]:
contexts.append(preprocess_text(item["context"]))
tokens_list.append(item["tokens"])
ner_list.append(item["ner"])
srl_list.append(item["srl"])
questions.append(preprocess_text(qa["question"]))
answers.append(qa["answer"])
q_types.append(qa["type"])
return contexts, tokens_list, ner_list, srl_list, questions, answers, q_types
contexts, tokens_list, ner_list, srl_list, questions, answers, q_types = prepare_data(
data
)
max_vocab_size = 10000
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(contexts + questions + [" ".join(item) for item in tokens_list])
vocab_size = len(tokenizer.word_index) + 1
# Encoding for NER
ner_tokenizer = Tokenizer(oov_token="<OOV>")
ner_tokenizer.fit_on_texts([" ".join(ner) for ner in ner_list])
ner_vocab_size = len(ner_tokenizer.word_index) + 1
# Encoding for SRL
srl_tokenizer = Tokenizer(oov_token="<OOV>")
srl_tokenizer.fit_on_texts([" ".join(srl) for srl in srl_list])
srl_vocab_size = len(srl_tokenizer.word_index) + 1
# Encoding for question types
q_type_tokenizer = Tokenizer()
q_type_tokenizer.fit_on_texts(q_types)
q_type_vocab_size = len(q_type_tokenizer.word_index) + 1
# Convert tokens, NER, and SRL to sequences
def tokens_to_sequences(tokens, ner, srl):
"""Convert tokens, NER, and SRL tags to integer sequences"""
token_seqs = [tokenizer.texts_to_sequences([" ".join(t)])[0] for t in tokens]
ner_seqs = [ner_tokenizer.texts_to_sequences([" ".join(n)])[0] for n in ner]
srl_seqs = [srl_tokenizer.texts_to_sequences([" ".join(s)])[0] for s in srl]
return token_seqs, ner_seqs, srl_seqs
# Determine maximum lengths for padding
context_seqs = tokenizer.texts_to_sequences(contexts)
question_seqs = tokenizer.texts_to_sequences(questions)
token_seqs, ner_seqs, srl_seqs = tokens_to_sequences(tokens_list, ner_list, srl_list)
max_context_len = max([len(seq) for seq in context_seqs])
max_question_len = max([len(seq) for seq in question_seqs])
max_token_len = max([len(seq) for seq in token_seqs])
# Pad sequences so that all inputs have the same length
def pad_all_sequences(context_seqs, question_seqs, token_seqs, ner_seqs, srl_seqs):
"""Pad all sequences"""
context_padded = pad_sequences(context_seqs, maxlen=max_context_len, padding="post")
question_padded = pad_sequences(
question_seqs, maxlen=max_question_len, padding="post"
)
token_padded = pad_sequences(token_seqs, maxlen=max_token_len, padding="post")
ner_padded = pad_sequences(ner_seqs, maxlen=max_token_len, padding="post")
srl_padded = pad_sequences(srl_seqs, maxlen=max_token_len, padding="post")
return context_padded, question_padded, token_padded, ner_padded, srl_padded
# Prepare the encoder for answers
answer_tokenizer = Tokenizer(oov_token="<OOV>")
answer_tokenizer.fit_on_texts(answers)
answer_vocab_size = len(answer_tokenizer.word_index) + 1
# Encode question types - FIX - use direct indices instead of sequences
q_type_indices = []
for q_type in q_types:
# Look up the question-type index (0 is used for unseen types)
q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
q_type_indices.append(q_type_idx)
# Convert to a numpy array
q_type_indices = np.array(q_type_indices)
# One-hot encode the question types
q_type_categorical = tf.keras.utils.to_categorical(
q_type_indices, num_classes=q_type_vocab_size
)
# Pad sequences
context_padded, question_padded, token_padded, ner_padded, srl_padded = (
pad_all_sequences(context_seqs, question_seqs, token_seqs, ner_seqs, srl_seqs)
)
# Encode the answers
answer_seqs = answer_tokenizer.texts_to_sequences(answers)
max_answer_len = max([len(seq) for seq in answer_seqs])
answer_padded = pad_sequences(answer_seqs, maxlen=max_answer_len, padding="post")
# Split the data into train and test sets
indices = list(range(len(context_padded)))
train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)
# Helper to get a subset of the data by indices
def get_subset(data, indices):
return np.array([data[i] for i in indices])
# Train data
train_context = get_subset(context_padded, train_indices)
train_question = get_subset(question_padded, train_indices)
train_token = get_subset(token_padded, train_indices)
train_ner = get_subset(ner_padded, train_indices)
train_srl = get_subset(srl_padded, train_indices)
train_q_type = get_subset(q_type_categorical, train_indices)
train_answer = get_subset(answer_padded, train_indices)
# Test data
test_context = get_subset(context_padded, test_indices)
test_question = get_subset(question_padded, test_indices)
test_token = get_subset(token_padded, test_indices)
test_ner = get_subset(ner_padded, test_indices)
test_srl = get_subset(srl_padded, test_indices)
test_q_type = get_subset(q_type_categorical, test_indices)
test_answer = get_subset(answer_padded, test_indices)
# Hyperparameters
embedding_dim = 100
lstm_units = 128
ner_embedding_dim = 50
srl_embedding_dim = 50
dropout_rate = 0.3
# Function to build a model with two outputs: question and answer
def create_qa_generator_model():
# Input layers
context_input = Input(shape=(max_context_len,), name="context_input")
token_input = Input(shape=(max_token_len,), name="token_input")
ner_input = Input(shape=(max_token_len,), name="ner_input")
srl_input = Input(shape=(max_token_len,), name="srl_input")
# question_input and q_type_input are not needed for generation
# because they are what the model produces as output
# Shared embedding layer for text
text_embedding = Embedding(vocab_size, embedding_dim, name="text_embedding")
# Embeddings for NER and SRL
ner_embedding = Embedding(ner_vocab_size, ner_embedding_dim, name="ner_embedding")(
ner_input
)
srl_embedding = Embedding(srl_vocab_size, srl_embedding_dim, name="srl_embedding")(
srl_input
)
# Apply embeddings
context_embed = text_embedding(context_input)
token_embed = text_embedding(token_input)
# Bidirectional LSTMs for context and token-level features
context_lstm = Bidirectional(
LSTM(lstm_units, return_sequences=True, name="context_lstm")
)(context_embed)
# Concat token features (tokens, NER, SRL)
token_features = Concatenate(name="token_features")(
[token_embed, ner_embedding, srl_embedding]
)
token_lstm = Bidirectional(
LSTM(lstm_units, return_sequences=True, name="token_lstm")
)(token_features)
# Pool outputs
context_pool = tf.keras.layers.GlobalMaxPooling1D(name="context_pool")(context_lstm)
token_pool = tf.keras.layers.GlobalMaxPooling1D(name="token_pool")(token_lstm)
# Concat all features
all_features = Concatenate(name="all_features")([context_pool, token_pool])
# Shared layers
shared = Dense(256, activation="relu", name="shared_dense_1")(all_features)
shared = Dropout(dropout_rate)(shared)
shared = Dense(128, activation="relu", name="shared_dense_2")(shared)
shared = Dropout(dropout_rate)(shared)
# Question branch
question_branch = Dense(256, activation="relu", name="question_dense")(shared)
question_branch = Dropout(dropout_rate)(question_branch)
# Answer branch
answer_branch = Dense(256, activation="relu", name="answer_dense")(shared)
answer_branch = Dropout(dropout_rate)(answer_branch)
# Output layers
# For the question, build an LSTM-based decoder that generates a word sequence
# as the question
question_decoder = LSTM(lstm_units, return_sequences=True, name="question_decoder")(
tf.keras.layers.RepeatVector(max_question_len)(question_branch)
)
question_output = Dense(vocab_size, activation="softmax", name="question_output")(
question_decoder
)
# Output layer for the answer
answer_output = Dense(
answer_vocab_size, activation="softmax", name="answer_output"
)(answer_branch)
# Create model
model = Model(
inputs=[
context_input,
token_input,
ner_input,
srl_input,
],
outputs=[question_output, answer_output],
)
# Compile the model with losses and metrics for both outputs
model.compile(
optimizer="adam",
loss={
"question_output": "categorical_crossentropy",
"answer_output": "sparse_categorical_crossentropy",
},
metrics={"question_output": "accuracy", "answer_output": "accuracy"},
loss_weights={"question_output": 1.0, "answer_output": 1.0},
)
return model
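# Design note: RepeatVector + LSTM decodes the question from a single pooled
# vector with no teacher forcing, so every output position is predicted from
# the same summary; an attention-based or autoregressive decoder would likely
# be stronger.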
# Prepare the question targets (one-hot encoded)
# The question targets must be converted to categorical form because every
# word in the sequence is predicted simultaneously
def prepare_question_target(question_padded):
question_target = []
for question in question_padded:
# One-hot encode every token in the sequence
sequence_target = []
for token in question:
# Build the one-hot vector for this token
token_target = tf.keras.utils.to_categorical(token, num_classes=vocab_size)
sequence_target.append(token_target)
question_target.append(sequence_target)
return np.array(question_target)
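# Caution: this materializes a (num_samples, max_question_len, vocab_size)
# float array; with a large vocabulary this can exhaust memory, and sparse
# categorical targets would be considerably cheaper.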
# Prepare the targets for the question output
train_question_target = prepare_question_target(train_question)
test_question_target = prepare_question_target(test_question)
# Reshape the answers for sparse categorical crossentropy
train_answer_labels = train_answer[:, 0]  # Take the first token index of each answer
test_answer_labels = test_answer[:, 0]
# Build the model
model = create_qa_generator_model()
model.summary()
# Callback to save the best model
checkpoint = ModelCheckpoint(
"qa_generator_model.h5",
monitor="val_question_output_accuracy",
save_best_only=True,
verbose=1,
mode="max",
)
early_stop = EarlyStopping(
monitor="val_question_output_accuracy", patience=5, verbose=1, mode="max"
)
# Training
batch_size = 8
epochs = 50
# Train model
history = model.fit(
[train_context, train_token, train_ner, train_srl],
{"question_output": train_question_target, "answer_output": train_answer_labels},
batch_size=batch_size,
epochs=epochs,
validation_data=(
[test_context, test_token, test_ner, test_srl],
{"question_output": test_question_target, "answer_output": test_answer_labels},
),
callbacks=[checkpoint, early_stop],
)
model.save("qa_generator_model_final.keras")
# Save the tokenizers
tokenizer_data = {
"word_tokenizer": tokenizer.to_json(),
"ner_tokenizer": ner_tokenizer.to_json(),
"srl_tokenizer": srl_tokenizer.to_json(),
"answer_tokenizer": answer_tokenizer.to_json(),
"q_type_tokenizer": q_type_tokenizer.to_json(),
"max_context_len": max_context_len,
"max_question_len": max_question_len,
"max_token_len": max_token_len,
}
with open("qa_generator_tokenizers.json", "w") as f:
json.dump(tokenizer_data, f)
# Prediction function
def predict_question_and_answer(model, context, tokens, ner, srl):
"""
Predict a question and an answer from the context, tokens, NER, and SRL tags
"""
# Preprocess input
context_seq = tokenizer.texts_to_sequences([preprocess_text(context)])
context_padded = pad_sequences(context_seq, maxlen=max_context_len, padding="post")
token_seq = tokenizer.texts_to_sequences([" ".join(tokens)])
token_padded = pad_sequences(token_seq, maxlen=max_token_len, padding="post")
ner_seq = ner_tokenizer.texts_to_sequences([" ".join(ner)])
ner_padded = pad_sequences(ner_seq, maxlen=max_token_len, padding="post")
srl_seq = srl_tokenizer.texts_to_sequences([" ".join(srl)])
srl_padded = pad_sequences(srl_seq, maxlen=max_token_len, padding="post")
# Predict
question_pred, answer_pred = model.predict(
[context_padded, token_padded, ner_padded, srl_padded]
)
# Decode the question (take the index with the highest probability at each position)
question_indices = np.argmax(question_pred[0], axis=1)
question_words = []
# Reverse the word index to map indices back to words
word_index = tokenizer.word_index
index_word = {v: k for k, v in word_index.items()}
# Decode the question
for idx in question_indices:
if idx != 0: # Skip padding (index 0)
word = index_word.get(idx, "<UNK>")
question_words.append(word)
else:
break # Stop at padding
# Decode the answer
answer_idx = np.argmax(answer_pred[0])
# Reverse word index for the answers
answer_word_index = answer_tokenizer.word_index
answer_index_word = {v: k for k, v in answer_word_index.items()}
answer = answer_index_word.get(answer_idx, "<UNK>")
# Assemble the question
question = " ".join(question_words)
return question, answer
# Example usage
# Note: this is only an example; actual data is needed in a real run
"""
sample_context = "Selamat pagi, sekarang adalah hari Senin."
sample_tokens = ["selamat", "pagi", "sekarang", "adalah", "hari", "senin"]
sample_ner = ["O", "O", "O", "O", "O", "B-TIME"]
sample_srl = ["B-V", "B-ARG1", "B-ARGM-TMP", "B-ARGM-PRD", "I-ARGM-PRD", "I-ARGM-PRD"]
# Load model yang sudah dilatih
loaded_model = load_model("qa_generator_model_final.keras")
# Prediksi
question, answer = predict_question_and_answer(
loaded_model, sample_context, sample_tokens, sample_ner, sample_srl
)
print("Konteks:", sample_context)
print("Pertanyaan yang dihasilkan:", question)
print("Jawaban yang dihasilkan:", answer)
"""
sample = {
"context": "kerajaan majapahit berdiri pada tahun 1293 di trowulan",
"tokens": [
"kerajaan",
"majapahit",
"berdiri",
"pada",
"tahun",
"1293",
"di",
"trowulan",
],
"ner": ["O", "ORG", "O", "O", "O", "DATE", "O", "LOC"],
"srl": ["ARG1", "ARG1", "V", "O", "O", "ARGM-TMP", "O", "ARGM-LOC"],
}
question, answer = predict_question_and_answer(
model, sample["context"], sample["tokens"], sample["ner"], sample["srl"]
)
print("Konteks:", sample["context"])
print("Pertanyaan yang dihasilkan:", question)
print("Jawaban yang dihasilkan:", answer)
# Plot the training history
# plt.figure(figsize=(12, 8))
# # Plot loss
# plt.subplot(2, 2, 1)
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('Model Loss')
# plt.ylabel('Loss')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='upper right')
# # Plot question output accuracy
# plt.subplot(2, 2, 2)
# plt.plot(history.history['question_output_accuracy'])
# plt.plot(history.history['val_question_output_accuracy'])
# plt.title('Question Output Accuracy')
# plt.ylabel('Accuracy')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='lower right')
# # Plot answer output accuracy
# plt.subplot(2, 2, 3)
# plt.plot(history.history['answer_output_accuracy'])
# plt.plot(history.history['val_answer_output_accuracy'])
# plt.title('Answer Output Accuracy')
# plt.ylabel('Accuracy')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='lower right')
# plt.tight_layout()
# plt.savefig("training_history.png")
# plt.show()

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,615 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 13,
"id": "58e41ccb",
"metadata": {},
"outputs": [],
"source": [
"import json, pickle, random\n",
"from pathlib import Path\n",
"from itertools import chain\n",
"\n",
"import numpy as np\n",
"import tensorflow as tf\n",
"from tensorflow.keras.layers import (\n",
" Input, Embedding, LSTM, Concatenate,\n",
" Dense, TimeDistributed\n",
")\n",
"from tensorflow.keras.models import Model\n",
"from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction\n",
"from rouge_score import rouge_scorer, scoring\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "a94dd46a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"flattened samples : 8\n"
]
}
],
"source": [
"RAW = json.loads(Path(\"../dataset/dev_dataset_qg.json\").read_text())\n",
"\n",
"samples = []\n",
"for item in RAW:\n",
" for qp in item[\"quiz_posibility\"]:\n",
" samples.append({\n",
" \"tokens\" : item[\"tokens\"],\n",
" \"ner\" : item[\"ner\"],\n",
" \"srl\" : item[\"srl\"],\n",
" \"q_type\" : qp[\"type\"], # isian / opsi / benar_salah\n",
" \"q_toks\" : qp[\"question\"] + [\"<eos>\"],\n",
" \"a_toks\" : (qp[\"answer\"] if isinstance(qp[\"answer\"], list)\n",
" else [qp[\"answer\"]]) + [\"<eos>\"]\n",
" })\n",
"\n",
"print(\"flattened samples :\", len(samples))\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "852fb9a8",
"metadata": {},
"outputs": [],
"source": [
"def build_vocab(seq_iter, reserved=(\"<pad>\", \"<unk>\", \"<sos>\", \"<eos>\")):\n",
" vocab = {tok: idx for idx, tok in enumerate(reserved)}\n",
" for tok in chain.from_iterable(seq_iter):\n",
" vocab.setdefault(tok, len(vocab))\n",
" return vocab\n",
"\n",
"vocab_tok = build_vocab((s[\"tokens\"] for s in samples))\n",
"vocab_ner = build_vocab((s[\"ner\"] for s in samples), reserved=(\"<pad>\",\"<unk>\"))\n",
"vocab_srl = build_vocab((s[\"srl\"] for s in samples), reserved=(\"<pad>\",\"<unk>\"))\n",
"vocab_q = build_vocab((s[\"q_toks\"] for s in samples))\n",
"vocab_a = build_vocab((s[\"a_toks\"] for s in samples))\n",
"vocab_typ = {\"isian\":0, \"opsi\":1, \"benar_salah\":2}"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "fdf696cf",
"metadata": {},
"outputs": [],
"source": [
"def enc(seq, v): return [v.get(t, v[\"<unk>\"]) for t in seq]\n",
"\n",
"MAX_SENT = max(len(s[\"tokens\"]) for s in samples)\n",
"MAX_Q = max(len(s[\"q_toks\"]) for s in samples)\n",
"MAX_A = max(len(s[\"a_toks\"]) for s in samples)\n",
"\n",
"def pad_batch(seqs, vmap, maxlen):\n",
" return tf.keras.preprocessing.sequence.pad_sequences(\n",
" [enc(s, vmap) for s in seqs], maxlen=maxlen, padding=\"post\"\n",
" )\n",
"\n",
"X_tok = pad_batch((s[\"tokens\"] for s in samples), vocab_tok, MAX_SENT)\n",
"X_ner = pad_batch((s[\"ner\"] for s in samples), vocab_ner, MAX_SENT)\n",
"X_srl = pad_batch((s[\"srl\"] for s in samples), vocab_srl, MAX_SENT)\n",
"\n",
"dec_q_in = pad_batch(\n",
" ([[\"<sos>\"]+s[\"q_toks\"][:-1] for s in samples]), vocab_q, MAX_Q)\n",
"dec_q_out = pad_batch((s[\"q_toks\"] for s in samples), vocab_q, MAX_Q)\n",
"\n",
"dec_a_in = pad_batch(\n",
" ([[\"<sos>\"]+s[\"a_toks\"][:-1] for s in samples]), vocab_a, MAX_A)\n",
"dec_a_out = pad_batch((s[\"a_toks\"] for s in samples), vocab_a, MAX_A)\n",
"\n",
"y_type = np.array([vocab_typ[s[\"q_type\"]] for s in samples])\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "33074619",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Model: \"functional_2\"\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"Model summary (rendered table condensed to plain text):\n",
"tok_in / ner_in / srl_in (InputLayer): (None, 11), 0 params\n",
"emb_tok (Embedding): (None, 11, 128), 4,992 params\n",
"emb_ner (Embedding): (None, 11, 32), 352 params\n",
"emb_srl (Embedding): (None, 11, 32), 288 params\n",
"concatenate_4 (Concatenate): (None, 11, 192); enc_lstm (LSTM): [(None, 256) x 3], 459,776 params\n",
"dec_q_in (InputLayer): (None, 9); emb_q (Embedding): (None, 9, 128), 3,968 params\n",
"dec_a_in (InputLayer): (None, 4); emb_a (Embedding): (None, 4, 128), 1,792 params\n",
"lstm_q (LSTM): [(None, 9, 256), (None, 256), (None, 256)], 394,240 params\n",
"lstm_a (LSTM): [(None, 4, 256), (None, 256), (None, 256)], 394,240 params\n",
"q_out (TimeDistributed): (None, 9, 31), 7,967 params\n",
"a_out (TimeDistributed): (None, 4, 14), 3,598 params\n",
"type_out (Dense): (None, 3), 771 params\n",
"(plus auto-generated masking ops: not_equal, expand_dims, broadcast_to, ones_like, any; 0 params)\n"
]
},
"metadata": {},
"output_type": "display_data"
}
"│ ner_in (\u001b[38;5;33mInputLayer\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ srl_in (\u001b[38;5;33mInputLayer\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ not_equal_8 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ tok_in[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ (\u001b[38;5;33mNotEqual\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ emb_ner (\u001b[38;5;33mEmbedding\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m, \u001b[38;5;34m32\u001b[0m) │ \u001b[38;5;34m352\u001b[0m │ ner_in[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ emb_srl (\u001b[38;5;33mEmbedding\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m, \u001b[38;5;34m32\u001b[0m) │ \u001b[38;5;34m288\u001b[0m │ srl_in[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ expand_dims_4 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m, \u001b[38;5;34m1\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ not_equal_8[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ (\u001b[38;5;33mExpandDims\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ broadcast_to_4 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ expand_dims_4[\u001b[38;5;34m0\u001b[0m]… │\n",
"│ (\u001b[38;5;33mBroadcastTo\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ ones_like_2 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m, \u001b[38;5;34m32\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ emb_ner[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ (\u001b[38;5;33mOnesLike\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ ones_like_3 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m, \u001b[38;5;34m32\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ emb_srl[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ (\u001b[38;5;33mOnesLike\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ emb_tok (\u001b[38;5;33mEmbedding\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m4,992\u001b[0m │ tok_in[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ concatenate_5 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m, \u001b[38;5;34m192\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ broadcast_to_4[\u001b[38;5;34m0\u001b[0m… │\n",
"│ (\u001b[38;5;33mConcatenate\u001b[0m) │ │ │ ones_like_2[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m… │\n",
"│ │ │ │ ones_like_3[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ dec_q_in │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m9\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
"│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ concatenate_4 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m, \u001b[38;5;34m192\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ emb_tok[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m], │\n",
"│ (\u001b[38;5;33mConcatenate\u001b[0m) │ │ │ emb_ner[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m], │\n",
"│ │ │ │ emb_srl[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ any_2 (\u001b[38;5;33mAny\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m11\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ concatenate_5[\u001b[38;5;34m0\u001b[0m]… │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ dec_a_in │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m4\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
"│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ emb_q (\u001b[38;5;33mEmbedding\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m9\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m3,968\u001b[0m │ dec_q_in[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ enc_lstm (\u001b[38;5;33mLSTM\u001b[0m) │ [(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m), │ \u001b[38;5;34m459,776\u001b[0m │ concatenate_4[\u001b[38;5;34m0\u001b[0m]… │\n",
"│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m), │ │ any_2[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m)] │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ emb_a (\u001b[38;5;33mEmbedding\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m4\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m1,792\u001b[0m │ dec_a_in[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ lstm_q (\u001b[38;5;33mLSTM\u001b[0m) │ [(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m9\u001b[0m, \u001b[38;5;34m256\u001b[0m), │ \u001b[38;5;34m394,240\u001b[0m │ emb_q[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m], │\n",
"│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m), │ │ enc_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m1\u001b[0m], │\n",
"│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m)] │ │ enc_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m2\u001b[0m] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ not_equal_9 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m9\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ dec_q_in[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ (\u001b[38;5;33mNotEqual\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ lstm_a (\u001b[38;5;33mLSTM\u001b[0m) │ [(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m4\u001b[0m, \u001b[38;5;34m256\u001b[0m), │ \u001b[38;5;34m394,240\u001b[0m │ emb_a[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m], │\n",
"│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m), │ │ enc_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m1\u001b[0m], │\n",
"│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m)] │ │ enc_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m2\u001b[0m] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ not_equal_10 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m4\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ dec_a_in[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ (\u001b[38;5;33mNotEqual\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ q_out │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m9\u001b[0m, \u001b[38;5;34m31\u001b[0m) │ \u001b[38;5;34m7,967\u001b[0m │ lstm_q[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m], │\n",
"│ (\u001b[38;5;33mTimeDistributed\u001b[0m) │ │ │ not_equal_9[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ a_out │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m4\u001b[0m, \u001b[38;5;34m14\u001b[0m) │ \u001b[38;5;34m3,598\u001b[0m │ lstm_a[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m], │\n",
"│ (\u001b[38;5;33mTimeDistributed\u001b[0m) │ │ │ not_equal_10[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ type_out (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m3\u001b[0m) │ \u001b[38;5;34m771\u001b[0m │ enc_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Total params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">1,271,984</span> (4.85 MB)\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1m Total params: \u001b[0m\u001b[38;5;34m1,271,984\u001b[0m (4.85 MB)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">1,271,984</span> (4.85 MB)\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m1,271,984\u001b[0m (4.85 MB)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Non-trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> (0.00 B)\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"d_tok, d_tag, units = 128, 32, 256\n",
"pad_tok, pad_q, pad_a = vocab_tok[\"<pad>\"], vocab_q[\"<pad>\"], vocab_a[\"<pad>\"]\n",
"\n",
"# ---- Encoder ----------------------------------------------------\n",
"inp_tok = Input((MAX_SENT,), name=\"tok_in\")\n",
"inp_ner = Input((MAX_SENT,), name=\"ner_in\")\n",
"inp_srl = Input((MAX_SENT,), name=\"srl_in\")\n",
"\n",
"emb_tok = Embedding(len(vocab_tok), d_tok, mask_zero=True, name=\"emb_tok\")(inp_tok)\n",
"emb_ner = Embedding(len(vocab_ner), d_tag, mask_zero=False, name=\"emb_ner\")(inp_ner)\n",
"emb_srl = Embedding(len(vocab_srl), d_tag, mask_zero=False, name=\"emb_srl\")(inp_srl)\n",
"\n",
"enc_concat = Concatenate()([emb_tok, emb_ner, emb_srl])\n",
"enc_out, state_h, state_c = LSTM(units, return_state=True, name=\"enc_lstm\")(enc_concat)\n",
"\n",
"# ---- Decoder : Question ----------------------------------------\n",
"dec_q_inp = Input((MAX_Q,), name=\"dec_q_in\")\n",
"dec_emb_q = Embedding(len(vocab_q), d_tok, mask_zero=True, name=\"emb_q\")(dec_q_inp)\n",
"dec_q_seq, _, _ = LSTM(units, return_sequences=True, return_state=True,\n",
" name=\"lstm_q\")(dec_emb_q, initial_state=[state_h, state_c])\n",
"q_out = TimeDistributed(Dense(len(vocab_q), activation=\"softmax\"), name=\"q_out\")(dec_q_seq)\n",
"\n",
"# ---- Decoder : Answer ------------------------------------------\n",
"dec_a_inp = Input((MAX_A,), name=\"dec_a_in\")\n",
"dec_emb_a = Embedding(len(vocab_a), d_tok, mask_zero=True, name=\"emb_a\")(dec_a_inp)\n",
"dec_a_seq, _, _ = LSTM(units, return_sequences=True, return_state=True,\n",
" name=\"lstm_a\")(dec_emb_a, initial_state=[state_h, state_c])\n",
"a_out = TimeDistributed(Dense(len(vocab_a), activation=\"softmax\"), name=\"a_out\")(dec_a_seq)\n",
"\n",
"# ---- Classifier -------------------------------------------------\n",
"type_out = Dense(len(vocab_typ), activation=\"softmax\", name=\"type_out\")(enc_out)\n",
"\n",
"model = Model(\n",
" [inp_tok, inp_ner, inp_srl, dec_q_inp, dec_a_inp],\n",
" [q_out, a_out, type_out]\n",
")\n",
"\n",
"# ---- Masked loss helpers ---------------------------------------\n",
"scce = tf.keras.losses.SparseCategoricalCrossentropy(reduction=\"none\")\n",
"def masked_loss_factory(pad_id):\n",
" def loss(y_true, y_pred):\n",
" l = scce(y_true, y_pred)\n",
" mask = tf.cast(tf.not_equal(y_true, pad_id), tf.float32)\n",
" return tf.reduce_sum(l*mask) / tf.reduce_sum(mask)\n",
" return loss\n",
"\n",
"model.compile(\n",
" optimizer=\"adam\",\n",
" loss = {\"q_out\":masked_loss_factory(pad_q),\n",
" \"a_out\":masked_loss_factory(pad_a),\n",
" \"type_out\":\"sparse_categorical_crossentropy\"},\n",
" loss_weights={\"q_out\":1.0, \"a_out\":1.0, \"type_out\":0.3},\n",
" metrics={\"q_out\":\"sparse_categorical_accuracy\",\n",
" \"a_out\":\"sparse_categorical_accuracy\",\n",
" \"type_out\":tf.keras.metrics.SparseCategoricalAccuracy(name=\"type_acc\")}\n",
")\n",
"model.summary()\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "44d36899",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/30\n"
]
},
{
"ename": "TypeError",
"evalue": "Exception encountered when calling BroadcastTo.call().\n\n\u001b[1mFailed to convert elements of (None, 11, 128) to Tensor. Consider casting elements to a supported type. See https://www.tensorflow.org/api_docs/python/tf/dtypes for supported TF dtypes.\u001b[0m\n\nArguments received by BroadcastTo.call():\n • x=tf.Tensor(shape=(None, 11, 1), dtype=bool)",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[18], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m history \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43mX_tok\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX_ner\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX_srl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdec_q_in\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdec_a_in\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43mdec_q_out\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdec_a_out\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_type\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalidation_split\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m30\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m64\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mtf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkeras\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcallbacks\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mEarlyStopping\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpatience\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m4\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrestore_best_weights\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m2\u001b[39;49m\n\u001b[1;32m 9\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m 10\u001b[0m model\u001b[38;5;241m.\u001b[39msave(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfull_seq2seq.keras\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# -----------------------------------------------------------------\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;66;03m# 5. SAVE VOCABS (.pkl keeps python dict intact)\u001b[39;00m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;66;03m# -----------------------------------------------------------------\u001b[39;00m\n",
"File \u001b[0;32m/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/keras/src/utils/traceback_utils.py:122\u001b[0m, in \u001b[0;36mfilter_traceback.<locals>.error_handler\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 119\u001b[0m filtered_tb \u001b[38;5;241m=\u001b[39m _process_traceback_frames(e\u001b[38;5;241m.\u001b[39m__traceback__)\n\u001b[1;32m 120\u001b[0m \u001b[38;5;66;03m# To get the full stack trace, call:\u001b[39;00m\n\u001b[1;32m 121\u001b[0m \u001b[38;5;66;03m# `keras.config.disable_traceback_filtering()`\u001b[39;00m\n\u001b[0;32m--> 122\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\u001b[38;5;241m.\u001b[39mwith_traceback(filtered_tb) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 123\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m filtered_tb\n",
"File \u001b[0;32m/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/keras/src/utils/traceback_utils.py:122\u001b[0m, in \u001b[0;36mfilter_traceback.<locals>.error_handler\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 119\u001b[0m filtered_tb \u001b[38;5;241m=\u001b[39m _process_traceback_frames(e\u001b[38;5;241m.\u001b[39m__traceback__)\n\u001b[1;32m 120\u001b[0m \u001b[38;5;66;03m# To get the full stack trace, call:\u001b[39;00m\n\u001b[1;32m 121\u001b[0m \u001b[38;5;66;03m# `keras.config.disable_traceback_filtering()`\u001b[39;00m\n\u001b[0;32m--> 122\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\u001b[38;5;241m.\u001b[39mwith_traceback(filtered_tb) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 123\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m filtered_tb\n",
"\u001b[0;31mTypeError\u001b[0m: Exception encountered when calling BroadcastTo.call().\n\n\u001b[1mFailed to convert elements of (None, 11, 128) to Tensor. Consider casting elements to a supported type. See https://www.tensorflow.org/api_docs/python/tf/dtypes for supported TF dtypes.\u001b[0m\n\nArguments received by BroadcastTo.call():\n • x=tf.Tensor(shape=(None, 11, 1), dtype=bool)"
]
}
],
"source": [
"history = model.fit(\n",
" [X_tok, X_ner, X_srl, dec_q_in, dec_a_in],\n",
" [dec_q_out, dec_a_out, y_type],\n",
" validation_split=0.1,\n",
" epochs=30,\n",
" batch_size=64,\n",
" callbacks=[tf.keras.callbacks.EarlyStopping(patience=4, restore_best_weights=True)],\n",
" verbose=2\n",
")\n",
"model.save(\"full_seq2seq.keras\")\n",
"\n",
"\n",
"# -----------------------------------------------------------------\n",
"# 5. SAVE VOCABS (.pkl keeps python dict intact)\n",
"# -----------------------------------------------------------------\n",
"def save_vocab(v, name): pickle.dump(v, open(name,\"wb\"))\n",
"save_vocab(vocab_tok,\"vocab_tok.pkl\"); save_vocab(vocab_ner,\"vocab_ner.pkl\")\n",
"save_vocab(vocab_srl,\"vocab_srl.pkl\"); save_vocab(vocab_q, \"vocab_q.pkl\")\n",
"save_vocab(vocab_a, \"vocab_a.pkl\"); save_vocab(vocab_typ,\"vocab_typ.pkl\")"
]
},
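   {
   "cell_type": "code",
   "execution_count": null,
   "id": "maskfix01",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A minimal sketch of one workaround for the BroadcastTo TypeError above,\n",
    "# assuming the custom masked losses already handle padding: rebuild the\n",
    "# token embedding with mask_zero=False so Concatenate no longer has to\n",
    "# broadcast a partial mask across the merged features. Names suffixed\n",
    "# \"_nm\" are hypothetical and not part of the original model.\n",
    "emb_tok_nm = Embedding(len(vocab_tok), d_tok, mask_zero=False, name=\"emb_tok_nm\")(inp_tok)\n",
    "enc_concat_nm = Concatenate()([emb_tok_nm, emb_ner, emb_srl])\n",
    "enc_out_nm, h_nm, c_nm = LSTM(units, return_state=True, name=\"enc_lstm_nm\")(enc_concat_nm)\n",
    "print(enc_out_nm.shape)  # (None, 256), matching the summary above\n"
   ]
  },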
{
"cell_type": "code",
"execution_count": null,
"id": "61003de5",
"metadata": {},
"outputs": [],
"source": [
"def build_inference_models(trained):\n",
" # encoder\n",
" t_in = Input((MAX_SENT,), name=\"t_in\")\n",
" n_in = Input((MAX_SENT,), name=\"n_in\")\n",
" s_in = Input((MAX_SENT,), name=\"s_in\")\n",
" e_t = trained.get_layer(\"emb_tok\")(t_in)\n",
" e_n = trained.get_layer(\"emb_ner\")(n_in)\n",
" e_s = trained.get_layer(\"emb_srl\")(s_in)\n",
" concat = Concatenate()([e_t,e_n,e_s])\n",
" _, h, c = trained.get_layer(\"enc_lstm\")(concat)\n",
" enc_model = Model([t_in,n_in,s_in],[h,c])\n",
"\n",
" # questiondecoder\n",
" dq_in = Input((1,), name=\"dq_tok\")\n",
" dh = Input((units,), name=\"dh\"); dc = Input((units,), name=\"dc\")\n",
" dq_emb = trained.get_layer(\"emb_q\")(dq_in)\n",
" dq_lstm, nh, nc = trained.get_layer(\"lstm_q\")(dq_emb, initial_state=[dh,dc])\n",
" dq_out = trained.get_layer(\"q_out\").layer(dq_lstm)\n",
" dec_q_model = Model([dq_in, dh, dc], [dq_out, nh, nc])\n",
"\n",
" # answerdecoder\n",
" da_in = Input((1,), name=\"da_tok\")\n",
" ah = Input((units,), name=\"ah\"); ac = Input((units,), name=\"ac\")\n",
" da_emb = trained.get_layer(\"emb_a\")(da_in)\n",
" da_lstm, nh2, nc2 = trained.get_layer(\"lstm_a\")(da_emb, initial_state=[ah,ac])\n",
" da_out = trained.get_layer(\"a_out\").layer(da_lstm)\n",
" dec_a_model = Model([da_in, ah, ac], [da_out, nh2, nc2])\n",
"\n",
" # type classifier\n",
" type_dense = trained.get_layer(\"type_out\")\n",
" type_model = Model([t_in,n_in,s_in], type_dense(_)) # use _ = enc_lstm output\n",
"\n",
" return enc_model, dec_q_model, dec_a_model, type_model\n",
"\n",
"encoder_model, decoder_q, decoder_a, classifier_model = build_inference_models(model)\n",
"\n",
"inv_q = {v:k for k,v in vocab_q.items()}\n",
"inv_a = {v:k for k,v in vocab_a.items()}\n",
"\n",
"def enc_pad(seq, vmap, maxlen):\n",
" x = [vmap.get(t, vmap[\"<unk>\"]) for t in seq]\n",
" return x + [vmap[\"<pad>\"]] * (maxlen-len(x))\n",
"\n",
"def greedy_decode(tokens, ner, srl, max_q=20, max_a=10):\n",
" et = np.array([enc_pad(tokens, vocab_tok, MAX_SENT)])\n",
" en = np.array([enc_pad(ner, vocab_ner, MAX_SENT)])\n",
" es = np.array([enc_pad(srl, vocab_srl, MAX_SENT)])\n",
"\n",
" h,c = encoder_model.predict([et,en,es], verbose=0)\n",
"\n",
" # --- question\n",
" q_ids = []\n",
" tgt = np.array([[vocab_q[\"<sos>\"]]])\n",
" for _ in range(max_q):\n",
" logits,h,c = decoder_q.predict([tgt,h,c], verbose=0)\n",
" nxt = int(logits[0,-1].argmax())\n",
" if nxt==vocab_q[\"<eos>\"]: break\n",
" q_ids.append(nxt)\n",
" tgt = np.array([[nxt]])\n",
"\n",
" # --- answer (reuse fresh h,c)\n",
" h,c = encoder_model.predict([et,en,es], verbose=0)\n",
" a_ids = []\n",
" tgt = np.array([[vocab_a[\"<sos>\"]]])\n",
" for _ in range(max_a):\n",
" logits,h,c = decoder_a.predict([tgt,h,c], verbose=0)\n",
" nxt = int(logits[0,-1].argmax())\n",
" if nxt==vocab_a[\"<eos>\"]: break\n",
" a_ids.append(nxt)\n",
" tgt = np.array([[nxt]])\n",
"\n",
" # --- type\n",
" t_id = int(classifier_model.predict([et,en,es], verbose=0).argmax())\n",
"\n",
" return [inv_q[i] for i in q_ids], [inv_a[i] for i in a_ids], \\\n",
" [k for k,v in vocab_typ.items() if v==t_id][0]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5279b631",
"metadata": {},
"outputs": [],
"source": [
"test_tokens = [\"soekarno\",\"membacakan\",\"teks\",\"proklamasi\",\"pada\",\n",
" \"17\",\"agustus\",\"1945\"]\n",
"test_ner = [\"B-PER\",\"O\",\"O\",\"O\",\"O\",\"B-DATE\",\"I-DATE\",\"I-DATE\"]\n",
"test_srl = [\"ARG0\",\"V\",\"ARG1\",\"ARG1\",\"O\",\"ARGM-TMP\",\"ARGM-TMP\",\"ARGM-TMP\"]\n",
"\n",
"q,a,t = greedy_decode(test_tokens,test_ner,test_srl,max_q=MAX_Q,max_a=MAX_A)\n",
"print(\"\\nDEMO\\n----\")\n",
"print(\"Q :\", \" \".join(q))\n",
"print(\"A :\", \" \".join(a))\n",
"print(\"T :\", t)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "850d4905",
"metadata": {},
"outputs": [],
"source": [
"smooth = SmoothingFunction().method4\n",
"r_scorer = rouge_scorer.RougeScorer([\"rouge1\",\"rougeL\"], use_stemmer=True)\n",
"\n",
"def strip_special(seq, pad_id, eos_id):\n",
" return [x for x in seq if x not in (pad_id, eos_id)]\n",
"\n",
"def ids_to_text(ids, inv):\n",
" return \" \".join(inv[i] for i in ids)\n",
"\n",
"def evaluate(n=200):\n",
" idxs = random.sample(range(len(samples)), n)\n",
" refs, hyps = [], []\n",
" agg = scoring.BootstrapAggregator()\n",
"\n",
" for i in idxs:\n",
" gt_ids = strip_special(dec_q_out[i], pad_q, vocab_q[\"<eos>\"])\n",
" ref = ids_to_text(gt_ids, inv_q)\n",
" pred = \" \".join(greedy_decode(\n",
" samples[i][\"tokens\"],\n",
" samples[i][\"ner\"],\n",
" samples[i][\"srl\"]\n",
" )[0])\n",
" refs.append([ref.split()])\n",
" hyps.append(pred.split())\n",
" agg.add_scores(r_scorer.score(ref, pred))\n",
"\n",
" bleu = corpus_bleu(refs, hyps, smoothing_function=smooth)\n",
" r1 = agg.aggregate()[\"rouge1\"].mid\n",
" rL = agg.aggregate()[\"rougeL\"].mid\n",
"\n",
" print(f\"\\nEVAL (n={n})\")\n",
" print(f\"BLEU4 : {bleu:.4f}\")\n",
" print(f\"ROUGE1 : P={r1.precision:.3f} R={r1.recall:.3f} F1={r1.fmeasure:.3f}\")\n",
" print(f\"ROUGEL : P={rL.precision:.3f} R={rL.recall:.3f} F1={rL.fmeasure:.3f}\")\n",
"\n",
"evaluate(2) "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "myenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,357 @@
import numpy as np
import pandas as pd
import json
import random
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
Input,
LSTM,
Dense,
Embedding,
Bidirectional,
Concatenate,
Attention,
Dropout,
)
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re
import string
from collections import Counter
# Load data
with open("data_converted.json", "r") as f:
data = json.load(f)
# Preprocessing function
def preprocess_text(text):
"""Melakukan preprocessing teks dasar"""
text = text.lower()
text = re.sub(r"\s+", " ", text).strip()
return text
# Prepare data for the question prediction model
def prepare_question_prediction_data(data):
"""Prepare data for the question prediction model"""
contexts = []
tokens_list = []
ner_list = []
srl_list = []
questions = []
q_types = []
for item in data:
for qa in item["qas"]:
contexts.append(preprocess_text(item["context"]))
tokens_list.append(item["tokens"])
ner_list.append(item["ner"])
srl_list.append(item["srl"])
questions.append(preprocess_text(qa["question"]))
q_types.append(qa["type"])
return contexts, tokens_list, ner_list, srl_list, questions, q_types
# Prepare the data
contexts, tokens_list, ner_list, srl_list, questions, q_types = (
prepare_question_prediction_data(data)
)
# Tokenizer for text (context, question, answer)
max_vocab_size = 10000
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
all_texts = contexts + questions + [" ".join(item) for item in tokens_list]
tokenizer.fit_on_texts(all_texts)
vocab_size = len(tokenizer.word_index) + 1
# Encoding for NER
ner_tokenizer = Tokenizer(oov_token="<OOV>")
ner_tokenizer.fit_on_texts([" ".join(ner) for ner in ner_list])
ner_vocab_size = len(ner_tokenizer.word_index) + 1
# Encoding for SRL
srl_tokenizer = Tokenizer(oov_token="<OOV>")
srl_tokenizer.fit_on_texts([" ".join(srl) for srl in srl_list])
srl_vocab_size = len(srl_tokenizer.word_index) + 1
# Encoding for question types
q_type_tokenizer = Tokenizer()
q_type_tokenizer.fit_on_texts(q_types)
q_type_vocab_size = len(q_type_tokenizer.word_index) + 1
# Convert tokens, NER, and SRL to sequences
def tokens_to_sequences(tokens, ner, srl):
"""Convert tokens, NER, and SRL to sequences"""
token_seqs = [tokenizer.texts_to_sequences([" ".join(t)])[0] for t in tokens]
ner_seqs = [ner_tokenizer.texts_to_sequences([" ".join(n)])[0] for n in ner]
srl_seqs = [srl_tokenizer.texts_to_sequences([" ".join(s)])[0] for s in srl]
return token_seqs, ner_seqs, srl_seqs
# Sequences
context_seqs = tokenizer.texts_to_sequences(contexts)
question_seqs = tokenizer.texts_to_sequences(questions)
token_seqs, ner_seqs, srl_seqs = tokens_to_sequences(tokens_list, ner_list, srl_list)
# Determine the maximum lengths for padding
max_context_len = max([len(seq) for seq in context_seqs])
max_question_len = max([len(seq) for seq in question_seqs])
max_token_len = max([len(seq) for seq in token_seqs])
# Pad sequences so that all inputs have the same length
def pad_all_sequences(context_seqs, token_seqs, ner_seqs, srl_seqs, question_seqs):
"""Pad all sequences"""
context_padded = pad_sequences(context_seqs, maxlen=max_context_len, padding="post")
token_padded = pad_sequences(token_seqs, maxlen=max_token_len, padding="post")
ner_padded = pad_sequences(ner_seqs, maxlen=max_token_len, padding="post")
srl_padded = pad_sequences(srl_seqs, maxlen=max_token_len, padding="post")
question_padded = pad_sequences(
question_seqs, maxlen=max_question_len, padding="post"
)
return (
context_padded,
token_padded,
ner_padded,
srl_padded,
question_padded,
)
# Encode the question types
q_type_indices = []
for q_type in q_types:
q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
q_type_indices.append(q_type_idx)
# Convert to a numpy array
q_type_indices = np.array(q_type_indices)
# One-hot encode the question types
q_type_categorical = tf.keras.utils.to_categorical(
q_type_indices, num_classes=q_type_vocab_size
)
# Pad sequences
context_padded, token_padded, ner_padded, srl_padded, question_padded = (
pad_all_sequences(context_seqs, token_seqs, ner_seqs, srl_seqs, question_seqs)
)
# Split the data into train and test sets
indices = list(range(len(context_padded)))
train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)
# Helper to take a subset of the data by indices
def get_subset(data, indices):
return np.array([data[i] for i in indices])
# Train data
train_context = get_subset(context_padded, train_indices)
train_token = get_subset(token_padded, train_indices)
train_ner = get_subset(ner_padded, train_indices)
train_srl = get_subset(srl_padded, train_indices)
train_q_type = get_subset(q_type_categorical, train_indices)
train_question = get_subset(question_padded, train_indices)
# Test data
test_context = get_subset(context_padded, test_indices)
test_token = get_subset(token_padded, test_indices)
test_ner = get_subset(ner_padded, test_indices)
test_srl = get_subset(srl_padded, test_indices)
test_q_type = get_subset(q_type_categorical, test_indices)
test_question = get_subset(question_padded, test_indices)
# Hyperparameters
embedding_dim = 100
lstm_units = 128
ner_embedding_dim = 50
srl_embedding_dim = 50
dropout_rate = 0.3
# Function to build the question prediction model
def create_question_prediction_model():
# Input layers
context_input = Input(shape=(max_context_len,), name="context_input")
token_input = Input(shape=(max_token_len,), name="token_input")
ner_input = Input(shape=(max_token_len,), name="ner_input")
srl_input = Input(shape=(max_token_len,), name="srl_input")
q_type_input = Input(shape=(q_type_vocab_size,), name="q_type_input")
# Shared embedding layer for text
text_embedding = Embedding(vocab_size, embedding_dim, name="text_embedding")
# Embeddings for NER and SRL
ner_embedding = Embedding(ner_vocab_size, ner_embedding_dim, name="ner_embedding")(
ner_input
)
srl_embedding = Embedding(srl_vocab_size, srl_embedding_dim, name="srl_embedding")(
srl_input
)
# Apply embeddings
context_embed = text_embedding(context_input)
token_embed = text_embedding(token_input)
# Bidirectional LSTM for context and token-level features
context_lstm = Bidirectional(
LSTM(lstm_units, return_sequences=True, name="context_lstm")
)(context_embed)
# Concat token features (tokens, NER, SRL)
token_features = Concatenate(name="token_features")(
[token_embed, ner_embedding, srl_embedding]
)
token_lstm = Bidirectional(
LSTM(lstm_units, return_sequences=True, name="token_lstm")
)(token_features)
# Self-attention over the context (Keras Attention expects [query, value])
context_attention = tf.keras.layers.Attention(name="context_attention")(
[context_lstm, context_lstm]
)
# Pool attention outputs
context_att_pool = tf.keras.layers.GlobalMaxPooling1D(name="context_att_pool")(
context_attention
)
token_pool = tf.keras.layers.GlobalMaxPooling1D(name="token_pool")(token_lstm)
# Concat all features
all_features = Concatenate(name="all_features")(
[context_att_pool, token_pool, q_type_input]
)
# Dense layers with expanded capacity for sequence generation
x = Dense(512, activation="relu", name="dense_1")(all_features)
x = Dropout(dropout_rate)(x)
x = Dense(256, activation="relu", name="dense_2")(x)
x = Dropout(dropout_rate)(x)
# Output projection for the sequence decoder
decoder_dense = Dense(vocab_size, activation="softmax", name="decoder_dense")
# Many-to-many architecture for sequence generation
# Decoder LSTM
decoder_lstm = LSTM(lstm_units * 2, return_sequences=True, name="decoder_lstm")
# Project the pooled features to the decoder input size
decoder_input = Dense(lstm_units * 2, activation="relu", name="decoder_input")(x)
# Decoder sequence with teacher forcing
# Expand dimensionality to match expected sequence length
repeated_vector = tf.keras.layers.RepeatVector(max_question_len)(decoder_input)
# Process through decoder LSTM
decoder_outputs = decoder_lstm(repeated_vector)
# Apply dense layer to each timestep
question_output_seq = tf.keras.layers.TimeDistributed(decoder_dense)(
decoder_outputs
)
# Create model
model = Model(
inputs=[
context_input,
token_input,
ner_input,
srl_input,
q_type_input,
],
outputs=question_output_seq,
)
# Compile model with categorical crossentropy for sequence prediction
model.compile(
optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
return model
# Build the model
model = create_question_prediction_model()
model.summary()
# Callback to save the best model
checkpoint = ModelCheckpoint(
"question_prediction_model.h5",
monitor="val_accuracy",
save_best_only=True,
verbose=1,
)
early_stop = EarlyStopping(monitor="val_accuracy", patience=10, verbose=1)
# Reshaping question data for sequence-to-sequence training
# We need to reshape to (samples, max_question_len, 1) for sparse categorical crossentropy
train_question_target = np.expand_dims(train_question, -1)
test_question_target = np.expand_dims(test_question, -1)
# Training parameters
batch_size = 8
epochs = 50
# Train model
history = model.fit(
[train_context, train_token, train_ner, train_srl, train_q_type],
train_question_target,
batch_size=batch_size,
epochs=epochs,
validation_data=(
[test_context, test_token, test_ner, test_srl, test_q_type],
test_question_target,
),
callbacks=[checkpoint, early_stop],
)
# # Plot training history
# plt.figure(figsize=(12, 4))
# plt.subplot(1, 2, 1)
# plt.plot(history.history['accuracy'])
# plt.plot(history.history['val_accuracy'])
# plt.title('Model Accuracy')
# plt.ylabel('Accuracy')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='upper left')
# plt.subplot(1, 2, 2)
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('Model Loss')
# plt.ylabel('Loss')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='upper left')
# plt.tight_layout()
# plt.savefig('question_prediction_training_history.png')
# plt.show()
# Save the model and tokenizers
model.save("question_prediction_model_final.h5")
# Save the tokenizers
tokenizer_data = {
"word_tokenizer": tokenizer.to_json(),
"ner_tokenizer": ner_tokenizer.to_json(),
"srl_tokenizer": srl_tokenizer.to_json(),
"q_type_tokenizer": q_type_tokenizer.to_json(),
"max_context_len": max_context_len,
"max_question_len": max_question_len,
"max_token_len": max_token_len,
}
with open("question_prediction_tokenizers.json", "w") as f:
json.dump(tokenizer_data, f)
print("Model dan tokenizer untuk prediksi pertanyaan berhasil disimpan!")

View File

@ -0,0 +1,473 @@
import numpy as np
import json
import random
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
Input,
LSTM,
Dense,
Embedding,
Bidirectional,
Concatenate,
Dropout,
)
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
# Load data
with open("data_converted.json", "r") as f:
data = json.load(f)
# Preprocessing function
def preprocess_text(text):
"""Melakukan preprocessing teks dasar"""
text = text.lower()
text = re.sub(r"\s+", " ", text).strip()
return text
# Prepare data for the question prediction model
def prepare_question_prediction_data(data):
"""Prepare data for the question prediction model"""
contexts = []
tokens_list = []
ner_list = []
srl_list = []
questions = []
q_types = []
for item in data:
for qa in item["qas"]:
contexts.append(preprocess_text(item["context"]))
tokens_list.append(item["tokens"])
ner_list.append(item["ner"])
srl_list.append(item["srl"])
questions.append(preprocess_text(qa["question"]))
q_types.append(qa["type"])
# The answer is deliberately not used as an input
return contexts, tokens_list, ner_list, srl_list, questions, q_types
# Prepare the data
contexts, tokens_list, ner_list, srl_list, questions, q_types = (
prepare_question_prediction_data(data)
)
# Tokenizer for text (context, question)
max_vocab_size = 10000
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
all_texts = contexts + questions + [" ".join(item) for item in tokens_list]
tokenizer.fit_on_texts(all_texts)
vocab_size = len(tokenizer.word_index) + 1
# Encoding for NER
ner_tokenizer = Tokenizer(oov_token="<OOV>")
ner_tokenizer.fit_on_texts([" ".join(ner) for ner in ner_list])
ner_vocab_size = len(ner_tokenizer.word_index) + 1
# Encoding for SRL
srl_tokenizer = Tokenizer(oov_token="<OOV>")
srl_tokenizer.fit_on_texts([" ".join(srl) for srl in srl_list])
srl_vocab_size = len(srl_tokenizer.word_index) + 1
# Encoding for question types
q_type_tokenizer = Tokenizer()
q_type_tokenizer.fit_on_texts(q_types)
q_type_vocab_size = len(q_type_tokenizer.word_index) + 1
# Convert tokens, NER, and SRL to sequences
def tokens_to_sequences(tokens, ner, srl):
"""Convert tokens, NER, and SRL to sequences"""
token_seqs = [tokenizer.texts_to_sequences([" ".join(t)])[0] for t in tokens]
ner_seqs = [ner_tokenizer.texts_to_sequences([" ".join(n)])[0] for n in ner]
srl_seqs = [srl_tokenizer.texts_to_sequences([" ".join(s)])[0] for s in srl]
return token_seqs, ner_seqs, srl_seqs
# Sequences
context_seqs = tokenizer.texts_to_sequences(contexts)
question_seqs = tokenizer.texts_to_sequences(questions)
token_seqs, ner_seqs, srl_seqs = tokens_to_sequences(tokens_list, ner_list, srl_list)
# Determine the maximum lengths for padding
max_context_len = max([len(seq) for seq in context_seqs])
max_question_len = max([len(seq) for seq in question_seqs])
max_token_len = max([len(seq) for seq in token_seqs])
# Pad sequences so that all inputs have the same length
def pad_all_sequences(context_seqs, token_seqs, ner_seqs, srl_seqs, question_seqs):
"""Pad all sequences"""
context_padded = pad_sequences(context_seqs, maxlen=max_context_len, padding="post")
token_padded = pad_sequences(token_seqs, maxlen=max_token_len, padding="post")
ner_padded = pad_sequences(ner_seqs, maxlen=max_token_len, padding="post")
srl_padded = pad_sequences(srl_seqs, maxlen=max_token_len, padding="post")
question_padded = pad_sequences(
question_seqs, maxlen=max_question_len, padding="post"
)
return (
context_padded,
token_padded,
ner_padded,
srl_padded,
question_padded,
)
# Encode the question types
q_type_indices = []
for q_type in q_types:
q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
q_type_indices.append(q_type_idx)
# Convert to a numpy array
q_type_indices = np.array(q_type_indices)
# One-hot encode the question types
q_type_categorical = tf.keras.utils.to_categorical(
q_type_indices, num_classes=q_type_vocab_size
)
# Pad sequences
context_padded, token_padded, ner_padded, srl_padded, question_padded = (
pad_all_sequences(context_seqs, token_seqs, ner_seqs, srl_seqs, question_seqs)
)
# Split the data into train and test sets
indices = list(range(len(context_padded)))
train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)
# Helper to take a subset of the data by indices
def get_subset(data, indices):
return np.array([data[i] for i in indices])
# Train data
train_context = get_subset(context_padded, train_indices)
train_token = get_subset(token_padded, train_indices)
train_ner = get_subset(ner_padded, train_indices)
train_srl = get_subset(srl_padded, train_indices)
train_q_type = get_subset(q_type_categorical, train_indices)
train_question = get_subset(question_padded, train_indices)
# Test data
test_context = get_subset(context_padded, test_indices)
test_token = get_subset(token_padded, test_indices)
test_ner = get_subset(ner_padded, test_indices)
test_srl = get_subset(srl_padded, test_indices)
test_q_type = get_subset(q_type_categorical, test_indices)
test_question = get_subset(question_padded, test_indices)
# Hyperparameters
embedding_dim = 100
lstm_units = 128
ner_embedding_dim = 50
srl_embedding_dim = 50
dropout_rate = 0.3
# Function to build the question prediction model
def create_question_prediction_model():
# Input layers
context_input = Input(shape=(max_context_len,), name="context_input")
token_input = Input(shape=(max_token_len,), name="token_input")
ner_input = Input(shape=(max_token_len,), name="ner_input")
srl_input = Input(shape=(max_token_len,), name="srl_input")
q_type_input = Input(shape=(q_type_vocab_size,), name="q_type_input")
# Shared embedding layer for text
text_embedding = Embedding(vocab_size, embedding_dim, name="text_embedding")
# Embeddings for NER and SRL
ner_embedding = Embedding(ner_vocab_size, ner_embedding_dim, name="ner_embedding")(
ner_input
)
srl_embedding = Embedding(srl_vocab_size, srl_embedding_dim, name="srl_embedding")(
srl_input
)
# Apply embeddings
context_embed = text_embedding(context_input)
token_embed = text_embedding(token_input)
# Bidirectional LSTM for context and token-level features
context_lstm = Bidirectional(
LSTM(lstm_units, return_sequences=True, name="context_lstm")
)(context_embed)
# Concat token features (tokens, NER, SRL)
token_features = Concatenate(name="token_features")(
[token_embed, ner_embedding, srl_embedding]
)
token_lstm = Bidirectional(
LSTM(lstm_units, return_sequences=True, name="token_lstm")
)(token_features)
# Apply attention to context LSTM
context_attention = tf.keras.layers.Attention(name="context_attention")(
[context_lstm, context_lstm]
)
# Pool attention outputs
context_att_pool = tf.keras.layers.GlobalMaxPooling1D(name="context_att_pool")(
context_attention
)
token_pool = tf.keras.layers.GlobalMaxPooling1D(name="token_pool")(token_lstm)
# Concatenate all features (no answer feature)
all_features = Concatenate(name="all_features")(
[context_att_pool, token_pool, q_type_input]
)
# Dense layers with expanded capacity for sequence generation
x = Dense(512, activation="relu", name="dense_1")(all_features)
x = Dropout(dropout_rate)(x)
x = Dense(256, activation="relu", name="dense_2")(x)
x = Dropout(dropout_rate)(x)
# Output projection for the sequence decoder
decoder_dense = Dense(vocab_size, activation="softmax", name="decoder_dense")
# Many-to-many architecture for sequence generation
# Decoder LSTM
decoder_lstm = LSTM(lstm_units * 2, return_sequences=True, name="decoder_lstm")
# Project the pooled features to the decoder input size
decoder_input = Dense(lstm_units * 2, activation="relu", name="decoder_input")(x)
# Decoder sequence with teacher forcing
# Expand dimensionality to match expected sequence length
repeated_vector = tf.keras.layers.RepeatVector(max_question_len)(decoder_input)
# Process through decoder LSTM
decoder_outputs = decoder_lstm(repeated_vector)
# Apply dense layer to each timestep
question_output_seq = tf.keras.layers.TimeDistributed(decoder_dense)(
decoder_outputs
)
# Create model
model = Model(
inputs=[
context_input,
token_input,
ner_input,
srl_input,
q_type_input,
],
outputs=question_output_seq,
)
# Compile model with categorical crossentropy for sequence prediction
model.compile(
optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
return model
# Build the model
model = create_question_prediction_model()
model.summary()
# Callback to save the best model
checkpoint = ModelCheckpoint(
"question_prediction_model.h5",
monitor="val_accuracy",
save_best_only=True,
verbose=1,
)
early_stop = EarlyStopping(monitor="val_accuracy", patience=10, verbose=1)
# Reshaping question data for sequence-to-sequence training
# We need to reshape to (samples, max_question_len, 1) for sparse categorical crossentropy
train_question_target = np.expand_dims(train_question, -1)
test_question_target = np.expand_dims(test_question, -1)
# Training parameters
batch_size = 8
epochs = 50
# Train model
history = model.fit(
[train_context, train_token, train_ner, train_srl, train_q_type],
train_question_target,
batch_size=batch_size,
epochs=epochs,
validation_data=(
[test_context, test_token, test_ner, test_srl, test_q_type],
test_question_target,
),
callbacks=[checkpoint, early_stop],
)
# Plot training history
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history["accuracy"])
plt.plot(history.history["val_accuracy"])
plt.title("Model Accuracy")
plt.ylabel("Accuracy")
plt.xlabel("Epoch")
plt.legend(["Train", "Validation"], loc="upper left")
plt.subplot(1, 2, 2)
plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.title("Model Loss")
plt.ylabel("Loss")
plt.xlabel("Epoch")
plt.legend(["Train", "Validation"], loc="upper left")
plt.tight_layout()
plt.savefig("question_prediction_training_history.png")
plt.show()
# Save the model and tokenizers
model.save("question_prediction_model_final.h5")
# Save the tokenizers
tokenizer_data = {
"word_tokenizer": tokenizer.to_json(),
"ner_tokenizer": ner_tokenizer.to_json(),
"srl_tokenizer": srl_tokenizer.to_json(),
"q_type_tokenizer": q_type_tokenizer.to_json(),
"max_context_len": max_context_len,
"max_question_len": max_question_len,
"max_token_len": max_token_len,
}
with open("question_prediction_tokenizers.json", "w") as f:
json.dump(tokenizer_data, f)
print("Model dan tokenizer untuk prediksi pertanyaan berhasil disimpan!")
# Function to predict a question
def predict_question(context, tokens, ner, srl, q_type):
context = preprocess_text(context)
context_seq = tokenizer.texts_to_sequences([context])[0]
token_seq = tokenizer.texts_to_sequences([" ".join(tokens)])[0]
ner_seq = ner_tokenizer.texts_to_sequences([" ".join(ner)])[0]
srl_seq = srl_tokenizer.texts_to_sequences([" ".join(srl)])[0]
context_padded = pad_sequences(
[context_seq], maxlen=max_context_len, padding="post"
)
token_padded = pad_sequences([token_seq], maxlen=max_token_len, padding="post")
ner_padded = pad_sequences([ner_seq], maxlen=max_token_len, padding="post")
srl_padded = pad_sequences([srl_seq], maxlen=max_token_len, padding="post")
# Q-type one-hot encoding
q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
q_type_one_hot = tf.keras.utils.to_categorical(
[q_type_idx], num_classes=q_type_vocab_size
)
# Predict
pred = model.predict(
[context_padded, token_padded, ner_padded, srl_padded, q_type_one_hot],
verbose=1,
)
# Convert prediction to words
pred_seq = np.argmax(pred[0], axis=1)
# Convert indices to words
reverse_word_map = {v: k for k, v in tokenizer.word_index.items()}
pred_words = [reverse_word_map.get(i, "") for i in pred_seq if i != 0]
return " ".join(pred_words)
def evaluate_model_performance(test_data):
# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
# Lists to store scores
bleu_scores = []
rouge1_scores = []
rouge2_scores = []
rougel_scores = []
# Iterate through test data
for i in range(len(test_data)):
# Get test sample
sample_context = contexts[test_data[i]]
sample_tokens = tokens_list[test_data[i]]
sample_ner = ner_list[test_data[i]]
sample_srl = srl_list[test_data[i]]
sample_q_type = q_types[test_data[i]]
actual_question = questions[test_data[i]]
# Predict question
pred_question = predict_question(
sample_context, sample_tokens, sample_ner, sample_srl, sample_q_type
)
# Tokenize for BLEU score
actual_tokens = actual_question.split()
pred_tokens = pred_question.split()
# Calculate BLEU score
# Using unigram, bigram, trigram, and 4-gram
print("kaliamt aktual", actual_tokens)
print("kaliamt prediksi", pred_tokens)
bleu_score = sentence_bleu([actual_tokens], pred_tokens)
bleu_scores.append(bleu_score)
try:
rouge_scores = scorer.score(actual_question, pred_question)
# Extract F1 scores
rouge1_scores.append(rouge_scores["rouge1"].fmeasure)
rouge2_scores.append(rouge_scores["rouge2"].fmeasure)
rougel_scores.append(rouge_scores["rougeL"].fmeasure)
except Exception as e:
print(f"Error calculating ROUGE score: {e}")
# Calculate average scores
results = {
"avg_bleu_score": np.mean(bleu_scores),
"avg_rouge1": np.mean(rouge1_scores),
"avg_rouge2": np.mean(rouge2_scores),
"avg_rougel": np.mean(rougel_scores),
}
return results
loaded_model = load_model("question_prediction_model_final.h5")
with open("question_prediction_tokenizers.json", "r") as f:
tokenizer_data = json.load(f)
# Take a few samples from the test data
sample_idx = random.randint(0, len(test_indices) - 1)
sample_context = contexts[test_indices[sample_idx]]
sample_tokens = tokens_list[test_indices[sample_idx]]
sample_ner = ner_list[test_indices[sample_idx]]
sample_srl = srl_list[test_indices[sample_idx]]
sample_q_type = q_types[test_indices[sample_idx]]
performance_metrics = evaluate_model_performance(test_indices)
print("\nModel Performance Metrics:")
print(f"Average BLEU Score: {performance_metrics['avg_bleu_score']:.4f}")
print(f"Average ROUGE-1 Score: {performance_metrics['avg_rouge1']:.4f}")
print(f"Average ROUGE-2 Score: {performance_metrics['avg_rouge2']:.4f}")
print(f"Average ROUGE-L Score: {performance_metrics['avg_rougel']:.4f}")

View File

@ -0,0 +1,210 @@
import numpy as np
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import re
class QuestionPredictionModel:
def __init__(self, model_path, tokenizer_path):
"""
Initialize question prediction model with pre-trained model and tokenizers
"""
# Load model
self.model = load_model(model_path)
# Load tokenizers
with open(tokenizer_path, 'r') as f:
tokenizer_data = json.load(f)
# Reconstruct tokenizers
self.word_tokenizer = tokenizer_from_json(tokenizer_data['word_tokenizer'])
self.ner_tokenizer = tokenizer_from_json(tokenizer_data['ner_tokenizer'])
self.srl_tokenizer = tokenizer_from_json(tokenizer_data['srl_tokenizer'])
self.q_type_tokenizer = tokenizer_from_json(tokenizer_data['q_type_tokenizer'])
# Get max lengths (the training scripts above do not save max_answer_len,
# so fall back to max_token_len when the key is missing)
self.max_context_len = tokenizer_data['max_context_len']
self.max_answer_len = tokenizer_data.get('max_answer_len', tokenizer_data['max_token_len'])
self.max_question_len = tokenizer_data['max_question_len']
self.max_token_len = tokenizer_data['max_token_len']
# Get vocabulary sizes
self.vocab_size = len(self.word_tokenizer.word_index) + 1
self.q_type_vocab_size = len(self.q_type_tokenizer.word_index) + 1
def preprocess_text(self, text):
"""Basic text preprocessing"""
text = text.lower()
text = re.sub(r"\s+", " ", text).strip()
return text
def predict_question(self, context, answer, tokens, ner, srl, q_type):
"""
Predict a question based on given context, answer, tokens, NER, SRL, and question type
Args:
context (str): The context text
answer (str): The answer to generate a question for
tokens (list): List of tokens
ner (list): List of NER tags corresponding to tokens
srl (list): List of SRL tags corresponding to tokens
q_type (str): Question type ('isian', 'opsi', or 'true_false')
Returns:
str: The predicted question
"""
# Preprocess inputs
context = self.preprocess_text(context)
answer = self.preprocess_text(answer)
# Convert to sequences
context_seq = self.word_tokenizer.texts_to_sequences([context])[0]
answer_seq = self.word_tokenizer.texts_to_sequences([answer])[0]
tokens_seq = self.word_tokenizer.texts_to_sequences([" ".join(tokens)])[0]
ner_seq = self.ner_tokenizer.texts_to_sequences([" ".join(ner)])[0]
srl_seq = self.srl_tokenizer.texts_to_sequences([" ".join(srl)])[0]
# Pad sequences
context_padded = pad_sequences([context_seq], maxlen=self.max_context_len, padding="post")
answer_padded = pad_sequences([answer_seq], maxlen=self.max_answer_len, padding="post")
tokens_padded = pad_sequences([tokens_seq], maxlen=self.max_token_len, padding="post")
ner_padded = pad_sequences([ner_seq], maxlen=self.max_token_len, padding="post")
srl_padded = pad_sequences([srl_seq], maxlen=self.max_token_len, padding="post")
# One-hot encode question type
q_type_idx = self.q_type_tokenizer.word_index.get(q_type, 0)
q_type_categorical = tf.keras.utils.to_categorical(
[q_type_idx], num_classes=self.q_type_vocab_size
)
# Make prediction
predicted_seq = self.model.predict(
[context_padded, answer_padded, tokens_padded, ner_padded, srl_padded, q_type_categorical]
)
# Convert predictions to tokens (taking the highest probability token at each position)
predicted_indices = np.argmax(predicted_seq[0], axis=1)
# Create reversed word index for converting indices back to words
reverse_word_index = {v: k for k, v in self.word_tokenizer.word_index.items()}
# Convert indices to words
predicted_words = []
for idx in predicted_indices:
if idx != 0: # Skip padding tokens
predicted_words.append(reverse_word_index.get(idx, ''))
# Form the question
predicted_question = ' '.join(predicted_words)
# Add "___" to the end based on question type convention
if "___" not in predicted_question:
predicted_question += " ___"
return predicted_question
def batch_predict_questions(self, data):
"""
Predict questions for a batch of data
        Args:
            data (list): List of dictionaries with context, tokens, ner, srl,
                and optionally a "qas" list of answered questions for evaluation
        Returns:
            list: List of result dicts, each with the predicted question and,
                when a reference question exists, the ground truth
"""
results = []
for item in data:
context = item["context"]
tokens = item["tokens"]
ner = item["ner"]
srl = item["srl"]
# If there are Q&A pairs, use them for evaluation
if "qas" in item:
for qa in item["qas"]:
answer = qa["answer"]
q_type = qa["type"]
ground_truth = qa["question"]
predicted_question = self.predict_question(
context, answer, tokens, ner, srl, q_type
)
results.append({
"context": context,
"answer": answer,
"predicted_question": predicted_question,
"ground_truth": ground_truth,
"question_type": q_type
})
else:
# If no Q&A pairs, generate questions for all question types
for q_type in ["isian", "true_false", "opsi"]:
# For demo purposes, use a placeholder answer (would need actual answers in real use)
# In practice, you might extract potential answers from the context
placeholders = {
"isian": "placeholder",
"true_false": "true",
"opsi": "placeholder"
}
predicted_question = self.predict_question(
context, placeholders[q_type], tokens, ner, srl, q_type
)
results.append({
"context": context,
"predicted_question": predicted_question,
"question_type": q_type
})
return results
# Example usage
if __name__ == "__main__":
# Load test data
with open("data_converted.json", "r") as f:
test_data = json.load(f)
# Initialize model
question_predictor = QuestionPredictionModel(
model_path="question_prediction_model_final.h5",
tokenizer_path="question_prediction_tokenizers.json"
)
# Example single prediction
sample = test_data[0]
context = sample["context"]
tokens = sample["tokens"]
ner = sample["ner"]
srl = sample["srl"]
answer = sample["qas"][0]["answer"]
q_type = sample["qas"][0]["type"]
predicted_question = question_predictor.predict_question(
context, answer, tokens, ner, srl, q_type
)
print(f"Context: {context}")
print(f"Answer: {answer}")
print(f"Question Type: {q_type}")
print(f"Predicted Question: {predicted_question}")
print(f"Ground Truth: {sample['qas'][0]['question']}")
# Batch prediction
results = question_predictor.batch_predict_questions(test_data[:3])
print("\nBatch Results:")
for i, result in enumerate(results):
print(f"\nResult {i+1}:")
print(f"Context: {result['context']}")
print(f"Answer: {result.get('answer', 'N/A')}")
print(f"Question Type: {result['question_type']}")
print(f"Predicted Question: {result['predicted_question']}")
if 'ground_truth' in result:
print(f"Ground Truth: {result['ground_truth']}")

View File

@ -0,0 +1,188 @@
import numpy as np
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import re
class QuestionPredictionModel:
def __init__(self, model_path, tokenizer_path):
"""
Initialize question prediction model with pre-trained model and tokenizers
"""
# Load model
self.model = load_model(model_path)
# Load tokenizers
with open(tokenizer_path, "r") as f:
tokenizer_data = json.load(f)
# Reconstruct tokenizers
self.word_tokenizer = tokenizer_from_json(tokenizer_data["word_tokenizer"])
self.ner_tokenizer = tokenizer_from_json(tokenizer_data["ner_tokenizer"])
self.srl_tokenizer = tokenizer_from_json(tokenizer_data["srl_tokenizer"])
self.q_type_tokenizer = tokenizer_from_json(tokenizer_data["q_type_tokenizer"])
# Get max lengths
self.max_context_len = tokenizer_data["max_context_len"]
self.max_question_len = tokenizer_data["max_question_len"]
self.max_token_len = tokenizer_data["max_token_len"]
# Get vocabulary sizes
self.vocab_size = len(self.word_tokenizer.word_index) + 1
self.q_type_vocab_size = len(self.q_type_tokenizer.word_index) + 1
def preprocess_text(self, text):
"""Basic text preprocessing"""
text = text.lower()
text = re.sub(r"\s+", " ", text).strip()
return text
def predict_question(self, context, tokens, ner, srl, q_type):
"""Prediksi pertanyaan berdasarkan konteks dan fitur lainnya"""
# Preprocess
context = self.preprocess_text(context)
# Convert to sequences
context_seq = self.word_tokenizer.texts_to_sequences([context])[0]
token_seq = self.word_tokenizer.texts_to_sequences([" ".join(tokens)])[0]
ner_seq = self.ner_tokenizer.texts_to_sequences([" ".join(ner)])[0]
srl_seq = self.srl_tokenizer.texts_to_sequences([" ".join(srl)])[0]
# Pad sequences
context_padded = pad_sequences(
[context_seq], maxlen=self.max_context_len, padding="post"
)
token_padded = pad_sequences(
[token_seq], maxlen=self.max_token_len, padding="post"
)
ner_padded = pad_sequences([ner_seq], maxlen=self.max_token_len, padding="post")
srl_padded = pad_sequences([srl_seq], maxlen=self.max_token_len, padding="post")
# Q-type one-hot encoding
q_type_idx = self.q_type_tokenizer.word_index.get(q_type, 0)
q_type_one_hot = tf.keras.utils.to_categorical(
[q_type_idx], num_classes=self.q_type_vocab_size
)
# Predict
pred = self.model.predict(
[context_padded, token_padded, ner_padded, srl_padded, q_type_one_hot]
)
# Convert prediction to words
pred_seq = np.argmax(pred[0], axis=1)
# Convert indices to words
reverse_word_map = {v: k for k, v in self.word_tokenizer.word_index.items()}
pred_words = [reverse_word_map.get(i, "") for i in pred_seq if i != 0]
return " ".join(pred_words)
def batch_predict_questions(self, data):
"""
Predict questions for a batch of data
        Args:
            data (list): List of dictionaries with context, tokens, ner, and srl,
                optionally carrying a "qas" list for evaluation
        Returns:
            list: List of result dicts with the predicted question and, when a
                reference exists, the ground-truth question
"""
results = []
for item in data:
context = item["context"]
tokens = item["tokens"]
ner = item["ner"]
srl = item["srl"]
# If there are Q&A pairs, use them for evaluation
if "qas" in item:
for qa in item["qas"]:
q_type = qa["type"]
ground_truth = qa["question"]
predicted_question = self.predict_question(
context, tokens, ner, srl, q_type
)
results.append(
{
"context": context,
"predicted_question": predicted_question,
"ground_truth": ground_truth,
"question_type": q_type,
}
)
            else:
                # If no Q&A pairs, generate one question per question type.
                # Unlike the answer-aware variant, this model takes no answer
                # input, so no placeholder answer is needed.
                for q_type in ["isian", "true_false", "opsi"]:
                    predicted_question = self.predict_question(
                        context, tokens, ner, srl, q_type
                    )
results.append(
{
"context": context,
"predicted_question": predicted_question,
"question_type": q_type,
}
)
return results
# Example usage
if __name__ == "__main__":
# Load test data
with open("data_converted.json", "r") as f:
test_data = json.load(f)
# Initialize model
question_predictor = QuestionPredictionModel(
model_path="question_prediction_model_final.h5",
tokenizer_path="question_prediction_tokenizers.json",
)
# Example single prediction
sample = test_data[0]
context = sample["context"]
tokens = sample["tokens"]
ner = sample["ner"]
srl = sample["srl"]
answer = sample["qas"][0]["answer"]
q_type = sample["qas"][0]["type"]
predicted_question = question_predictor.predict_question(
context, tokens, ner, srl, q_type
)
print(f"Context: {context}")
print(f"Answer: {answer}")
print(f"Question Type: {q_type}")
print(f"Predicted Question: {predicted_question}")
print(f"Ground Truth: {sample['qas'][0]['question']}")
# Batch prediction
# results = question_predictor.batch_predict_questions(test_data[:3])
# print("\nBatch Results:")
# for i, result in enumerate(results):
# print(f"\nResult {i+1}:")
# print(f"Context: {result['context']}")
# print(f"Answer: {result.get('answer', 'N/A')}")
# print(f"Question Type: {result['question_type']}")
# print(f"Predicted Question: {result['predicted_question']}")
# if "ground_truth" in result:
# print(f"Ground Truth: {result['ground_truth']}")
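
# --- Hedged sketch: scoring batch predictions against ground truth ---
# A minimal evaluation helper, not part of the original file. It assumes
# nltk is installed, and scores each result that carries a ground truth:
# sentence-level BLEU (smoothed, since questions are short) and a simple
# unigram ROUGE-1 F1 computed from token overlap, echoing the metrics the
# training script reports. `evaluate_batch` is an illustrative name.
from collections import Counter
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


def evaluate_batch(results):
    smooth = SmoothingFunction().method1
    bleu_scores, rouge1_scores = [], []
    for r in results:
        if "ground_truth" not in r:
            continue  # generated without a reference question; nothing to score
        ref = r["ground_truth"].split()
        hyp = r["predicted_question"].split()
        bleu_scores.append(sentence_bleu([ref], hyp, smoothing_function=smooth))
        # Unigram overlap (clipped counts) for a simple ROUGE-1 F1
        overlap = sum((Counter(ref) & Counter(hyp)).values())
        precision = overlap / len(hyp) if hyp else 0.0
        recall = overlap / len(ref) if ref else 0.0
        rouge1_scores.append(
            2 * precision * recall / (precision + recall) if precision + recall else 0.0
        )
    n = max(len(bleu_scores), 1)
    return {"avg_bleu": sum(bleu_scores) / n, "avg_rouge1": sum(rouge1_scores) / n}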

File diff suppressed because one or more lines are too long

Binary file not shown (image added, 50 KiB)

Binary file not shown (image added, 88 KiB)