In [53]:
import json
from pathlib import Path

file_path = "../dataset/dev_dataset_qg.json"

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding (the dataset contains non-ASCII characters).
raw_content = Path(file_path).read_text(encoding="utf-8")
RAW = json.loads(raw_content)

# Flatten the dataset: one sample per (sentence, question/answer pair).
# Each sample carries the lower-cased sentence tokens, the sentence-level
# NER / SRL tag lists, the question type, and the question / answer token
# lists terminated with "<eos>".
samples = []
for idx, item in enumerate(RAW):
    # Entries that are not dicts cannot be indexed by key; report and skip.
    if not isinstance(item, dict):
        print(
            f"[TypeError] RAW[{idx}]:  Expected dict, got {type(item)} with value: {item}"
        )
        continue

    try:
        for qp in item["qas"]:
            samp = {
                "tokens": [tok.lower() for tok in item["tokens"]],
                "ner": item["ner"],
                "srl": item["srl"],
                "q_type": qp["type"],
                "q_toks": [tok.lower() for tok in qp["question"]] + ["<eos>"],
            }
            # An answer may be a token list or a single string.
            if isinstance(qp["answer"], list):
                samp["a_toks"] = [tok.lower() for tok in qp["answer"]] + ["<eos>"]
            else:
                samp["a_toks"] = [qp["answer"].lower(), "<eos>"]
            samples.append(samp)

    # A failure abandons the remaining pairs of this item; pairs appended
    # before the failure are kept.  The handlers use item.get("tokens") so
    # that reporting cannot itself raise when "tokens" is the missing key.
    except KeyError as e:
        print(f"[KeyError] RAW[{idx}]: Missing key {e}. TOKENS: {item.get('tokens')}")
    except Exception as e:
        print(f"[Unexpected Error] RAW[{idx}]: {e}. TOKENS: {item.get('tokens')}")

print("Total flattened samples:", len(samples))

[TypeError] RAW[260]:  Expected dict, got <class 'str'> with value: PER
[Unexpected Error] RAW[915]: 'list' object has no attribute 'lower'. TOKENS: ['Setelah', 'masuk', 'Islam', 'raja', 'berganti', 'nama', 'menjadi', 'Sultan', 'Isma’il', 'Syah', 'Zill', 'Allah', 'fi', 'al-Alam', 'dan', 'juga', 'ketiga', 'orang', 'putra', 'dan', 'putrinya', 'yaitu', 'Sultan', 'Mudaffar', 'Syah', 'Siti', 'Aisyah', 'dan', 'Sultan', 'Mansyur']
Total flattened samples: 1423


In [42]:
from itertools import chain


def build_vocab(seq_iter, reserved=("<pad>", "<unk>", "<sos>", "<eos>")):
    """Build a token -> integer id mapping.

    Args:
        seq_iter: iterable of token sequences (each itself an iterable of
            hashable tokens).
        reserved: tokens that receive the first ids, in the given order.
            The default is an immutable tuple so the default value can never
            be shared-and-mutated between calls; lists are still accepted.

    Returns:
        dict mapping every reserved token and every distinct token found in
        ``seq_iter`` to an id.  New tokens get ids in first-seen order.
    """
    vocab = {tok: idx for idx, tok in enumerate(reserved)}
    for tok in chain.from_iterable(seq_iter):
        # setdefault only inserts when the token is new, so the first id a
        # token receives is never overwritten.
        vocab.setdefault(tok, len(vocab))
    return vocab


# Sentence-side vocabularies.  The tag vocabularies only reserve padding and
# unknown ids because tags are never generated.
vocab_tok = build_vocab([s["tokens"] for s in samples])
vocab_ner = build_vocab([s["ner"] for s in samples], reserved=["<pad>", "<unk>"])
vocab_srl = build_vocab([s["srl"] for s in samples], reserved=["<pad>", "<unk>"])

# Target-side vocabularies (question and answer decoders).
vocab_q = build_vocab([s["q_toks"] for s in samples])
vocab_a = build_vocab([s["a_toks"] for s in samples])

# Fixed label ids for the question-type classifier.
vocab_typ = dict(isian=0, opsi=1, true_false=2)

print(vocab_q)

{'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3, 'dimana': 4, 'kartini': 5, 'lahir': 6, '___': 7, 'pada': 8, 'tanggal': 9, '21': 10, 'mei': 11, '1879': 12, 'kerajaan': 13, 'majapahit': 14, 'berdiri': 15, 'tahun': 16, '1300': 17, 'berapa': 18, 'kemerdekaan': 19, 'indonesia': 20, 'diproklamasikan': 21, 'siapa': 22, 'yang': 23, 'memproklamasikan': 24, 'lama': 25, 'bumi': 26, 'mengelilingi': 27, 'matahari': 28, 'presiden': 29, 'pertama': 30, 'planet': 31, 'apa': 32, 'paling': 33, 'dekat': 34, 'dengan': 35, 'venus': 36, 'memiliki': 37, 'suhu': 38, 'permukaan': 39, 'tinggi': 40, 'dikenal': 41, 'sebagai': 42, 'merah': 43, 'terbesar': 44, 'di': 45, 'tata': 46, 'surya': 47, 'terkenal': 48, 'cincin': 49, 'indah': 50, 'berwarna': 51, 'biru': 52, 'jauh': 53, 'dari': 54, 'apakah': 55, 'pluto': 56, 'masih': 57, 'dianggap': 58, 'soekarno': 59, 'membacakan': 60, 'teks': 61, 'proklamasi': 62, 'kapan': 63, 'sebutkan': 64, 'dibacakan': 65, 'andi': 66, 'melakukan': 67, 'pergi': 68, 'ke': 69, 'jakarta': 70

In [43]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences


def encode(seq, vmap):
    """Translate a token sequence into ids, using ``vmap["<unk>"]`` for unknown tokens."""
    ids = []
    for token in seq:
        ids.append(vmap.get(token, vmap["<unk>"]))
    return ids


# Longest sequence in each stream; every array below is padded to these
# lengths.  (Computed once - the samples do not change within this cell.)
MAX_SENT = max(len(s["tokens"]) for s in samples)
MAX_Q = max(len(s["q_toks"]) for s in samples)
MAX_A = max(len(s["a_toks"]) for s in samples)

# Encoder inputs: token / NER / SRL ids, right-padded to MAX_SENT.
X_tok = pad_sequences(
    [encode(s["tokens"], vocab_tok) for s in samples], maxlen=MAX_SENT, padding="post"
)
X_ner = pad_sequences(
    [encode(s["ner"], vocab_ner) for s in samples], maxlen=MAX_SENT, padding="post"
)
X_srl = pad_sequences(
    [encode(s["srl"], vocab_srl) for s in samples], maxlen=MAX_SENT, padding="post"
)

# Teacher forcing: decoder input = <sos> + target[:-1] (the target's final
# "<eos>" is dropped), decoder output = the full target including "<eos>".
dec_q_in = pad_sequences(
    [[vocab_q["<sos>"], *encode(s["q_toks"][:-1], vocab_q)] for s in samples],
    maxlen=MAX_Q,
    padding="post",
)
dec_q_out = pad_sequences(
    [encode(s["q_toks"], vocab_q) for s in samples], maxlen=MAX_Q, padding="post"
)

dec_a_in = pad_sequences(
    [[vocab_a["<sos>"], *encode(s["a_toks"][:-1], vocab_a)] for s in samples],
    maxlen=MAX_A,
    padding="post",
)
dec_a_out = pad_sequences(
    [encode(s["a_toks"], vocab_a) for s in samples], maxlen=MAX_A, padding="post"
)

# Question-type labels; a type missing from vocab_typ raises KeyError here.
y_type = np.array([vocab_typ[s["q_type"]] for s in samples])

In [44]:
import tensorflow as tf
from tensorflow.keras.layers import (
    Input,
    Embedding,
    LSTM,
    Concatenate,
    Dense,
    TimeDistributed,
)
from tensorflow.keras.models import Model

# ---- constants ---------------------------------------------------
d_tok = 32  # token embedding dim
d_tag = 16  # NER / SRL embedding dim
units = 64  # hidden size shared by the encoder LSTM and both decoder LSTMs

# ---- encoder -----------------------------------------------------
# Three parallel id sequences of length MAX_SENT: tokens, NER tags, SRL tags.
inp_tok = Input((MAX_SENT,), name="tok_in")
inp_ner = Input((MAX_SENT,), name="ner_in")
inp_srl = Input((MAX_SENT,), name="srl_in")

# All three encoder embeddings use mask_zero=False so the concatenated
# streams agree on masking (none of them produces a mask).
# NOTE(review): without a mask the encoder LSTM is not told to skip the
# zero padding, so padded positions are presumably processed like any other
# id and influence the final state - confirm this is intended.
emb_tok = Embedding(len(vocab_tok), d_tok, mask_zero=False, name="embedding_tok")(
    inp_tok
)
emb_ner = Embedding(len(vocab_ner), d_tag, mask_zero=False, name="embedding_ner")(
    inp_ner
)
emb_srl = Embedding(len(vocab_srl), d_tag, mask_zero=False, name="embedding_srl")(
    inp_srl
)

# Per-position feature vector = token embedding + NER embedding + SRL embedding.
enc_concat = Concatenate()([emb_tok, emb_ner, emb_srl])
# enc_out feeds the question-type classifier; (state_h, state_c) seed both
# decoders below.
enc_out, state_h, state_c = LSTM(units, return_state=True, name="encoder_lstm")(
    enc_concat
)


# ---------- DECODER : Question ----------
# Teacher-forced decoder over MAX_Q steps, started from the encoder state.
# Layer names are looked up again by the inference cell, so keep them stable.
dec_q_inp = Input(shape=(MAX_Q,), name="dec_q_in")
dec_emb_q = Embedding(len(vocab_q), d_tok, mask_zero=True, name="embedding_q_decoder")(
    dec_q_inp
)
dec_q, _, _ = LSTM(
    units, return_state=True, return_sequences=True, name="lstm_q_decoder"
)(dec_emb_q, initial_state=[state_h, state_c])
# Softmax over the question vocabulary at every time step.
q_out = TimeDistributed(
    Dense(len(vocab_q), activation="softmax", name="dense_q_output"), name="q_output"
)(dec_q)

# ---------- DECODER : Answer ----------
# Same structure as the question decoder, with its own embedding, LSTM and
# output layer, started from the same encoder state.
dec_a_inp = Input(shape=(MAX_A,), name="dec_a_in")
dec_emb_a = Embedding(len(vocab_a), d_tok, mask_zero=True, name="embedding_a_decoder")(
    dec_a_inp
)
dec_a, _, _ = LSTM(
    units, return_state=True, return_sequences=True, name="lstm_a_decoder"
)(dec_emb_a, initial_state=[state_h, state_c])
a_out = TimeDistributed(
    Dense(len(vocab_a), activation="softmax", name="dense_a_output"), name="a_output"
)(dec_a)

# ---------- CLASSIFIER : Question Type ----------
# Single softmax layer on the encoder's final output.
type_out = Dense(len(vocab_typ), activation="softmax", name="type_output")(enc_out)

# Input order matters: the inference cell reads model.input_shape by index.
model = Model(
    inputs=[inp_tok, inp_ner, inp_srl, dec_q_inp, dec_a_inp],
    outputs=[q_out, a_out, type_out],
)

model.summary()

In [45]:
# Every head is trained with the same sparse cross-entropy loss; the
# question-type head contributes with a smaller weight to the total loss.
losses = dict.fromkeys(
    ("q_output", "a_output", "type_output"), "sparse_categorical_crossentropy"
)
loss_weights = dict(q_output=1.0, a_output=1.0, type_output=0.3)

model.compile(
    optimizer="adam",
    loss=losses,
    loss_weights=loss_weights,
    metrics=dict(
        q_output="sparse_categorical_accuracy",
        a_output="sparse_categorical_accuracy",
        type_output="accuracy",
    ),
)

# Inputs and targets are listed in the same order as the model's inputs and
# outputs.  Early stopping restores the best weights seen during training.
history = model.fit(
    x=[X_tok, X_ner, X_srl, dec_q_in, dec_a_in],
    y=[dec_q_out, dec_a_out, y_type],
    batch_size=64,
    epochs=30,
    validation_split=0.1,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=4, restore_best_weights=True)],
    verbose=1,
)

# Persist the trained model for the inference cells.
model.save("full_seq2seq.keras")

import json
import pickle

# def save_vocab(vocab, path):
#     with open(path, "w", encoding="utf-8") as f:
#         json.dump(vocab, f, ensure_ascii=False, indent=2)

# # Simpan semua vocab
# save_vocab(vocab_tok, "vocab_tok.json")
# save_vocab(vocab_ner, "vocab_ner.json")
# save_vocab(vocab_srl, "vocab_srl.json")
# save_vocab(vocab_q,   "vocab_q.json")
# save_vocab(vocab_a,   "vocab_a.json")
# save_vocab(vocab_typ, "vocab_typ.json")


def save_vocab_pkl(vocab, path):
    """Pickle ``vocab`` to the file at ``path``, overwriting any existing file."""
    with open(path, mode="wb") as sink:
        pickle.dump(vocab, sink)


# Save every vocabulary next to the model so inference can reload them.
for _vocab, _filename in (
    (vocab_tok, "vocab_tok.pkl"),
    (vocab_ner, "vocab_ner.pkl"),
    (vocab_srl, "vocab_srl.pkl"),
    (vocab_q, "vocab_q.pkl"),
    (vocab_a, "vocab_a.pkl"),
    (vocab_typ, "vocab_typ.pkl"),
):
    save_vocab_pkl(_vocab, _filename)

Epoch 1/30
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 46ms/step - a_output_loss: 6.2977 - a_output_sparse_categorical_accuracy: 0.0491 - loss: 13.0486 - q_output_loss: 6.5023 - q_output_sparse_categorical_accuracy: 0.0484 - type_output_accuracy: 0.6917 - type_output_loss: 0.8213 - val_a_output_loss: 5.7298 - val_a_output_sparse_categorical_accuracy: 0.0833 - val_loss: 11.6245 - val_q_output_loss: 5.8765 - val_q_output_sparse_categorical_accuracy: 0.0949 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0610
Epoch 2/30
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - a_output_loss: 5.4265 - a_output_sparse_categorical_accuracy: 0.0833 - loss: 11.3418 - q_output_loss: 5.7511 - q_output_sparse_categorical_accuracy: 0.0848 - type_output_accuracy: 0.8554 - type_output_loss: 0.5355 - val_a_output_loss: 4.9895 - val_a_output_sparse_categorical_accuracy: 0.0833 - val_loss: 9.5240 - val_q_output_loss: 4.4785 - val_q_output_sparse_catego

In [46]:
import tensorflow as tf
import numpy as np
import pickle
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.layers import Input, Concatenate

# === Load the main model ===
# Reads the file written by model.save("full_seq2seq.keras") in the training cell.
model = load_model("full_seq2seq.keras")


# === Load vocabularies from .pkl ===
def load_vocab(path):
    """Return the object unpickled from the file at ``path``."""
    with open(path, mode="rb") as source:
        vocab = pickle.load(source)
    return vocab


vocab_tok = load_vocab("vocab_tok.pkl")
vocab_ner = load_vocab("vocab_ner.pkl")
vocab_srl = load_vocab("vocab_srl.pkl")
vocab_q = load_vocab("vocab_q.pkl")
vocab_a = load_vocab("vocab_a.pkl")
vocab_typ = load_vocab("vocab_typ.pkl")

# id -> token lookups used to turn decoder output back into text.
inv_vocab_q = {v: k for k, v in vocab_q.items()}
inv_vocab_a = {v: k for k, v in vocab_a.items()}

# === Build Encoder Model ===
# Sequence lengths are read from the loaded model's inputs, which are ordered
# (tokens, ner, srl, question-decoder input, answer-decoder input).
MAX_SENT = model.input_shape[0][1]  # Sentence length expected by the encoder
MAX_Q = model.input_shape[3][1]  # Max length for question
MAX_A = model.input_shape[4][1]  # Max length for answer

inp_tok_g = Input(shape=(MAX_SENT,), name="tok_in_g")
inp_ner_g = Input(shape=(MAX_SENT,), name="ner_in_g")
inp_srl_g = Input(shape=(MAX_SENT,), name="srl_in_g")

# Reuse the trained embedding layers (and therefore their weights).
emb_tok = model.get_layer("embedding_tok").call(inp_tok_g)
emb_ner = model.get_layer("embedding_ner").call(inp_ner_g)
emb_srl = model.get_layer("embedding_srl").call(inp_srl_g)

enc_concat = Concatenate(name="concat_encoder")([emb_tok, emb_ner, emb_srl])

encoder_lstm = model.get_layer("encoder_lstm")
enc_out, state_h, state_c = encoder_lstm(enc_concat)

# Take the LSTM width from the loaded layer instead of relying on a `units`
# variable left over from the training cell, so this cell also works on a
# fresh kernel where only the saved model and vocabularies exist.
units = encoder_lstm.units

# Create encoder model with full output including enc_out
encoder_model = Model(
    inputs=[inp_tok_g, inp_ner_g, inp_srl_g],
    outputs=[enc_out, state_h, state_c],
    name="encoder_model",
)

# === Build Decoder for Question ===
# Single-step decoder: takes one token id plus the previous LSTM state and
# returns the next-token distribution plus the updated state.
dec_q_inp = Input(shape=(1,), name="dec_q_in")
dec_emb_q = model.get_layer("embedding_q_decoder").call(dec_q_inp)

state_h_dec = Input(shape=(units,), name="state_h_dec")
state_c_dec = Input(shape=(units,), name="state_c_dec")

lstm_decoder_q = model.get_layer("lstm_q_decoder")

dec_out_q, state_h_q, state_c_q = lstm_decoder_q(
    dec_emb_q, initial_state=[state_h_dec, state_c_dec]
)

# Apply the Dense layer wrapped by the training-time TimeDistributed layer.
q_time_dist_layer = model.get_layer("q_output")
dense_q = q_time_dist_layer.layer
q_output = dense_q(dec_out_q)

decoder_q = Model(
    inputs=[dec_q_inp, state_h_dec, state_c_dec],
    outputs=[q_output, state_h_q, state_c_q],
    name="decoder_question_model",
)

# === Build Decoder for Answer ===
# Same single-step construction as the question decoder, using the answer
# decoder's own layers.
dec_a_inp = Input(shape=(1,), name="dec_a_in")
dec_emb_a = model.get_layer("embedding_a_decoder").call(dec_a_inp)

state_h_a = Input(shape=(units,), name="state_h_a")
state_c_a = Input(shape=(units,), name="state_c_a")

lstm_decoder_a = model.get_layer("lstm_a_decoder")

dec_out_a, state_h_a_out, state_c_a_out = lstm_decoder_a(
    dec_emb_a, initial_state=[state_h_a, state_c_a]
)

a_time_dist_layer = model.get_layer("a_output")
dense_a = a_time_dist_layer.layer
a_output = dense_a(dec_out_a)

decoder_a = Model(
    inputs=[dec_a_inp, state_h_a, state_c_a],
    outputs=[a_output, state_h_a_out, state_c_a_out],
    name="decoder_answer_model",
)

# === Build Classifier for Question Type ===
# The trained type head applied to the rebuilt encoder output.
type_dense = model.get_layer("type_output")
type_out = type_dense(enc_out)

classifier_model = Model(
    inputs=[inp_tok_g, inp_ner_g, inp_srl_g], outputs=type_out, name="classifier_model"
)

In [47]:
def encode(seq, vmap):
    """Translate a token sequence into ids, using ``vmap["<unk>"]`` for unknown tokens."""
    ids = []
    for token in seq:
        ids.append(vmap.get(token, vmap["<unk>"]))
    return ids


def encode_and_pad(seq, vmap, max_len=None):
    """Encode ``seq`` to ids and force the result to exactly ``max_len`` items.

    Args:
        seq: iterable of tokens.
        vmap: token -> id mapping; must contain "<unk>" and "<pad>".
        max_len: target length.  ``None`` (the default) means the current
            module-level ``MAX_SENT``.  It is looked up at call time so that
            re-running the model-loading cell cannot leave a stale length
            baked into this function's default.

    Returns:
        list of ids: unknown tokens become ``vmap["<unk>"]``, short sequences
        are right-padded with ``vmap["<pad>"]``, long ones are truncated on
        the right.
    """
    if max_len is None:
        max_len = MAX_SENT
    encoded = [vmap.get(tok, vmap["<unk>"]) for tok in seq]
    # A negative shortfall multiplies to an empty list, so nothing is added.
    padded = encoded + [vmap["<pad>"]] * (max_len - len(encoded))
    return padded[:max_len]


def _greedy_ids(decoder, sos_id, eos_id, h, c, max_len):
    """Run one single-step decoder greedily and return the generated ids.

    Starts from ``sos_id`` and the given LSTM state, repeatedly picks the
    arg-max token and feeds it back in, and stops at ``eos_id`` (not included
    in the result) or after ``max_len`` steps.
    """
    tgt = np.array([[sos_id]])
    generated = []
    for _ in range(max_len):
        probs, h, c = decoder.predict([tgt, h, c], verbose=0)
        next_id = int(probs[0, 0].argmax())
        if next_id == eos_id:
            break
        generated.append(next_id)
        tgt = np.array([[next_id]])  # feed the prediction back as next input
    return generated


def greedy_decode(tokens, ner, srl, max_q=20, max_a=10):
    """Generate a question, an answer and a question type for one sentence.

    Args:
        tokens, ner, srl: either raw token / tag lists (encoded and padded to
            ``MAX_SENT`` here) or, when ``tokens`` is a NumPy array, inputs
            that are already encoded and are passed to the models unchanged.
        max_q: maximum number of question tokens to generate.
        max_a: maximum number of answer tokens to generate.

    Returns:
        ``(question, answer, q_type)`` where ``question`` and ``answer`` are
        lists of token strings (without "<eos>") and ``q_type`` is the key of
        ``vocab_typ`` whose id the classifier scored highest.
    """
    # --- encode encoder inputs -------------------------------------------
    if isinstance(tokens, np.ndarray):
        enc_tok = tokens
        enc_ner = ner
        enc_srl = srl
    else:
        enc_tok = np.array([encode_and_pad(tokens, vocab_tok, MAX_SENT)])
        enc_ner = np.array([encode_and_pad(ner, vocab_ner, MAX_SENT)])
        enc_srl = np.array([encode_and_pad(srl, vocab_srl, MAX_SENT)])

    # --- run the encoder once --------------------------------------------
    # Both decoders start from the same encoder state, so the initial state
    # is kept and reused instead of running the encoder a second time.
    _, h0, c0 = encoder_model.predict([enc_tok, enc_ner, enc_srl], verbose=0)

    question_ids = _greedy_ids(
        decoder_q, vocab_q["<sos>"], vocab_q["<eos>"], h0, c0, max_q
    )
    answer_ids = _greedy_ids(
        decoder_a, vocab_a["<sos>"], vocab_a["<eos>"], h0, c0, max_a
    )

    # --- question type -----------------------------------------------------
    qtype_probs = classifier_model.predict([enc_tok, enc_ner, enc_srl], verbose=0)
    qtype_id = int(qtype_probs.argmax())

    # --- ids back to text --------------------------------------------------
    question = [inv_vocab_q.get(i, "<unk>") for i in question_ids]
    answer = [inv_vocab_a.get(i, "<unk>") for i in answer_ids]
    q_type = [k for k, v in vocab_typ.items() if v == qtype_id][0]

    return question, answer, q_type


def test_model():
    test_data = {
        "tokens": [
            "joko",
            "opik",
            "widodo",
            "lahir",
            "pada",
            "27",
            "maret",
            "1992",
            "di",
            "solo",
        ],
        "ner": [
            "B-PER",
            "I-PER",
            "I-PER",
            "V",
            "O",
            "B-DATE",
            "I-DATE",
            "I-DATE",
            "O",
            "B-LOC",
        ],
        "srl": [
            "ARG0",
            "ARG0",
            "ARG0",
            "V",
            "O",
            "ARGM-TMP",
            "ARGM-TMP",
            "ARGM-TMP",
            "O",
            "ARGM-LOC",
        ],
    }
    # tokens = [
    #     "soekarno",
    #     "membacakan",
    #     "teks",
    #     "proklamasi",
    #     "pada",
    #     "17",
    #     "agustus",
    #     "1945",
    # ]
    # ner_tags = ["B-PER", "O", "O", "O", "O", "B-DATE", "I-DATE", "I-DATE"]
    # srl_tags = ["ARG0", "V", "ARG1", "ARG1", "O", "ARGM-TMP", "ARGM-TMP", "ARGM-TMP"]

    question, answer, q_type = greedy_decode(
        test_data["tokens"], test_data["ner"], test_data["srl"]
    )
    print(f"Generated Question: {' '.join(question)}")
    print(f"Generated Answer  : {' '.join(answer)}")
    print(f"Question Type     : {q_type}")


# Run the single-sentence smoke test defined above.
test_model()

Generated Question: siapa yang lahir di ___
Generated Answer  : 4 juli 1927
Question Type     : isian


In [48]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

# BLEU smoothing function (NLTK's method4), passed to sentence_bleu in evaluate().
smoothie = SmoothingFunction().method4
# ROUGE-1 and ROUGE-L scorer.
# NOTE(review): use_stemmer=True enables rouge_score's built-in stemmer, which
# is presumably English-oriented - confirm it is appropriate for this dataset.
scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)


# Helper to strip special ids
def strip_special(ids, vocab):
    """Return ``ids`` as a list without the "<pad>" and "<eos>" ids.

    "<eos>" must be present in ``vocab``; "<pad>" is optional.
    """
    pad_id = vocab.get("<pad>")
    eos_id = vocab["<eos>"]
    kept = []
    for token_id in ids:
        if token_id in (pad_id, eos_id):
            continue
        kept.append(token_id)
    return kept


def ids_to_text(ids, inv_vocab):
    """Look up every id in ``inv_vocab`` and join the tokens with single spaces."""
    words = [inv_vocab[token_id] for token_id in ids]
    return " ".join(words)


# ---- evaluation over a set of indices ----
import random


def evaluate(indices=None):
    """Print mean BLEU / ROUGE-1 / ROUGE-L of generated questions.

    Args:
        indices: iterable of row indices into the encoded arrays (``X_tok``,
            ``dec_q_out``, ...).  ``None`` draws up to 100 random rows.

    Only the generated *question* is scored; the generated answer and the
    predicted question type are not evaluated here.

    NOTE(review): the rows come from the same arrays that were passed to
    ``model.fit``, so these scores presumably include training samples -
    confirm before reporting them as held-out performance.
    """
    if indices is None:
        indices = random.sample(range(len(X_tok)), k=min(100, len(X_tok)))
    indices = list(indices)
    if not indices:
        # Averaging zero scores would only yield NaN plus a NumPy warning.
        print("No samples to evaluate.")
        return

    bleu_scores, rou1, rouL = [], [], []
    for idx in indices:
        # Ground-truth question ids without padding / end-of-sequence
        gt_q = strip_special(dec_q_out[idx], vocab_q)
        # Prediction (slices keep the batch dimension the models expect)
        q_pred, _, _ = greedy_decode(
            X_tok[idx : idx + 1], X_ner[idx : idx + 1], X_srl[idx : idx + 1]
        )

        # BLEU on question tokens
        reference_tokens = [inv_vocab_q[i] for i in gt_q]
        bleu_scores.append(
            sentence_bleu([reference_tokens], q_pred, smoothing_function=smoothie)
        )
        # ROUGE on question strings
        r = scorer.score(ids_to_text(gt_q, inv_vocab_q), " ".join(q_pred))
        rou1.append(r["rouge1"].fmeasure)
        rouL.append(r["rougeL"].fmeasure)

    print(f"BLEU  : {np.mean(bleu_scores) * 100:.2f}%")
    print(f"ROUGE1: {np.mean(rou1) * 100:.2f}% | ROUGE-L: {np.mean(rouL) * 100:.2f}%")


# Score the model; with no indices given, evaluate() draws its own random sample.
evaluate()

BLEU  : 10.75%
ROUGE1: 27.63% | ROUGE-L: 27.63%
