In [1]:
# -------------------------------------------------
# 0. Import & Konfigurasi
# -------------------------------------------------
import json, pickle
import numpy as np
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import (
 Input, Embedding, LSTM, Bidirectional, Dense, Concatenate,
 TimeDistributed
)
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

PAD_TOKEN = ""
UNK_TOKEN = "UNK"
START_TOKEN = ""
END_TOKEN = ""
MAXLEN_SRC = 100 # Panjang paragraf maksimal
MAXLEN_TGT = 40 # Panjang pertanyaan/jawaban maksimal
BATCH = 32
EPOCHS = 30

2025-05-10 14:49:40.993078: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-10 14:49:40.996369: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-10 14:49:41.002001: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-10 14:49:41.015917: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746863381.035097 166971 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746863381.038

In [2]:
raw = json.loads(Path("normalize_dataset.json").read_text(encoding="utf-8"))

req = {"tokens","ner","srl","question","answer","type"}
valid, bad = [], []
for i,item in enumerate(raw):
 if (isinstance(item,dict) and not (req-item.keys())
 and all(isinstance(item[k],list) for k in req-{"type"})
 and isinstance(item["type"],str)):
 valid.append(item)
 else:
 bad.append(i)

print(f"Valid {len(valid)} / {len(raw)} (invalid index: {bad[:10]})")

Valid 325 / 325 (invalid index: [])


In [3]:
for ex in valid:
 ex["question_in"] = [START_TOKEN] + ex["question"]
 ex["question_out"] = ex["question"] + [END_TOKEN]

 ex["answer_in"] = [START_TOKEN] + ex["answer"]
 ex["answer_out"] = ex["answer"] + [END_TOKEN]

In [4]:
tok_token = Tokenizer(oov_token=UNK_TOKEN, filters="")
tok_ner = Tokenizer(lower=False, filters="")
tok_srl = Tokenizer(lower=False, filters="")
tok_q = Tokenizer(oov_token=UNK_TOKEN, filters="")
tok_a = Tokenizer(oov_token=UNK_TOKEN, filters="")
tok_type = Tokenizer(lower=False, filters="")

tok_token.fit_on_texts([ex["tokens"] for ex in valid])
tok_ner.fit_on_texts([ex["ner"] for ex in valid])
tok_srl.fit_on_texts([ex["srl"] for ex in valid])
tok_q.fit_on_texts([ex["question_in"]+ex["question_out"] for ex in valid])
tok_a.fit_on_texts([ex["answer_in"]+ex["answer_out"] for ex in valid])
tok_type.fit_on_texts([ex["type"] for ex in valid])

# +1 utk padding
vocab_token = len(tok_token.word_index)+1
vocab_ner = len(tok_ner.word_index)+1
vocab_srl = len(tok_srl.word_index)+1
vocab_q = len(tok_q.word_index)+1
vocab_a = len(tok_a.word_index)+1
vocab_type = len(tok_type.word_index)

In [5]:
def seqs(field, tok, maxlen):
 return pad_sequences(
 tok.texts_to_sequences([ex[field] for ex in valid]),
 maxlen=maxlen, padding="post"
 )

X_tok = seqs("tokens", tok_token, MAXLEN_SRC)
X_ner = seqs("ner", tok_ner, MAXLEN_SRC)
X_srl = seqs("srl", tok_srl, MAXLEN_SRC)

Q_in = seqs("question_in", tok_q, MAXLEN_TGT)
Q_out = seqs("question_out", tok_q, MAXLEN_TGT)
A_in = seqs("answer_in", tok_a, MAXLEN_TGT)
A_out = seqs("answer_out", tok_a, MAXLEN_TGT)

y_type = to_categorical(
 np.array([seq[0]-1 for seq in tok_type.texts_to_sequences([ex["type"] for ex in valid])]),
 num_classes=vocab_type
)

# Expand dims → (batch, seq, 1) agar cocok dgn sparse_cce
Q_out = np.expand_dims(Q_out, -1)
A_out = np.expand_dims(A_out, -1)

In [6]:
(X_tok_tr, X_tok_te,
 X_ner_tr, X_ner_te,
 X_srl_tr, X_srl_te,
 Q_in_tr, Q_in_te,
 Q_out_tr, Q_out_te,
 A_in_tr, A_in_te,
 A_out_tr, A_out_te,
 y_type_tr,y_type_te) = train_test_split(
 X_tok, X_ner, X_srl, Q_in, Q_out, A_in, A_out, y_type,
 test_size=0.2, random_state=42
 )
 

In [7]:
enc_tok = Input(shape=(None,), name="enc_tok")
enc_ner = Input(shape=(None,), name="enc_ner")
enc_srl = Input(shape=(None,), name="enc_srl")

emb_tok = Embedding(vocab_token, 128, mask_zero=True)(enc_tok)
emb_ner = Embedding(vocab_ner, 16, mask_zero=True)(enc_ner)
emb_srl = Embedding(vocab_srl, 16, mask_zero=True)(enc_srl)

enc_cat = Concatenate()([emb_tok, emb_ner, emb_srl])
enc_out, state_h, state_c = Bidirectional(
 LSTM(256, return_state=True, return_sequences=False)
)(enc_cat)

# ---------- Klasifikasi tipe ----------
type_out = Dense(vocab_type, activation="softmax", name="type_output")(enc_out)

# ---------- Decoder QUESTION ----------
dec_q_in = Input(shape=(None,), name="dec_q_in")
dec_q_emb = Embedding(vocab_q, 128, mask_zero=True)(dec_q_in)
dec_q_lstm = LSTM(256, return_sequences=True)
dec_q_out = dec_q_lstm(dec_q_emb, initial_state=[state_h, state_c])
q_out = TimeDistributed(Dense(vocab_q, activation="softmax"), name="question_output")(dec_q_out)

# ---------- Decoder ANSWER ----------
dec_a_in = Input(shape=(None,), name="dec_a_in")
dec_a_emb = Embedding(vocab_a, 128, mask_zero=True)(dec_a_in)
dec_a_lstm = LSTM(256, return_sequences=True)
dec_a_out = dec_a_lstm(dec_a_emb, initial_state=[state_h, state_c])
a_out = TimeDistributed(Dense(vocab_a, activation="softmax"), name="answer_output")(dec_a_out)

# ---------- Build & compile ----------
model = Model(
 inputs=[enc_tok, enc_ner, enc_srl, dec_q_in, dec_a_in],
 outputs=[q_out, a_out, type_out]
)

model.compile(
 optimizer="adam",
 loss={
 "question_output": "sparse_categorical_crossentropy",
 "answer_output" : "sparse_categorical_crossentropy",
 "type_output" : "categorical_crossentropy"
 },
 loss_weights={
 "question_output": 1.0,
 "answer_output" : 1.0,
 "type_output" : 0.3
 },
 metrics={
 "question_output": "accuracy",
 "answer_output" : "accuracy",
 "type_output" : "accuracy"
 }
)

model.summary()

2025-05-10 14:49:43.127764: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


ValueError: too many values to unpack (expected 3)

In [None]:
early = EarlyStopping(patience=3, restore_best_weights=True)

model.fit(
 [X_tok_tr, X_ner_tr, X_srl_tr, Q_in_tr, A_in_tr],
 {"question_output": Q_out_tr,
 "answer_output" : A_out_tr,
 "type_output" : y_type_tr},
 batch_size=BATCH,
 epochs=EPOCHS,
 validation_split=0.1,
 callbacks=[early]
)

# -------------------------------------------------
# 8. Simpan model & tokenizer
# -------------------------------------------------
model.save("qg_multitask.keras")
with open("tokenizers.pkl", "wb") as f:
 pickle.dump({
 "token": tok_token,
 "ner" : tok_ner,
 "srl" : tok_srl,
 "q" : tok_q,
 "a" : tok_a,
 "type" : tok_type
 }, f)