TIF_E41211115_lstm-quiz-gen.../old/QC/question_generation_train.py

"""
qg_pipeline_static.py
~~~~~~~~~~~~~~~~~~~~~
Question Generation EncoderDecoder LSTM
dengan fitur simbolik NER & SRL (pipeline statis).
Datasets:
train.jsonl / valid.jsonl (lihat format di fungsi `load_jsonl`)
"""
import json, random, numpy as np, tensorflow as tf
from collections import Counter
from pathlib import Path
from sklearn.model_selection import train_test_split
# ------------------------------------------------------------------------------
# 1. BASIC UTILITIES
# ------------------------------------------------------------------------------
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
TRAIN_FILE = "QC/dataset_qc.json"
VALID_RATIO = 0.10
MAX_CTX_LEN = 50
MAX_Q_LEN = 30
WORD_EMB_DIM = 128
BATCH = 32
EPOCHS = 15
SPECIALS_WORD = ("<pad>", "<unk>", "<bos>", "<eos>")
SPECIALS_TAG = ("<pad>",)
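# build_vocab places the specials first, so "<pad>" always gets id 0; the Embedding
# layers below rely on this via mask_zero=True.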
def load_jsonl(path):
"""Muatt satu file JSONLines. Setiap line = dict."""
records = []
with open(path, encoding="utf-8") as f:
for line in f:
obj = json.loads(line)
records.append(obj)
return records
def build_vocab(list_of_seq, specials):
"""Bangun (token->id, id->token) dict dari kumpulan sekuens."""
counter = Counter(tok for seq in list_of_seq for tok in seq)
itos = list(specials) + [tok for tok, _ in counter.most_common()]
stoi = {tok: i for i, tok in enumerate(itos)}
return stoi, itos
def encode(seq, tbl, max_len):
    # The tag vocabularies only define "<pad>", so fall back to it when "<unk>" is absent.
    unk_id = tbl.get("<unk>", tbl["<pad>"])
    ids = [tbl.get(tok, unk_id) for tok in seq]
    return (ids + [tbl["<pad>"]] * max_len)[:max_len]
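# Illustrative example (hypothetical vocab): with tbl = {"<pad>": 0, "<unk>": 1, "siapa": 2}
# and max_len = 4, encode(["siapa", "dia"], tbl, 4) -> [2, 1, 0, 0]
# ("dia" is out-of-vocabulary, mapped to <unk>, and the result is padded to max_len).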
# ------------------------------------------------------------------------------
# 2. DATA PREP
# ------------------------------------------------------------------------------
# def prepare_training_data(file_path):
# """Load → build vocab → encode ke numpy array."""
# recs = load_jsonl(file_path)
# ctx, ner, srl, ques = [], [], [], []
# for r in recs:
# ctx.append(r["context_tokens"])
# ner.append(r["ner_tags"])
# srl.append(r["srl_tags"])
# # add <bos>, <eos>
# ques.append(["<bos>"] + r["question_tokens"] + ["<eos>"])
# # 2.1 vocab
# w2i_ctx, i2w_ctx = build_vocab(ctx, SPECIALS_WORD[:2]) # <pad>,<unk>
# w2i_q, i2w_q = build_vocab(ques, SPECIALS_WORD) # 4 specials
# t2i_ner, _ = build_vocab(ner, SPECIALS_TAG)
# t2i_srl, _ = build_vocab(srl, SPECIALS_TAG)
# # 2.2 encode & pad
# X_tok = np.array([encode(s, w2i_ctx, MAX_CTX_LEN) for s in ctx])
# X_ner = np.array([encode(s, t2i_ner, MAX_CTX_LEN) for s in ner])
# X_srl = np.array([encode(s, t2i_srl, MAX_CTX_LEN) for s in srl])
# Y_in = np.array([encode(s[:-1], w2i_q, MAX_Q_LEN) for s in ques]) # bos..last-1
# Y_out = np.array([encode(s[1:], w2i_q, MAX_Q_LEN) for s in ques]) # 2..eos
# return (
# X_tok,
# X_ner,
# X_srl,
# Y_in,
# Y_out,
# w2i_ctx,
# i2w_ctx,
# w2i_q,
# i2w_q,
# t2i_ner,
# t2i_srl,
# )
# --- replaces the old (commented-out) function above ---
def prepare_training_data(file_path):
recs = load_jsonl(file_path)
ctx, ner, srl, ques, span_st, span_ed = [], [], [], [], [], []
for r in recs:
tokens = r["tokens"]
ctx.append(tokens) # context_tokens
ner.append(r["ner"])
srl.append(r["srl"])
        # --- compute the answer span automatically ---
ans_toks = r["answer"].split()
try:
start = next(
i
for i in range(len(tokens))
if tokens[i : i + len(ans_toks)] == ans_toks
)
end = start + len(ans_toks) - 1
except StopIteration:
raise ValueError(
f"Jawaban '{r['answer']}' tidak cocok dengan tokens {tokens}"
)
span_st.append(start)
span_ed.append(end)
        # question tokens: simple whitespace tokenization
ques.append(["<bos>"] + r["question"].split() + ["<eos>"])
    # ---------- build the vocabularies exactly as before ----------
w2i_ctx, i2w_ctx = build_vocab(ctx, SPECIALS_WORD[:2])
w2i_q, i2w_q = build_vocab(ques, SPECIALS_WORD)
t2i_ner, _ = build_vocab(ner, SPECIALS_TAG)
t2i_srl, _ = build_vocab(srl, SPECIALS_TAG)
# ---------- encode ----------
X_tok = np.array([encode(s, w2i_ctx, MAX_CTX_LEN) for s in ctx])
X_ner = np.array([encode(s, t2i_ner, MAX_CTX_LEN) for s in ner])
X_srl = np.array([encode(s, t2i_srl, MAX_CTX_LEN) for s in srl])
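    # Teacher forcing: Y_in is <bos> + question tokens (decoder input), Y_out is the same
    # sequence shifted one step so it ends with <eos> (per-step prediction target).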
Y_in = np.array([encode(s[:-1], w2i_q, MAX_Q_LEN) for s in ques])
Y_out = np.array([encode(s[1:], w2i_q, MAX_Q_LEN) for s in ques])
    # keep the spans in case a copy mechanism is added later
spans = np.array(list(zip(span_st, span_ed))) # (N, 2)
return (
X_tok,
X_ner,
X_srl,
Y_in,
Y_out,
spans,
w2i_ctx,
i2w_ctx,
w2i_q,
i2w_q,
t2i_ner,
t2i_srl,
)
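# Each line of TRAIN_FILE is expected to be a JSON object of the form (inferred from
# the field accesses above):
#   {"tokens": [...], "ner": [...], "srl": [...], "answer": "...", "question": "..."}
# where "answer" must occur verbatim (after whitespace tokenization) inside "tokens".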
print("> Loading dataset …")
# Unpack all twelve values returned by prepare_training_data (including `spans`).
(X_tok, X_ner, X_srl, Y_in, Y_out, spans,
 w2i_ctx, i2w_ctx, w2i_q, i2w_q, t2i_ner, t2i_srl) = prepare_training_data(TRAIN_FILE)
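# Shapes: X_tok / X_ner / X_srl are (N, MAX_CTX_LEN); Y_in / Y_out are (N, MAX_Q_LEN); spans is (N, 2).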
train_idx, valid_idx = train_test_split(
np.arange(len(X_tok)), test_size=VALID_RATIO, random_state=SEED
)
def pick(arr, idx):
return arr[idx]
train_data = [pick(a, train_idx) for a in (X_tok, X_ner, X_srl, Y_in, Y_out)]
valid_data = [pick(a, valid_idx) for a in (X_tok, X_ner, X_srl, Y_in, Y_out)]
# ------------------------------------------------------------------------------
# 3. MODEL
# ------------------------------------------------------------------------------
def build_model(vocab_ctx, vocab_q, n_ner, n_srl):
tok_in = tf.keras.layers.Input((MAX_CTX_LEN,), name="tok")
ner_in = tf.keras.layers.Input((MAX_CTX_LEN,), name="ner")
srl_in = tf.keras.layers.Input((MAX_CTX_LEN,), name="srl")
dec_in = tf.keras.layers.Input((MAX_Q_LEN,), name="dec")
tok_emb = tf.keras.layers.Embedding(vocab_ctx, WORD_EMB_DIM, mask_zero=True)(tok_in)
ner_emb = tf.keras.layers.Embedding(n_ner, 32, mask_zero=True)(ner_in)
srl_emb = tf.keras.layers.Embedding(n_srl, 32, mask_zero=True)(srl_in)
enc_in = tf.keras.layers.Concatenate()([tok_emb, ner_emb, srl_emb])
enc_out, fwd_h, fwd_c, bwd_h, bwd_c = tf.keras.layers.Bidirectional(
tf.keras.layers.LSTM(WORD_EMB_DIM, return_sequences=True, return_state=True)
)(enc_in)
state_h = tf.keras.layers.Concatenate()([fwd_h, bwd_h])
state_c = tf.keras.layers.Concatenate()([fwd_c, bwd_c])
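    # The bidirectional encoder produces states of size 2 * WORD_EMB_DIM, so the decoder
    # LSTM below uses WORD_EMB_DIM * 2 units to accept [state_h, state_c] directly.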
dec_emb = tf.keras.layers.Embedding(vocab_q, WORD_EMB_DIM, mask_zero=True)(dec_in)
dec_lstm = tf.keras.layers.LSTM(
WORD_EMB_DIM * 2, return_sequences=True, return_state=True
)
dec_out, _, _ = dec_lstm(dec_emb, initial_state=[state_h, state_c])
# Attention (dot)
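    # Dot-product (Luong-style) attention: scores have shape (batch, Q_len, CTX_len),
    # the softmax runs over encoder timesteps, and the context vectors are the
    # attention-weighted sums of enc_out.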
score = tf.keras.layers.Dot(axes=[2, 2])([dec_out, enc_out])
attn_weights = tf.keras.layers.Activation("softmax")(score)
context_vec = tf.keras.layers.Dot(axes=[2, 1])([attn_weights, enc_out])
dec_cat = tf.keras.layers.Concatenate()([dec_out, context_vec])
outputs = tf.keras.layers.TimeDistributed(
tf.keras.layers.Dense(vocab_q, activation="softmax")
)(dec_cat)
mdl = tf.keras.Model([tok_in, ner_in, srl_in, dec_in], outputs)
mdl.compile(
optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
return mdl
print("> Building model …")
model = build_model(len(w2i_ctx), len(w2i_q), len(t2i_ner), len(t2i_srl))
model.summary(line_length=120)
# ------------------------------------------------------------------------------
# 4. DATA GENERATOR
# ------------------------------------------------------------------------------
def generator(data, batch=BATCH):
X_tok, X_ner, X_srl, Y_inp, Y_outp = data
n = len(X_tok)
while True:
idx = np.random.permutation(n)
for i in range(0, n, batch):
b = idx[i : i + batch]
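            # Targets get a trailing axis -> (batch, MAX_Q_LEN, 1), the shape expected
            # by sparse_categorical_crossentropy over the per-step softmax outputs.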
yield [X_tok[b], X_ner[b], X_srl[b], Y_inp[b]], Y_outp[b][..., None]
steps_train = len(train_idx) // BATCH
steps_valid = len(valid_idx) // BATCH
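# Floor division drops the final partial batch from each epoch.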
# ------------------------------------------------------------------------------
# 5. TRAIN
# ------------------------------------------------------------------------------
print("> Training …")
_ = model.fit(
generator(train_data),
steps_per_epoch=steps_train,
validation_data=generator(valid_data),
validation_steps=steps_valid,
epochs=EPOCHS,
)
model.save("qg_lstm_static.h5")
print("✓ Model saved to qg_lstm_static.h5")