In [1]:
import json
import numpy as np
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Concatenate, TimeDistributed, Dense
from tensorflow.keras.callbacks import EarlyStopping

2025-04-23 14:22:17.809700: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-23 14:22:17.810231: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-23 14:22:17.812492: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-23 14:22:17.818482: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745392937.829027 39341 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745392937.8322

In [2]:
with open("dataset_qc.json", encoding="utf-8") as f:
 raw_data = json.load(f)

tokens = [[t.lower().strip() for t in item["tokens"]] for item in raw_data]
ner_tags = [item["ner"] for item in raw_data]
srl_tags = [item["srl"] for item in raw_data]
questions = [item["question"].lower().strip() for item in raw_data]
answers = [item["answer"].lower().strip() for item in raw_data]
types = [item["type"] for item in raw_data]

In [3]:
token_tokenizer = Tokenizer(lower=False, oov_token="")
token_tokenizer.fit_on_texts(tokens)
token_sequences = token_tokenizer.texts_to_sequences(tokens)

ner_encoder = LabelEncoder()
srl_encoder = LabelEncoder()

flat_ner = [tag for seq in ner_tags for tag in seq]
flat_srl = [tag for seq in srl_tags for tag in seq]

ner_encoder.fit(flat_ner)
srl_encoder.fit(flat_srl)

ner_sequences = [ner_encoder.transform(seq).tolist() for seq in ner_tags]
srl_sequences = [srl_encoder.transform(seq).tolist() for seq in srl_tags]

In [4]:
MAX_LEN = max(len(seq) for seq in token_sequences)

token_padded = pad_sequences(token_sequences, maxlen=MAX_LEN, padding='post')
ner_padded = pad_sequences(ner_sequences, maxlen=MAX_LEN, padding='post')
srl_padded = pad_sequences(srl_sequences, maxlen=MAX_LEN, padding='post')

In [5]:
qa_tokenizer = Tokenizer(oov_token="")
qa_tokenizer.fit_on_texts(questions + answers)

question_sequences = qa_tokenizer.texts_to_sequences(questions)
answer_sequences = qa_tokenizer.texts_to_sequences(answers)

question_padded = pad_sequences(question_sequences, maxlen=MAX_LEN, padding='post')
answer_padded = pad_sequences(answer_sequences, maxlen=MAX_LEN, padding='post')


type_encoder = LabelEncoder()
type_labels = type_encoder.fit_transform(types) # bentuk 1D array


In [6]:
X_token = token_tokenizer
X_ner = ner_encoder
X_srl = srl_encoder
y_question = qa_tokenizer
y_answer = answer_padded
y_type = type_labels



MAX_LEN = X_token.shape[1]

# ======================
# 2. Parameter
# ======================
VOCAB_TOKEN = np.max(X_token) + 1
VOCAB_NER = np.max(X_ner) + 1
VOCAB_SRL = np.max(X_srl) + 1
VOCAB_QA = max(np.max(y_question), np.max(y_answer)) + 1
NUM_TYPES = len(np.unique(y_type))

EMB_TOKEN = 128
EMB_TAG = 16
LSTM_UNITS = 256

AttributeError: 'Tokenizer' object has no attribute 'shape'

In [None]:
input_token = Input(shape=(MAX_LEN,), name="token_input")
input_ner = Input(shape=(MAX_LEN,), name="ner_input")
input_srl = Input(shape=(MAX_LEN,), name="srl_input")

# ======================
# 4. Embedding
# ======================
embed_token = Embedding(input_dim=VOCAB_TOKEN, output_dim=EMB_TOKEN)(input_token)
embed_ner = Embedding(input_dim=VOCAB_NER, output_dim=EMB_TAG)(input_ner)
embed_srl = Embedding(input_dim=VOCAB_SRL, output_dim=EMB_TAG)(input_srl)

# Gabung semua embedding
merged = Concatenate()([embed_token, embed_ner, embed_srl])

# ======================
# 5. LSTM
# ======================
lstm_out = LSTM(LSTM_UNITS, return_sequences=True)(merged)

# Output: Question
question_out = TimeDistributed(Dense(VOCAB_QA, activation='softmax'), name="question_output")(lstm_out)

# Output: Answer
answer_out = TimeDistributed(Dense(VOCAB_QA, activation='softmax'), name="answer_output")(lstm_out)

# Output: Type (klasifikasi)
type_repr = LSTM(LSTM_UNITS)(merged) # pakai output dari awal sebelum LSTM pertama
type_out = Dense(NUM_TYPES, activation='softmax', name="type_output")(type_repr)


In [None]:
model = Model(inputs=[input_token, input_ner, input_srl],
 outputs=[question_out, answer_out, type_out])

model.compile(
 optimizer='adam',
 loss={
 "question_output": "sparse_categorical_crossentropy",
 "answer_output": "sparse_categorical_crossentropy",
 "type_output": "sparse_categorical_crossentropy",
 },
 metrics={
 "question_output": "accuracy",
 "answer_output": "accuracy",
 "type_output": "accuracy",
 }
)

# ======================
# 7. Training
# ======================
y_question = np.expand_dims(y_question, -1) # untuk sparse categorical loss
y_answer = np.expand_dims(y_answer, -1)

earlystop = EarlyStopping(patience=4, restore_best_weights=True)

model.fit(
 [X_token, X_ner, X_srl],
 [y_question, y_answer, y_type],
 batch_size=32,
 epochs=30,
 validation_split=0.1,
 callbacks=[earlystop]
)

# ======================
# 8. Simpan Model
# ======================
model.save("model_lstm_qg.h5")
print("✅ Training selesai. Model disimpan.")