import json
import re

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import (
    LSTM,
    Attention,
    Bidirectional,
    Concatenate,
    Dense,
    Dropout,
    Embedding,
    GlobalMaxPooling1D,
    Input,
)
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Load the training dataset
with open("../dataset/stable_qg_qa_train_dataset.json", "r") as f:
    data = json.load(f)

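# Assumed record layout, inferred from the fields accessed below (the tag
# values shown are only illustrative):
#
# {
#     "context": "...",                    # raw passage text
#     "tokens": ["...", ...],              # tokenized passage
#     "ner": ["O", "B-PER", ...],          # one NER tag per token
#     "srl": ["ARG0", "V", ...],           # one SRL tag per token
#     "qas": [{"question": "...", "answer": "...", "type": "..."}]
# }
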
# Preprocessing function
def preprocess_text(text):
    """Apply basic text preprocessing: lowercase and collapse whitespace."""
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    return text


# Prepare the data for the model
def prepare_data(data):
    """Flatten the dataset into parallel lists, one entry per QA pair."""
    contexts = []
    tokens_list = []
    ner_list = []
    srl_list = []
    questions = []
    answers = []
    q_types = []

    for item in data:
        for qa in item["qas"]:
            contexts.append(preprocess_text(item["context"]))
            tokens_list.append(item["tokens"])
            ner_list.append(item["ner"])
            srl_list.append(item["srl"])
            questions.append(preprocess_text(qa["question"]))
            answers.append(qa["answer"])
            q_types.append(qa["type"])

    return contexts, tokens_list, ner_list, srl_list, questions, answers, q_types


# Prepare the data
contexts, tokens_list, ner_list, srl_list, questions, answers, q_types = prepare_data(
    data
)

# Tokenizer for text (context and question)
max_vocab_size = 10000
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(contexts + questions + [" ".join(item) for item in tokens_list])
# Note: word_index keeps every fitted word even when num_words caps the
# sequences, so sizing the embedding from it is safe, if slightly generous
vocab_size = len(tokenizer.word_index) + 1

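# Illustrative encoding (hypothetical indices; actual values depend on the
# fitted corpus): tokenizer.texts_to_sequences(["apa itu"]) -> [[12, 45]]
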
# Encoder for NER tags
ner_tokenizer = Tokenizer(oov_token="<OOV>")
ner_tokenizer.fit_on_texts([" ".join(ner) for ner in ner_list])
ner_vocab_size = len(ner_tokenizer.word_index) + 1

# Encoder for SRL tags
srl_tokenizer = Tokenizer(oov_token="<OOV>")
srl_tokenizer.fit_on_texts([" ".join(srl) for srl in srl_list])
srl_vocab_size = len(srl_tokenizer.word_index) + 1

# Encoder for question types
q_type_tokenizer = Tokenizer()
q_type_tokenizer.fit_on_texts(q_types)
q_type_vocab_size = len(q_type_tokenizer.word_index) + 1


# Convert tokens, NER, and SRL tags to sequences
def tokens_to_sequences(tokens, ner, srl):
    """Convert token, NER, and SRL tag lists to integer sequences."""
    token_seqs = [tokenizer.texts_to_sequences([" ".join(t)])[0] for t in tokens]
    ner_seqs = [ner_tokenizer.texts_to_sequences([" ".join(n)])[0] for n in ner]
    srl_seqs = [srl_tokenizer.texts_to_sequences([" ".join(s)])[0] for s in srl]
    return token_seqs, ner_seqs, srl_seqs


# Determine maximum lengths for padding
context_seqs = tokenizer.texts_to_sequences(contexts)
question_seqs = tokenizer.texts_to_sequences(questions)
token_seqs, ner_seqs, srl_seqs = tokens_to_sequences(tokens_list, ner_list, srl_list)

max_context_len = max(len(seq) for seq in context_seqs)
max_question_len = max(len(seq) for seq in question_seqs)
max_token_len = max(len(seq) for seq in token_seqs)

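# Optional sanity check on the derived padding lengths
print(
    f"max_context_len={max_context_len}, "
    f"max_question_len={max_question_len}, "
    f"max_token_len={max_token_len}"
)
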
# Pad sequences so every input has a uniform length
def pad_all_sequences(context_seqs, question_seqs, token_seqs, ner_seqs, srl_seqs):
    """Pad all sequences to their respective maximum lengths."""
    context_padded = pad_sequences(context_seqs, maxlen=max_context_len, padding="post")
    question_padded = pad_sequences(
        question_seqs, maxlen=max_question_len, padding="post"
    )
    token_padded = pad_sequences(token_seqs, maxlen=max_token_len, padding="post")
    ner_padded = pad_sequences(ner_seqs, maxlen=max_token_len, padding="post")
    srl_padded = pad_sequences(srl_seqs, maxlen=max_token_len, padding="post")
    return context_padded, question_padded, token_padded, ner_padded, srl_padded


# Encoder for answers
answer_tokenizer = Tokenizer(oov_token="<OOV>")
answer_tokenizer.fit_on_texts(answers)
answer_vocab_size = len(answer_tokenizer.word_index) + 1

# Encode question types: use the word index directly rather than sequences
q_type_indices = []
for q_type in q_types:
    # The tokenizer lowercases during fitting, so normalize before the lookup;
    # word_index starts at 1, and unseen types fall back to 0
    q_type_idx = q_type_tokenizer.word_index.get(q_type.lower(), 0)
    q_type_indices.append(q_type_idx)

# Convert to a numpy array
q_type_indices = np.array(q_type_indices)

# One-hot encode the question types
q_type_categorical = tf.keras.utils.to_categorical(
    q_type_indices, num_classes=q_type_vocab_size
)

# Pad sequences
context_padded, question_padded, token_padded, ner_padded, srl_padded = (
    pad_all_sequences(context_seqs, question_seqs, token_seqs, ner_seqs, srl_seqs)
)

# Encode answers
answer_seqs = answer_tokenizer.texts_to_sequences(answers)
max_answer_len = max(len(seq) for seq in answer_seqs)
answer_padded = pad_sequences(answer_seqs, maxlen=max_answer_len, padding="post")

# Split the data into train and test sets
indices = list(range(len(context_padded)))
train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)


# Select a subset of the data by index
def get_subset(data, indices):
    return np.array([data[i] for i in indices])


# Train data
train_context = get_subset(context_padded, train_indices)
train_question = get_subset(question_padded, train_indices)
train_token = get_subset(token_padded, train_indices)
train_ner = get_subset(ner_padded, train_indices)
train_srl = get_subset(srl_padded, train_indices)
train_q_type = get_subset(q_type_categorical, train_indices)
train_answer = get_subset(answer_padded, train_indices)

# Test data
test_context = get_subset(context_padded, test_indices)
test_question = get_subset(question_padded, test_indices)
test_token = get_subset(token_padded, test_indices)
test_ner = get_subset(ner_padded, test_indices)
test_srl = get_subset(srl_padded, test_indices)
test_q_type = get_subset(q_type_categorical, test_indices)
test_answer = get_subset(answer_padded, test_indices)

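# Optional sanity check on the 80/20 split
print(f"Train examples: {len(train_context)}, test examples: {len(test_context)}")
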
# Hyperparameters
embedding_dim = 100
lstm_units = 128
ner_embedding_dim = 50
srl_embedding_dim = 50
dropout_rate = 0.3


# Build the QA model
def create_qa_model():
    # Input layers
    context_input = Input(shape=(max_context_len,), name="context_input")
    question_input = Input(shape=(max_question_len,), name="question_input")
    token_input = Input(shape=(max_token_len,), name="token_input")
    ner_input = Input(shape=(max_token_len,), name="ner_input")
    srl_input = Input(shape=(max_token_len,), name="srl_input")
    q_type_input = Input(shape=(q_type_vocab_size,), name="q_type_input")

    # Shared embedding layer for text
    text_embedding = Embedding(vocab_size, embedding_dim, name="text_embedding")

    # Embeddings for NER and SRL tags
    ner_embedding = Embedding(ner_vocab_size, ner_embedding_dim, name="ner_embedding")(
        ner_input
    )
    srl_embedding = Embedding(srl_vocab_size, srl_embedding_dim, name="srl_embedding")(
        srl_input
    )

    # Apply embeddings
    context_embed = text_embedding(context_input)
    question_embed = text_embedding(question_input)
    token_embed = text_embedding(token_input)

    # Bidirectional LSTMs for the context and the question
    context_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, name="context_lstm")
    )(context_embed)
    question_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, name="question_lstm")
    )(question_embed)

    # Concatenate token-level features (tokens, NER, SRL)
    token_features = Concatenate(name="token_features")(
        [token_embed, ner_embedding, srl_embedding]
    )
    token_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, name="token_lstm")
    )(token_features)

    # Attention over the context, conditioned on the question
    context_attention = Attention(name="context_attention")(
        [context_lstm, question_lstm]
    )

    # Pool attention outputs
    context_att_pool = GlobalMaxPooling1D(name="context_att_pool")(context_attention)
    question_pool = GlobalMaxPooling1D(name="question_pool")(question_lstm)
    token_pool = GlobalMaxPooling1D(name="token_pool")(token_lstm)

    # Concatenate all features
    all_features = Concatenate(name="all_features")(
        [context_att_pool, question_pool, token_pool, q_type_input]
    )

    # Dense layers
    x = Dense(256, activation="relu", name="dense_1")(all_features)
    x = Dropout(dropout_rate)(x)
    x = Dense(128, activation="relu", name="dense_2")(x)
    x = Dropout(dropout_rate)(x)

    # Output layer: a softmax over the answer vocabulary, so the model
    # predicts a single answer token per example
    answer_output = Dense(
        answer_vocab_size, activation="softmax", name="answer_output"
    )(x)

    # Create the model
    model = Model(
        inputs=[
            context_input,
            question_input,
            token_input,
            ner_input,
            srl_input,
            q_type_input,
        ],
        outputs=answer_output,
    )

    # Compile with sparse labels (integer answer-token indices)
    model.compile(
        optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
    )

    return model


# Build the model
model = create_qa_model()
model.summary()

# Callbacks: keep the best checkpoint and stop early when validation plateaus
checkpoint = ModelCheckpoint(
    "qa_lstm_model.h5", monitor="val_accuracy", save_best_only=True, verbose=1
)

early_stop = EarlyStopping(monitor="val_accuracy", patience=5, verbose=1)

# Training
batch_size = 8
epochs = 50

# Reshape answers for sparse categorical crossentropy:
# only the first token of each answer is used as the class label
train_answer_labels = train_answer[:, 0]
test_answer_labels = test_answer[:, 0]

# Train the model
history = model.fit(
    [train_context, train_question, train_token, train_ner, train_srl, train_q_type],
    train_answer_labels,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(
        [test_context, test_question, test_token, test_ner, test_srl, test_q_type],
        test_answer_labels,
    ),
    callbacks=[checkpoint, early_stop],
)

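# Optional: plot the training curves recorded in `history`; the metric keys
# follow from the compile settings above
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history["accuracy"], label="train")
plt.plot(history.history["val_accuracy"], label="val")
plt.title("Accuracy")
plt.xlabel("Epoch")
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(history.history["loss"], label="train")
plt.plot(history.history["val_loss"], label="val")
plt.title("Loss")
plt.xlabel("Epoch")
plt.legend()
plt.tight_layout()
plt.show()
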
# Save the model and tokenizers
model.save("qa_lstm_model_final.h5")

# Save the tokenizers and padding lengths needed at inference time
tokenizer_data = {
    "word_tokenizer": tokenizer.to_json(),
    "ner_tokenizer": ner_tokenizer.to_json(),
    "srl_tokenizer": srl_tokenizer.to_json(),
    "answer_tokenizer": answer_tokenizer.to_json(),
    "q_type_tokenizer": q_type_tokenizer.to_json(),
    "max_context_len": max_context_len,
    "max_question_len": max_question_len,
    "max_token_len": max_token_len,
}

with open("qa_tokenizers.json", "w") as f:
|
|
json.dump(tokenizer_data, f)
|
|
|
|
print("Model dan tokenizer berhasil disimpan!")
|
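

# Optional inference sketch (hypothetical helpers, not part of the training
# pipeline above): reload the saved artifacts and decode a single predicted
# answer token. Assumes the six inputs are preprocessed and padded exactly
# as during training.
from tensorflow.keras.preprocessing.text import tokenizer_from_json


def load_artifacts():
    """Reload the trained model and the tokenizer bundle saved above."""
    qa_model = load_model("qa_lstm_model_final.h5")
    with open("qa_tokenizers.json") as f:
        td = json.load(f)
    answer_tok = tokenizer_from_json(td["answer_tokenizer"])
    return qa_model, td, answer_tok


def predict_answer(qa_model, answer_tok, inputs):
    """inputs = [context, question, token, ner, srl, q_type] padded arrays."""
    probs = qa_model.predict(inputs)
    idx = int(np.argmax(probs[0]))
    # index_word maps the predicted index back to an answer token
    return answer_tok.index_word.get(idx, "<OOV>")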