import numpy as np
import pandas as pd
import json
import random
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
    Input,
    LSTM,
    Dense,
    Embedding,
    Bidirectional,
    Concatenate,
    Attention,
    Dropout,
    GlobalMaxPooling1D,
)
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re
import string
from collections import Counter

# Example data that was provided (kept for reference; the actual dataset is
# loaded from data_converted.json below)
# data = [
#     {
#         "context": "raden ajeng kartini lahir pada 21 april 1879 di jepara",
#         "tokens": [
#             "raden", "ajeng", "kartini", "lahir", "pada", "21", "april", "1879", "di", "jepara"
#         ],
#         "ner": [
#             "PER", "PER", "PER", "O", "O", "DATE", "DATE", "DATE", "O", "LOC"
#         ],
#         "srl": [
#             "ARG0", "ARG0", "ARG0", "V", "O", "ARGM-TMP", "ARGM-TMP", "ARGM-TMP", "O", "ARGM-LOC"
#         ],
#         "qas": [
#             {
#                 "type": "isian",
#                 "question": "Dimana kartini lahir ___",
#                 "answer": "jepara",
#                 "id": "qa_0_q1"
#             },
#             {
#                 "type": "true_false",
#                 "question": "Kartini lahir pada tanggal 21 mei 1879 ___",
#                 "options": ["true", "false"],
#                 "answer": "false",
#                 "id": "qa_0_q2"
#             }
#         ]
#     },
#     {
#         "context": "kerajaan majapahit berdiri pada tahun 1293 di trowulan",
#         "tokens": [
#             "kerajaan", "majapahit", "berdiri", "pada", "tahun", "1293", "di", "trowulan"
#         ],
#         "ner": [
#             "O", "ORG", "O", "O", "O", "DATE", "O", "LOC"
#         ],
#         "srl": [
#             "ARG1", "ARG1", "V", "O", "O", "ARGM-TMP", "O", "ARGM-LOC"
#         ],
#         "qas": [
#             {
#                 "type": "opsi",
#                 "question": "Dimana kerajaan majapahit berdiri ___",
#                 "options": ["trowulan", "singasari", "kuta", "banten"],
#                 "answer": "trowulan",
#                 "id": "qa_1_q1"
#             },
#             {
#                 "type": "true_false",
#                 "question": "Kerajaan majapahit berdiri pada tahun 1300 ___",
#                 "options": ["true", "false"],
#                 "answer": "false",
#                 "id": "qa_1_q2"
#             }
#         ]
#     },
#     {
#         "context": "soekarno dan mohammad hatta memproklamasikan kemerdekaan indonesia pada 17 agustus 1945",
#         "tokens": [
#             "soekarno", "dan", "mohammad", "hatta", "memproklamasikan", "kemerdekaan", "indonesia", "pada", "17", "agustus", "1945"
#         ],
#         "ner": [
#             "PER", "O", "PER", "PER", "O", "O", "LOC", "O", "DATE", "DATE", "DATE"
#         ],
#         "srl": [
#             "ARG0", "O", "ARG0", "ARG0", "V", "ARG1", "ARGM-LOC", "O", "ARGM-TMP", "ARGM-TMP", "ARGM-TMP"
#         ],
#         "qas": [
#             {
#                 "type": "isian",
#                 "question": "Pada tanggal berapa kemerdekaan indonesia diproklamasikan ___",
#                 "answer": "17 agustus 1945",
#                 "id": "qa_2_q1"
#             },
#             {
#                 "type": "opsi",
#                 "question": "Siapa yang memproklamasikan kemerdekaan indonesia ___",
#                 "options": ["soekarno", "mohammad hatta", "sudirman", "ahmad yani"],
#                 "answer": "soekarno mohammad hatta",
#                 "id": "qa_2_q2"
#             }
#         ]
#     }
# ]

with open("data_converted.json", "r") as f:
    data = json.load(f)

# # Save to a JSON file for future use
# with open('qa_dataset.json', 'w', encoding='utf-8') as f:
#     json.dump(data, f, ensure_ascii=False, indent=2)


# Preprocessing function
def preprocess_text(text):
    """Perform basic text preprocessing: lowercase and collapse whitespace."""
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    return text


# Prepare the data for the model
def prepare_data(data):
    """Flatten the dataset into one training row per question-answer pair."""
    contexts = []
    tokens_list = []
    ner_list = []
    srl_list = []
    questions = []
    answers = []
    q_types = []

    for item in data:
        for qa in item["qas"]:
            contexts.append(preprocess_text(item["context"]))
            tokens_list.append(item["tokens"])
ner_list.append(item["ner"]) srl_list.append(item["srl"]) questions.append(preprocess_text(qa["question"])) answers.append(qa["answer"]) q_types.append(qa["type"]) return contexts, tokens_list, ner_list, srl_list, questions, answers, q_types # Siapkan data contexts, tokens_list, ner_list, srl_list, questions, answers, q_types = prepare_data( data ) # Tokenizer untuk teks (context dan question) max_vocab_size = 10000 tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="") tokenizer.fit_on_texts(contexts + questions + [" ".join(item) for item in tokens_list]) vocab_size = len(tokenizer.word_index) + 1 # Encoding untuk NER ner_tokenizer = Tokenizer(oov_token="") ner_tokenizer.fit_on_texts([" ".join(ner) for ner in ner_list]) ner_vocab_size = len(ner_tokenizer.word_index) + 1 # Encoding untuk SRL srl_tokenizer = Tokenizer(oov_token="") srl_tokenizer.fit_on_texts([" ".join(srl) for srl in srl_list]) srl_vocab_size = len(srl_tokenizer.word_index) + 1 # Encoding untuk tipe pertanyaan q_type_tokenizer = Tokenizer() q_type_tokenizer.fit_on_texts(q_types) q_type_vocab_size = len(q_type_tokenizer.word_index) + 1 # Konversi token, ner, srl ke sequences def tokens_to_sequences(tokens, ner, srl): """Konversi token, ner, dan srl ke sequences""" token_seqs = [tokenizer.texts_to_sequences([" ".join(t)])[0] for t in tokens] ner_seqs = [ner_tokenizer.texts_to_sequences([" ".join(n)])[0] for n in ner] srl_seqs = [srl_tokenizer.texts_to_sequences([" ".join(s)])[0] for s in srl] return token_seqs, ner_seqs, srl_seqs # Menentukan panjang maksimum untuk padding context_seqs = tokenizer.texts_to_sequences(contexts) question_seqs = tokenizer.texts_to_sequences(questions) token_seqs, ner_seqs, srl_seqs = tokens_to_sequences(tokens_list, ner_list, srl_list) max_context_len = max([len(seq) for seq in context_seqs]) max_question_len = max([len(seq) for seq in question_seqs]) max_token_len = max([len(seq) for seq in token_seqs]) # Pad sequences untuk memastikan semua input sama panjang def pad_all_sequences(context_seqs, question_seqs, token_seqs, ner_seqs, srl_seqs): """Padding semua sequences""" context_padded = pad_sequences(context_seqs, maxlen=max_context_len, padding="post") question_padded = pad_sequences( question_seqs, maxlen=max_question_len, padding="post" ) token_padded = pad_sequences(token_seqs, maxlen=max_token_len, padding="post") ner_padded = pad_sequences(ner_seqs, maxlen=max_token_len, padding="post") srl_padded = pad_sequences(srl_seqs, maxlen=max_token_len, padding="post") return context_padded, question_padded, token_padded, ner_padded, srl_padded # Siapkan encoder untuk jawaban answer_tokenizer = Tokenizer(oov_token="") answer_tokenizer.fit_on_texts(answers) answer_vocab_size = len(answer_tokenizer.word_index) + 1 # Encode tipe pertanyaan - FIX - Menggunakan indeks langsung bukan sequence q_type_indices = [] for q_type in q_types: # Dapatkan indeks tipe pertanyaan (dikurangi 1 karena indeks dimulai dari 1) q_type_idx = q_type_tokenizer.word_index.get(q_type, 0) q_type_indices.append(q_type_idx) # Konversi ke numpy array q_type_indices = np.array(q_type_indices) # One-hot encode tipe pertanyaan q_type_categorical = tf.keras.utils.to_categorical( q_type_indices, num_classes=q_type_vocab_size ) # Pad sequences context_padded, question_padded, token_padded, ner_padded, srl_padded = ( pad_all_sequences(context_seqs, question_seqs, token_seqs, ner_seqs, srl_seqs) ) # Encode jawaban answer_seqs = answer_tokenizer.texts_to_sequences(answers) max_answer_len = max([len(seq) for seq in answer_seqs]) 
# Split the data into train and test sets
indices = list(range(len(context_padded)))
train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)


# Helper to select a subset of the data by indices
def get_subset(data, indices):
    return np.array([data[i] for i in indices])


# Train data
train_context = get_subset(context_padded, train_indices)
train_question = get_subset(question_padded, train_indices)
train_token = get_subset(token_padded, train_indices)
train_ner = get_subset(ner_padded, train_indices)
train_srl = get_subset(srl_padded, train_indices)
train_q_type = get_subset(q_type_categorical, train_indices)
train_answer = get_subset(answer_padded, train_indices)

# Test data
test_context = get_subset(context_padded, test_indices)
test_question = get_subset(question_padded, test_indices)
test_token = get_subset(token_padded, test_indices)
test_ner = get_subset(ner_padded, test_indices)
test_srl = get_subset(srl_padded, test_indices)
test_q_type = get_subset(q_type_categorical, test_indices)
test_answer = get_subset(answer_padded, test_indices)

# Hyperparameters
embedding_dim = 100
lstm_units = 128
ner_embedding_dim = 50
srl_embedding_dim = 50
dropout_rate = 0.3


# Function to build the model
def create_qa_model():
    # Input layers
    context_input = Input(shape=(max_context_len,), name="context_input")
    question_input = Input(shape=(max_question_len,), name="question_input")
    token_input = Input(shape=(max_token_len,), name="token_input")
    ner_input = Input(shape=(max_token_len,), name="ner_input")
    srl_input = Input(shape=(max_token_len,), name="srl_input")
    q_type_input = Input(shape=(q_type_vocab_size,), name="q_type_input")

    # Shared embedding layer for text
    text_embedding = Embedding(vocab_size, embedding_dim, name="text_embedding")

    # Embeddings for the NER and SRL tags
    ner_embedding = Embedding(ner_vocab_size, ner_embedding_dim, name="ner_embedding")(
        ner_input
    )
    srl_embedding = Embedding(srl_vocab_size, srl_embedding_dim, name="srl_embedding")(
        srl_input
    )

    # Apply the shared text embedding
    context_embed = text_embedding(context_input)
    question_embed = text_embedding(question_input)
    token_embed = text_embedding(token_input)

    # Bidirectional LSTMs for the context and the question
    context_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, name="context_lstm")
    )(context_embed)
    question_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, name="question_lstm")
    )(question_embed)

    # Concatenate the token-level features (tokens, NER, SRL)
    token_features = Concatenate(name="token_features")(
        [token_embed, ner_embedding, srl_embedding]
    )
    token_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, name="token_lstm")
    )(token_features)

    # Attention: for each context timestep, attend over the question states
    # (query = context states, value/key = question states)
    context_attention = Attention(name="context_attention")(
        [context_lstm, question_lstm]
    )

    # Pool the sequence outputs
    context_att_pool = GlobalMaxPooling1D(name="context_att_pool")(context_attention)
    question_pool = GlobalMaxPooling1D(name="question_pool")(question_lstm)
    token_pool = GlobalMaxPooling1D(name="token_pool")(token_lstm)

    # Concatenate all features
    all_features = Concatenate(name="all_features")(
        [context_att_pool, question_pool, token_pool, q_type_input]
    )

    # Dense layers
    x = Dense(256, activation="relu", name="dense_1")(all_features)
    x = Dropout(dropout_rate)(x)
    x = Dense(128, activation="relu", name="dense_2")(x)
    x = Dropout(dropout_rate)(x)

    # Output layer over the answer vocabulary
    answer_output = Dense(
        answer_vocab_size, activation="softmax", name="answer_output"
    )(x)

    # Create the model
    model = Model(
        inputs=[
            context_input,
            question_input,
            token_input,
            ner_input,
            srl_input,
            q_type_input,
        ],
        outputs=answer_output,
    )

    # Compile the model
    model.compile(
        optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
    )

    return model


# Build the model
model = create_qa_model()
model.summary()
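
# --- Added visualization (optional, not part of the original script) ---
# tf.keras.utils.plot_model renders the multi-input architecture to an image.
# It needs the optional pydot and graphviz dependencies, so it is wrapped in a
# try/except and can be skipped without affecting training.
try:
    tf.keras.utils.plot_model(
        model, to_file="qa_model.png", show_shapes=True, show_layer_names=True
    )
except Exception:
    pass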
# Callbacks: save the best model and stop early when validation accuracy stalls
checkpoint = ModelCheckpoint(
    "qa_lstm_model.h5", monitor="val_accuracy", save_best_only=True, verbose=1
)
early_stop = EarlyStopping(monitor="val_accuracy", patience=5, verbose=1)

# Training settings
batch_size = 8
epochs = 50

# Reformat the answers for sparse categorical crossentropy:
# only the first token of each answer is used as the label
train_answer_labels = train_answer[:, 0]
test_answer_labels = test_answer[:, 0]

# Train the model
history = model.fit(
    [train_context, train_question, train_token, train_ner, train_srl, train_q_type],
    train_answer_labels,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(
        [test_context, test_question, test_token, test_ner, test_srl, test_q_type],
        test_answer_labels,
    ),
    callbacks=[checkpoint, early_stop],
)

# Save the final model
model.save("qa_lstm_model_final.h5")

# Save the tokenizers and padding lengths
tokenizer_data = {
    "word_tokenizer": tokenizer.to_json(),
    "ner_tokenizer": ner_tokenizer.to_json(),
    "srl_tokenizer": srl_tokenizer.to_json(),
    "answer_tokenizer": answer_tokenizer.to_json(),
    "q_type_tokenizer": q_type_tokenizer.to_json(),
    "max_context_len": max_context_len,
    "max_question_len": max_question_len,
    "max_token_len": max_token_len,
}

with open("qa_tokenizers.json", "w") as f:
    json.dump(tokenizer_data, f)

print("Model and tokenizers saved successfully!")
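
# --- Added examples below (not part of the original training script) ---

# Visualize the training curves recorded by model.fit; matplotlib is already
# imported above. The output filename is arbitrary.
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history["accuracy"], label="train")
plt.plot(history.history["val_accuracy"], label="validation")
plt.title("Accuracy")
plt.xlabel("Epoch")
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(history.history["loss"], label="train")
plt.plot(history.history["val_loss"], label="validation")
plt.title("Loss")
plt.xlabel("Epoch")
plt.legend()
plt.tight_layout()
plt.savefig("training_history.png")

# Minimal inference sketch. It assumes a new sample already carries the same
# annotations as the training data (tokens, NER, SRL, question type), since
# this script never produces them itself. The function name `answer_question`
# and the `sample` dict layout are illustrative, not part of the original code.
from tensorflow.keras.preprocessing.text import tokenizer_from_json


def answer_question(sample, model_path="qa_lstm_model_final.h5",
                    tokenizer_path="qa_tokenizers.json"):
    """Predict the first answer token for one fully annotated sample (sketch)."""
    with open(tokenizer_path, "r") as f:
        tok_data = json.load(f)

    word_tok = tokenizer_from_json(tok_data["word_tokenizer"])
    ner_tok = tokenizer_from_json(tok_data["ner_tokenizer"])
    srl_tok = tokenizer_from_json(tok_data["srl_tokenizer"])
    answer_tok = tokenizer_from_json(tok_data["answer_tokenizer"])
    q_type_tok = tokenizer_from_json(tok_data["q_type_tokenizer"])
    qa_model = load_model(model_path)

    # Encode and pad every input exactly as during training
    ctx = pad_sequences(
        word_tok.texts_to_sequences([preprocess_text(sample["context"])]),
        maxlen=tok_data["max_context_len"], padding="post",
    )
    qst = pad_sequences(
        word_tok.texts_to_sequences([preprocess_text(sample["question"])]),
        maxlen=tok_data["max_question_len"], padding="post",
    )
    tok = pad_sequences(
        word_tok.texts_to_sequences([" ".join(sample["tokens"])]),
        maxlen=tok_data["max_token_len"], padding="post",
    )
    ner = pad_sequences(
        ner_tok.texts_to_sequences([" ".join(sample["ner"])]),
        maxlen=tok_data["max_token_len"], padding="post",
    )
    srl = pad_sequences(
        srl_tok.texts_to_sequences([" ".join(sample["srl"])]),
        maxlen=tok_data["max_token_len"], padding="post",
    )
    q_type_idx = q_type_tok.word_index.get(sample["type"], 0)
    q_type = tf.keras.utils.to_categorical(
        [q_type_idx], num_classes=len(q_type_tok.word_index) + 1
    )

    # The model outputs a distribution over the answer vocabulary; it was
    # trained on the first answer token only, so only that token is returned.
    probs = qa_model.predict([ctx, qst, tok, ner, srl, q_type], verbose=0)[0]
    return answer_tok.index_word.get(int(np.argmax(probs)), "")


# Example call (hypothetical sample reusing the gold annotations shown in the
# commented example data at the top of this script):
# print(answer_question({
#     "context": "kerajaan majapahit berdiri pada tahun 1293 di trowulan",
#     "question": "Dimana kerajaan majapahit berdiri ___",
#     "tokens": ["kerajaan", "majapahit", "berdiri", "pada", "tahun", "1293", "di", "trowulan"],
#     "ner": ["O", "ORG", "O", "O", "O", "DATE", "O", "LOC"],
#     "srl": ["ARG1", "ARG1", "V", "O", "O", "ARGM-TMP", "O", "ARGM-LOC"],
#     "type": "opsi",
# }))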