feat: lstm ner done

akhdanre 2025-04-16 10:52:04 +07:00
parent e072974be7
commit fa116924e4
17 changed files with 5187 additions and 234 deletions

NER/lstm_ner_qc.py Normal file (93 lines added)

@@ -0,0 +1,93 @@
import json
import pickle

import numpy as np
from keras.models import Sequential
from keras.layers import (
    Embedding,
    LSTM,
    Dense,
    TimeDistributed,
    Bidirectional,
    InputLayer,
)
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from seqeval.metrics import classification_report

# Load the labeled NER dataset
with open("dataset/lstm_ner_dataset.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Count label frequencies as a quick class-balance check
total_b_loc = 0
total_o = 0
total_b_per = 0
total_i_per = 0
for block in data:
    for token in block["labels"]:
        if token == "B-LOC":
            total_b_loc += 1
        elif token == "O":
            total_o += 1
        elif token == "B-PER":
            total_b_per += 1
        elif token == "I-PER":
            total_i_per += 1
print("Total B-LOC:", total_b_loc)
print("Total O:", total_o)
print("Total B-PER:", total_b_per)
print("Total I-PER:", total_i_per)
print("Total B-PER + I-PER:", total_b_per + total_i_per)

# Build the word and tag vocabularies
sentences = [[token.lower() for token in item["tokens"]] for item in data]
labels = [item["labels"] for item in data]
words = list(set(word for sentence in sentences for word in sentence))
tags = list(set(tag for label_seq in labels for tag in label_seq))

word2idx = {word: idx + 2 for idx, word in enumerate(words)}
word2idx["PAD"] = 0
word2idx["UNK"] = 1
tag2idx = {tag: idx for idx, tag in enumerate(tags)}
idx2tag = {i: t for t, i in tag2idx.items()}

# Encode the sequences and pad them to a common length
X = [[word2idx.get(w, word2idx["UNK"]) for w in s] for s in sentences]
y = [[tag2idx[t] for t in ts] for ts in labels]
maxlen = max(len(x) for x in X)
X = pad_sequences(X, maxlen=maxlen, padding="post", value=word2idx["PAD"])
y = pad_sequences(y, maxlen=maxlen, padding="post", value=tag2idx["O"])
y = [to_categorical(seq, num_classes=len(tag2idx)) for seq in y]

# BiLSTM sequence-tagging model
model = Sequential()
model.add(InputLayer(input_shape=(maxlen,)))
model.add(Embedding(input_dim=len(word2idx), output_dim=64))
model.add(Bidirectional(LSTM(units=64, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2idx), activation="softmax")))
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()

model.fit(X, np.array(y), batch_size=2, epochs=10)

# Persist the model and the vocabularies for inference
model.save("NER/ner_bilstm_model.keras")
with open("NER/word2idx.pkl", "wb") as f:
    pickle.dump(word2idx, f)
with open("NER/tag2idx.pkl", "wb") as f:
    pickle.dump(tag2idx, f)

# Evaluate on the training data with seqeval
y_true = [[idx2tag[np.argmax(token)] for token in seq] for seq in y]
y_pred = model.predict(X)
y_pred = [[idx2tag[np.argmax(token)] for token in seq] for seq in y_pred]
print(classification_report(y_true, y_pred))
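Note: the classification report above is computed over the padded positions as well, since the label sequences were padded with the "O" tag, which inflates the O-class scores. A minimal sketch of a stricter evaluation (not part of this commit), reusing `sentences`, `y_true`, `y_pred`, and `classification_report` from the script above:

```python
# Hypothetical refinement, not in this commit: score only the real tokens.
# Padded positions were filled with the "O" tag, which skews the report.
true_lengths = [len(s) for s in sentences]
y_true_trim = [seq[:n] for seq, n in zip(y_true, true_lengths)]
y_pred_trim = [seq[:n] for seq, n in zip(y_pred, true_lengths)]
print(classification_report(y_true_trim, y_pred_trim))
```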

NER/ner_bilstm_model.keras Normal file (BIN)
Binary file not shown.

NER/tag2idx.pkl Normal file (BIN)
Binary file not shown.

NER/test_ner.py Normal file (39 lines added)

@@ -0,0 +1,39 @@
import pickle

import numpy as np
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

# Load the trained model and the saved vocabularies
model = load_model("NER/ner_bilstm_model.keras")
with open("NER/word2idx.pkl", "rb") as f:
    word2idx = pickle.load(f)
with open("NER/tag2idx.pkl", "rb") as f:
    tag2idx = pickle.load(f)
idx2tag = {i: t for t, i in tag2idx.items()}

maxlen = 100  # must match the maxlen the model was trained with


def predict_sentence(sentence):
    """Tag a whitespace-tokenized sentence and print token/label pairs."""
    tokens = sentence.strip().split()
    x = [word2idx.get(w.lower(), word2idx["UNK"]) for w in tokens]
    x = pad_sequences([x], maxlen=maxlen, padding="post", value=word2idx["PAD"])
    preds = model.predict(x)
    pred_labels = np.argmax(preds[0], axis=-1)
    print("NER prediction results:")
    for token, label_idx in zip(tokens, pred_labels[: len(tokens)]):
        print(f"{token}\t{idx2tag[label_idx]}")


if __name__ == "__main__":
    try:
        sentence = "dani datang ke indonesia"
        predict_sentence(sentence)
    except KeyboardInterrupt:
        print("\n\nDone.")

NER/word2idx.pkl Normal file (BIN)
Binary file not shown.

combine_nlp_lstm.py Normal file (122 lines added)

@@ -0,0 +1,122 @@
import numpy as np
import tensorflow as tf
import spacy
from types import SimpleNamespace
from nltk.translate.bleu_score import sentence_bleu
from tensorflow.keras.layers import LSTM, Embedding, Dense, Input
from tensorflow.keras.models import Model
from transformers import TFT5ForConditionalGeneration, T5Tokenizer

# === LOAD NLP MODEL ===
nlp = spacy.load("en_core_web_sm")


# === PREPROCESSING FUNCTION ===
def preprocess_text(text):
    """Run Named Entity Recognition (and dependency parsing) over the text."""
    doc = nlp(text)
    entities = {ent.text: ent.label_ for ent in doc.ents}

    # Print the NER results
    print("\nNamed Entities Detected:")
    for ent, label in entities.items():
        print(f"{ent}: {label}")
    return entities


# === LSTM MODEL (SEQUENCE-TO-SEQUENCE) ===
embedding_dim = 128
lstm_units = 256
vocab_size = 5000  # adjust to the dataset

# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(
    decoder_embedding, initial_state=[state_h, state_c]
)
decoder_dense = Dense(vocab_size, activation="softmax")
output = decoder_dense(decoder_outputs)

# Model
lstm_model = Model([encoder_inputs, decoder_inputs], output)
lstm_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")


# === FUNCTION TO GENERATE QUESTION USING LSTM ===
def generate_question_lstm(text, model, tokenizer, max_len=20):
    """Generate a question with the (untrained) LSTM seq2seq model."""
    input_seq = tokenizer.texts_to_sequences([text])
    input_seq = tf.keras.preprocessing.sequence.pad_sequences(input_seq, maxlen=max_len)

    generated_question = []
    start_token = tokenizer.word_index.get("<start>", 1)
    end_token = tokenizer.word_index.get("<end>", 2)
    next_word = start_token
    while next_word != end_token and len(generated_question) < max_len:
        # The decoder input must be rank 2: (batch, timesteps)
        output = model.predict([input_seq, np.array([[next_word]])])
        next_word = np.argmax(output[0, -1, :])
        generated_question.append(tokenizer.index_word.get(next_word, ""))
    return " ".join(generated_question)


# === T5 TRANSFORMER MODEL (TENSORFLOW VERSION) ===
t5_model_name = "t5-small"
t5_model = TFT5ForConditionalGeneration.from_pretrained(t5_model_name)
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)


def generate_question_t5(text):
    """Generate a question with the TensorFlow version of the T5 transformer."""
    input_text = "generate question: " + text
    input_ids = t5_tokenizer.encode(
        input_text, return_tensors="tf"
    )  # TensorFlow tensors
    output = t5_model.generate(input_ids, max_length=50)
    return t5_tokenizer.decode(output[0], skip_special_tokens=True)


# === BLEU SCORE EVALUATION ===
def evaluate_bleu(reference, candidate):
    """Compute the BLEU score between the reference and generated questions."""
    score = sentence_bleu([reference.split()], candidate.split())
    print(f"BLEU Score: {score:.4f}")
    return score


# === MAIN EXECUTION ===
if __name__ == "__main__":
    paragraph = "Albert Einstein mengembangkan teori relativitas pada tahun 1905."

    # Preprocessing
    print("\n🛠️ Preprocessing text...")
    entities = preprocess_text(paragraph)

    # Generate a question with the LSTM
    print("\n🔵 Generating Question using LSTM (Dummy Model)...")
    # The dummy tokenizer mimics the Keras Tokenizer attributes used above
    dummy_tokenizer = SimpleNamespace(
        texts_to_sequences=lambda texts: [[1, 2, 3, 4]],
        word_index={"<start>": 1, "<end>": 2},
        index_word={1: "apa", 2: "siapa", 3: "di", 4: "tahun"},
    )
    question_lstm = generate_question_lstm(paragraph, lstm_model, dummy_tokenizer)
    print(f"LSTM Generated Question: {question_lstm}")

    # Generate a question with T5
    print("\n🟢 Generating Question using T5 Transformer...")
    question_t5 = generate_question_t5(paragraph)
    print(f"T5 Generated Question: {question_t5}")

    # Evaluate the BLEU score
    reference_question = "Kapan teori relativitas dikembangkan?"
    print("\n📊 Evaluating BLEU Score...")
    evaluate_bleu(reference_question, question_t5)

dataset/README.md Normal file (34 lines added)

@@ -0,0 +1,34 @@
# NER

B-PER -> first token of a person entity
I-PER -> middle/final tokens of a person entity
B-LOC -> first token of a location entity
I-LOC -> middle/final tokens of a location entity
B-ORG -> first token of an organization entity
I-ORG -> middle/final tokens of an organization entity
B-MISC -> first token of a miscellaneous entity
I-MISC -> continuation of a miscellaneous entity
B-DATE -> date
B-TIME -> time
O -> token outside any entity

# Semantic Role Labeling (SRL)

ARG0 -> agent (the doer), usually the subject
ARG1 -> patient or theme, the object or thing acted upon
ARG2 -> direction, goal, or result
ARG3 -> starting point (source)
ARG4 -> recipient, end point
ARG5 -> tool or instrument
ARGM-TMP -> time (temporal)
ARGM-LOC -> location (spatial)
ARGM-MNR -> manner
ARGM-CAU -> cause
ARGM-EXT -> extent (degree or comparison)
ARGM-DIS -> discourse marker, e.g. "tetapi" ("but")
ARGM-NEG -> negation, e.g. "tidak" ("not")
ARGM-MOD -> modality: "bisa", "harus", "mungkin" (can, must, may)
ARGM-PRP -> purpose
ARGM-REC -> recipient (sometimes similar to ARG4)
ARGM-COM -> comitative (with whom)
ARGM-ADV -> general adverbial modifier
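
To make the two tag sets concrete, here is a minimal illustration of how tokens align with NER and SRL tags, token by token (the sentence and its labels are a hypothetical example, not drawn from the dataset):

```python
# Illustrative only: per-token alignment of NER (BIO) tags and SRL roles.
tokens = ["Sultan", "Iskandar", "Muda", "memerintah", "Aceh", "pada", "abad", "ke-17"]
ner = ["B-PER", "I-PER", "I-PER", "O", "B-LOC", "O", "B-DATE", "I-DATE"]
srl = ["B-ARG0", "I-ARG0", "I-ARG0", "B-V", "B-ARG1", "B-ARGM-TMP", "I-ARGM-TMP", "I-ARGM-TMP"]

for tok, n, s in zip(tokens, ner, srl):
    print(f"{tok:12} {n:8} {s}")
```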

@@ -0,0 +1,56 @@
[
{
"tokens": [
"Barack",
"Obama",
"melihat",
"bank",
"di",
"tepi",
"sungai",
"."
],
"ner_labels": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC", "O"],
"srl_labels": [
"B-ARG0",
"I-ARG0",
"B-V",
"B-ARG1",
"B-ARGM-LOC",
"I-ARGM-LOC",
"I-ARGM-LOC",
"O"
],
"predicate": "melihat",
"wsd_targets": [
{
"index": 3,
"word": "bank",
"sense": "river_bank",
"sense_id": "bank%1:17:00::"
}
]
},
{
"tokens": ["Dia", "pergi", "ke", "bank", "untuk", "menabung", "."],
"ner_labels": ["O", "O", "O", "O", "O", "O", "O"],
"srl_labels": [
"B-ARG0",
"B-V",
"B-ARGM-DIR",
"I-ARGM-DIR",
"B-ARGM-PRP",
"I-ARGM-PRP",
"O"
],
"predicate": "pergi",
"wsd_targets": [
{
"index": 3,
"word": "bank",
"sense": "financial_institution",
"sense_id": "bank%1:14:00::"
}
]
}
]
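
A minimal sketch of consuming this schema (the path below is an assumption based on the dataset file added in this commit; the keys match the sample above):

```python
import json

# Assumed path: this commit adds dataset/dataset_ner_srl.json with this schema.
with open("dataset/dataset_ner_srl.json", "r", encoding="utf-8") as f:
    examples = json.load(f)

for ex in examples:
    # Every per-token annotation layer must line up with the token list.
    assert len(ex["tokens"]) == len(ex["ner_labels"]) == len(ex["srl_labels"])
    for target in ex.get("wsd_targets", []):
        # "index" points into this example's token list.
        word = ex["tokens"][target["index"]]
        print(f'{word}: {target["sense"]} ({target["sense_id"]})')
```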

dataset/dataset_ner_srl.json Normal file (2018 lines added)

File diff suppressed because it is too large.

File diff suppressed because it is too large.


Binary file not shown.

Binary file not shown.

ner_lstm.ipynb Normal file (640 lines added)

@@ -0,0 +1,640 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/akeon/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package averaged_perceptron_tagger to\n",
"[nltk_data] /home/akeon/nltk_data...\n",
"[nltk_data] Package averaged_perceptron_tagger is already up-to-\n",
"[nltk_data] date!\n",
"[nltk_data] Downloading package maxent_ne_chunker to\n",
"[nltk_data] /home/akeon/nltk_data...\n",
"[nltk_data] Package maxent_ne_chunker is already up-to-date!\n",
"[nltk_data] Downloading package words to /home/akeon/nltk_data...\n",
"[nltk_data] Package words is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import json\n",
"import string, re, pickle\n",
"import numpy as np\n",
"from nltk.tokenize import word_tokenize\n",
"from tensorflow.keras.preprocessing.text import Tokenizer\n",
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
"from sklearn.model_selection import train_test_split\n",
"from tensorflow.keras.models import Model\n",
"from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, TimeDistributed, Dense\n",
"import spacy\n",
"import nltk\n",
"\n",
"\n",
"nltk.download ('punkt')\n",
"nltk.download ('averaged_perceptron_tagger')\n",
"nltk.download ('maxent_ne_chunker')\n",
"nltk.download ('words')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Kerajaan Aceh PER\n",
"Iskandar Muda PER\n",
"ke-17 MISC\n",
"Islam MISC\n",
"Nusantara LOC\n"
]
}
],
"source": [
"import spacy\n",
"\n",
"# Load model multibahasa yang mendukung Indonesia\n",
"nlp = spacy.load(\"xx_ent_wiki_sm\")\n",
"\n",
"# Contoh teks\n",
"text = \"Kerajaan Aceh mencapai puncak kejayaannya di bawah pemerintahan Sultan Iskandar Muda pada abad ke-17. Aceh menjadi pusat perdagangan dan kebudayaan Islam di wilayah Nusantara.\"\n",
"\n",
"# Proses teks dengan model\n",
"doc = nlp(text)\n",
"\n",
"# Cetak entitas yang dikenali\n",
"for ent in doc.ents:\n",
" print(ent.text, ent.label_)\n",
" \n",
"\n",
"# def generate_ner_context(text):\n",
"# # Load the pretrained spaCy model (small Indo model or use multilingual model if needed)\n",
"# nlp = spacy.load(\"xx_ent_wiki_sm\") # Load multilingual model\n",
" \n",
"# # Process the text\n",
"# doc = nlp(text)\n",
" \n",
"# # Tokenization and Named Entity Recognition (NER)\n",
"# tokens = [token.text for token in doc]\n",
"# ner_tags = []\n",
"# for token in doc:\n",
"# if token.ent_type_:\n",
"# ner_tags.append(f\"B-{token.ent_type_}\")\n",
"# else:\n",
"# ner_tags.append(\"O\")\n",
" \n",
"# return tokens, ner_tags\n",
"\n",
"# # Example input context\n",
"# context = \"Perang Diponegoro berlangsung dari tahun 1825 hingga 1830. Perang ini dipimpin oleh Pangeran Diponegoro melawan pemerintah kolonial Belanda di Jawa Tengah.\"\n",
"\n",
"# # Generate NER and tokens\n",
"# tokens, ner_tags = generate_ner_context(context)\n",
"\n",
"# # Construct the JSON result\n",
"# result = {\n",
"# \"context\": context,\n",
"# \"context_tokens\": tokens,\n",
"# \"context_ner\": ner_tags,\n",
"# \"question_posibility\": [\n",
"# {\n",
"# \"type\": \"true_false\",\n",
"# \"question\": \"Perang Diponegoro berlangsung selama lima tahun.\",\n",
"# \"answer\": \"True\"\n",
"# },\n",
"# {\n",
"# \"type\": \"true_false\",\n",
"# \"question\": \"Perang Diponegoro berakhir pada tahun 1850.\",\n",
"# \"answer\": \"False\"\n",
"# }\n",
"# ]\n",
"# }\n",
"\n",
"# # Output the result\n",
"# import json\n",
"# print(json.dumps(result, indent=4, ensure_ascii=False))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"dataset = [\n",
" {\n",
" \"context\": \"Pertempuran Surabaya terjadi pada 10 November 1945 antara pasukan Indonesia melawan pasukan sekutu Inggris yang berusaha mengambil alih kota setelah Jepang menyerah dalam Perang Dunia II. Pertempuran ini dikenang sebagai Hari Pahlawan di Indonesia.\",\n",
" \"context_tokens\": [\n",
" \"Pertempuran\", \"Surabaya\", \"terjadi\", \"pada\", \"10\", \"November\", \"1945\",\n",
" \"antara\", \"pasukan\", \"Indonesia\", \"melawan\", \"pasukan\", \"sekutu\", \"Inggris\",\n",
" \"yang\", \"berusaha\", \"mengambil\", \"alih\", \"kota\", \"setelah\", \"Jepang\", \"menyerah\",\n",
" \"dalam\", \"Perang\", \"Dunia\", \"II\", \".\", \"Pertempuran\", \"ini\", \"dikenang\", \"sebagai\",\n",
" \"Hari\", \"Pahlawan\", \"di\", \"Indonesia\", \".\"\n",
" ],\n",
" \"context_ner\": [\n",
" \"O\", \"B-LOC\", \"O\", \"O\", \"B-DATE\", \"I-DATE\", \"I-DATE\",\n",
" \"O\", \"O\", \"B-LOC\", \"O\", \"O\", \"O\", \"B-LOC\",\n",
" \"O\", \"O\", \"O\", \"O\", \"O\", \"O\", \"B-LOC\", \"O\",\n",
" \"O\", \"B-MISC\", \"I-MISC\", \"I-MISC\", \"O\", \"O\", \"O\", \"O\", \"O\",\n",
" \"O\", \"O\", \"O\", \"B-LOC\", \"O\"\n",
" ],\n",
" \"question_posibility\": [\n",
" {\n",
" \"type\": \"fill_in_the_blank\",\n",
" \"question\": \"Pertempuran Surabaya terjadi pada tanggal _______.\",\n",
" \"answer\": \"10 November 1945\"\n",
" },\n",
" {\n",
" \"type\": \"multiple_choice\",\n",
" \"question\": \"Pasukan yang dihadapi Indonesia dalam Pertempuran Surabaya berasal dari negara apa?\",\n",
" \"options\": [\"Jepang\", \"Belanda\", \"Inggris\", \"Australia\"],\n",
" \"answer\": \"Inggris\"\n",
" },\n",
" {\n",
" \"type\": \"true_false\",\n",
" \"question\": \"Pertempuran Surabaya diperingati sebagai Hari Pahlawan di Indonesia.\",\n",
" \"answer\": \"True\"\n",
" }\n",
" ]\n",
" },\n",
" {\n",
" \"context\": \"Perang Diponegoro berlangsung dari tahun 1825 hingga 1830. Perang ini dipimpin oleh Pangeran Diponegoro melawan pemerintah kolonial Belanda di Jawa Tengah.\",\n",
" \"context_tokens\": [\n",
" \"Perang\", \"Diponegoro\", \"berlangsung\", \"dari\", \"tahun\", \"1825\", \"hingga\", \"1830\", \".\",\n",
" \"Perang\", \"ini\", \"dipimpin\", \"oleh\", \"Pangeran\", \"Diponegoro\", \"melawan\",\n",
" \"pemerintah\", \"kolonial\", \"Belanda\", \"di\", \"Jawa\", \"Tengah\", \".\"\n",
" ],\n",
" \"context_ner\": [\n",
" \"O\", \"B-PER\", \"O\", \"O\", \"O\", \"B-DATE\", \"O\", \"B-DATE\", \"O\",\n",
" \"O\", \"O\", \"O\", \"O\", \"B-PER\", \"I-PER\", \"O\",\n",
" \"O\", \"O\", \"B-LOC\", \"O\", \"O\", \"B-LOC\", \"O\"\n",
" ],\n",
" \"question_posibility\": [\n",
" {\n",
" \"type\": \"true_false\",\n",
" \"question\": \"Perang Diponegoro berlangsung selama lima tahun.\",\n",
" \"answer\": \"True\"\n",
" },\n",
" {\n",
" \"type\": \"true_false\",\n",
" \"question\": \"Perang Diponegoro berakhir pada tahun 1850.\",\n",
" \"answer\": \"False\"\n",
" }\n",
" ]\n",
" }\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"contexts_tokens = []\n",
"contexts_ner = []\n",
"questions = []\n",
"answers = []\n",
"qtypes = []\n",
"\n",
"for entry in dataset:\n",
" contexts_tokens.append(entry[\"context_tokens\"])\n",
" contexts_ner.append(entry[\"context_ner\"])\n",
" qa = entry[\"question_posibility\"][0] # pilih soal pertama\n",
" questions.append(qa[\"question\"])\n",
" answers.append(qa[\"answer\"])\n",
" qtypes.append(qa[\"type\"]) # misalnya \"fill_in_the_blank\"\n",
"\n",
"# ----------------------------\n",
"# Tokenisasi untuk Kata\n",
"# ----------------------------\n",
"# Kita gabungkan semua teks dari context (dari tokens), pertanyaan, dan jawaban\n",
"all_texts = []\n",
"for tokens in contexts_tokens:\n",
" all_texts.append(\" \".join(tokens))\n",
"all_texts += questions\n",
"all_texts += answers\n",
"\n",
"tokenizer = Tokenizer(oov_token=\"<OOV>\")\n",
"tokenizer.fit_on_texts(all_texts)\n",
"\n",
"# Ubah context_tokens menjadi sequence angka\n",
"context_sequences = [tokenizer.texts_to_sequences([\" \".join(tokens)])[0] for tokens in contexts_tokens]\n",
"question_sequences = tokenizer.texts_to_sequences(questions)\n",
"answer_sequences = tokenizer.texts_to_sequences(answers)\n",
"\n",
"# Padding sequence ke panjang tetap\n",
"MAX_LENGTH = 50 # sesuaikan dengan panjang teks maksimum yang diinginkan\n",
"context_padded = pad_sequences(context_sequences, maxlen=MAX_LENGTH, padding=\"post\", truncating=\"post\")\n",
"question_padded = pad_sequences(question_sequences, maxlen=MAX_LENGTH, padding=\"post\", truncating=\"post\")\n",
"answer_padded = pad_sequences(answer_sequences, maxlen=MAX_LENGTH, padding=\"post\", truncating=\"post\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# ----------------------------\n",
"# Tokenisasi untuk Label NER\n",
"# ----------------------------\n",
"# Kumpulkan semua label NER untuk membangun mapping label ke indeks\n",
"all_ner_labels = []\n",
"for ner_seq in contexts_ner:\n",
" all_ner_labels += ner_seq\n",
"\n",
"ner_labels_set = sorted(list(set(all_ner_labels)))\n",
"# Contoh: ['B-DATE', 'B-LOC', 'B-MISC', 'B-PER', 'I-DATE', 'I-MISC', 'I-PER', 'O']\n",
"ner2idx = {label: idx for idx, label in enumerate(ner_labels_set)}\n",
"idx2ner = {idx: label for label, idx in ner2idx.items()}\n",
"\n",
"# Ubah label NER ke dalam bentuk sequence angka\n",
"ner_sequences = []\n",
"for ner_seq in contexts_ner:\n",
" seq = [ner2idx[label] for label in ner_seq]\n",
" ner_sequences.append(seq)\n",
"\n",
"# Padding sequence label NER (gunakan nilai default misal label \"O\")\n",
"ner_padded = pad_sequences(ner_sequences, maxlen=MAX_LENGTH, padding=\"post\", truncating=\"post\", value=ner2idx[\"O\"])\n",
"\n",
"# ----------------------------\n",
"# Label Tipe Soal\n",
"# ----------------------------\n",
"qtype_dict = {\"fill_in_the_blank\": 0, \"true_false\": 1, \"multiple_choice\": 2}\n",
"qtype_labels = np.array([qtype_dict[q] for q in qtypes])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# ----------------------------\n",
"# Split Data Training dan Validation\n",
"# ----------------------------\n",
"(context_train, context_val, \n",
" question_train, question_val, \n",
" answer_train, answer_val, \n",
" qtype_train, qtype_val,\n",
" ner_train, ner_val) = train_test_split(\n",
" context_padded, question_padded, answer_padded, qtype_labels, ner_padded,\n",
" test_size=0.2, random_state=42\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-03-23 15:06:59.338033: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)\n"
]
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">Model: \"functional\"</span>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1mModel: \"functional\"\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
"┃<span style=\"font-weight: bold\"> Layer (type) </span>┃<span style=\"font-weight: bold\"> Output Shape </span>┃<span style=\"font-weight: bold\"> Param # </span>┃<span style=\"font-weight: bold\"> Connected to </span>┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
"│ context_input │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ - │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">InputLayer</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ question_decoder_i… │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ - │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">InputLayer</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ context_embedding │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">300</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">15,600</span> │ context_input[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>]… │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ not_equal │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ context_input[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>]… │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">NotEqual</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ question_embedding │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">300</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">15,600</span> │ question_decoder… │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ encoder_lstm (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">LSTM</span>) │ [(<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>), │ <span style=\"color: #00af00; text-decoration-color: #00af00\">570,368</span> │ context_embeddin… │\n",
"│ │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>), │ │ not_equal[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
"│ │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>)] │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ question_lstm │ [(<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>), │ <span style=\"color: #00af00; text-decoration-color: #00af00\">570,368</span> │ question_embeddi… │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">LSTM</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>), │ │ encoder_lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">…</span> │\n",
"│ │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>)] │ │ encoder_lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">…</span> │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ answer_lstm (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">LSTM</span>) │ [(<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>), │ <span style=\"color: #00af00; text-decoration-color: #00af00\">570,368</span> │ context_embeddin… │\n",
"│ │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>), │ │ encoder_lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">…</span> │\n",
"│ │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>)] │ │ encoder_lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">…</span> │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ dense (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">128</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">32,896</span> │ encoder_lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">…</span> │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ ner_lstm │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">512</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">1,140,736</span> │ context_embeddin… │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Bidirectional</span>) │ │ │ not_equal[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ question_output │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">52</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">13,364</span> │ question_lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>]… │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ answer_output │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">52</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">13,364</span> │ answer_lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ question_type_outp… │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">3</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">387</span> │ dense[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ ner_output │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">8</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">4,104</span> │ ner_lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>], │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">TimeDistributed</span>) │ │ │ not_equal[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
"└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n",
"</pre>\n"
],
"text/plain": [
"┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
"┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mConnected to \u001b[0m\u001b[1m \u001b[0m┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
"│ context_input │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
"│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ question_decoder_i… │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
"│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ context_embedding │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m300\u001b[0m) │ \u001b[38;5;34m15,600\u001b[0m │ context_input[\u001b[38;5;34m0\u001b[0m]… │\n",
"│ (\u001b[38;5;33mEmbedding\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ not_equal │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ context_input[\u001b[38;5;34m0\u001b[0m]… │\n",
"│ (\u001b[38;5;33mNotEqual\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ question_embedding │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m300\u001b[0m) │ \u001b[38;5;34m15,600\u001b[0m │ question_decoder… │\n",
"│ (\u001b[38;5;33mEmbedding\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ encoder_lstm (\u001b[38;5;33mLSTM\u001b[0m) │ [(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m), │ \u001b[38;5;34m570,368\u001b[0m │ context_embeddin… │\n",
"│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m), │ │ not_equal[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m)] │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ question_lstm │ [(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m256\u001b[0m), │ \u001b[38;5;34m570,368\u001b[0m │ question_embeddi… │\n",
"│ (\u001b[38;5;33mLSTM\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m), │ │ encoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n",
"│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m)] │ │ encoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ answer_lstm (\u001b[38;5;33mLSTM\u001b[0m) │ [(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m256\u001b[0m), │ \u001b[38;5;34m570,368\u001b[0m │ context_embeddin… │\n",
"│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m), │ │ encoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n",
"│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m)] │ │ encoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ dense (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m32,896\u001b[0m │ encoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ ner_lstm │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m512\u001b[0m) │ \u001b[38;5;34m1,140,736\u001b[0m │ context_embeddin… │\n",
"│ (\u001b[38;5;33mBidirectional\u001b[0m) │ │ │ not_equal[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ question_output │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m52\u001b[0m) │ \u001b[38;5;34m13,364\u001b[0m │ question_lstm[\u001b[38;5;34m0\u001b[0m]… │\n",
"│ (\u001b[38;5;33mDense\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ answer_output │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m52\u001b[0m) │ \u001b[38;5;34m13,364\u001b[0m │ answer_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ (\u001b[38;5;33mDense\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ question_type_outp… │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m3\u001b[0m) │ \u001b[38;5;34m387\u001b[0m │ dense[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ (\u001b[38;5;33mDense\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ ner_output │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m8\u001b[0m) │ \u001b[38;5;34m4,104\u001b[0m │ ner_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m], │\n",
"│ (\u001b[38;5;33mTimeDistributed\u001b[0m) │ │ │ not_equal[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Total params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">2,947,155</span> (11.24 MB)\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1m Total params: \u001b[0m\u001b[38;5;34m2,947,155\u001b[0m (11.24 MB)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">2,947,155</span> (11.24 MB)\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m2,947,155\u001b[0m (11.24 MB)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Non-trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> (0.00 B)\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/10\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-03-23 15:07:06.004502: E tensorflow/core/util/util.cc:131] oneDNN supports DT_BOOL only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 8s/step - answer_output_accuracy: 0.0000e+00 - answer_output_loss: 3.9473 - loss: 11.0828 - ner_output_accuracy: 0.0800 - ner_output_loss: 2.0766 - question_output_accuracy: 0.0400 - question_output_loss: 3.9452 - question_type_output_accuracy: 0.0000e+00 - question_type_output_loss: 1.1138 - val_answer_output_accuracy: 0.3200 - val_answer_output_loss: 3.9260 - val_loss: 11.0343 - val_ner_output_accuracy: 0.8600 - val_ner_output_loss: 2.0489 - val_question_output_accuracy: 0.0000e+00 - val_question_output_loss: 3.9441 - val_question_type_output_accuracy: 0.0000e+00 - val_question_type_output_loss: 1.1153\n",
"Epoch 2/10\n",
"\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 300ms/step - answer_output_accuracy: 0.6800 - answer_output_loss: 3.8844 - loss: 10.8637 - ner_output_accuracy: 0.7800 - ner_output_loss: 2.0194 - question_output_accuracy: 0.0800 - question_output_loss: 3.9047 - question_type_output_accuracy: 1.0000 - question_type_output_loss: 1.0550 - val_answer_output_accuracy: 0.5800 - val_answer_output_loss: 3.8962 - val_loss: 10.9915 - val_ner_output_accuracy: 0.8600 - val_ner_output_loss: 2.0227 - val_question_output_accuracy: 0.0000e+00 - val_question_output_loss: 3.9453 - val_question_type_output_accuracy: 0.0000e+00 - val_question_type_output_loss: 1.1273\n",
"Epoch 3/10\n",
"\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 300ms/step - answer_output_accuracy: 0.9000 - answer_output_loss: 3.8076 - loss: 10.6189 - ner_output_accuracy: 0.7800 - ner_output_loss: 1.9522 - question_output_accuracy: 0.0800 - question_output_loss: 3.8585 - question_type_output_accuracy: 1.0000 - question_type_output_loss: 1.0005 - val_answer_output_accuracy: 0.9800 - val_answer_output_loss: 3.8543 - val_loss: 10.9334 - val_ner_output_accuracy: 0.8600 - val_ner_output_loss: 1.9867 - val_question_output_accuracy: 0.0000e+00 - val_question_output_loss: 3.9469 - val_question_type_output_accuracy: 0.0000e+00 - val_question_type_output_loss: 1.1455\n",
"Epoch 4/10\n",
"\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 340ms/step - answer_output_accuracy: 0.9400 - answer_output_loss: 3.6877 - loss: 10.2657 - ner_output_accuracy: 0.7800 - ner_output_loss: 1.8489 - question_output_accuracy: 0.0600 - question_output_loss: 3.8010 - question_type_output_accuracy: 1.0000 - question_type_output_loss: 0.9281 - val_answer_output_accuracy: 0.9800 - val_answer_output_loss: 3.7881 - val_loss: 10.8457 - val_ner_output_accuracy: 0.8600 - val_ner_output_loss: 1.9324 - val_question_output_accuracy: 0.0000e+00 - val_question_output_loss: 3.9492 - val_question_type_output_accuracy: 0.0000e+00 - val_question_type_output_loss: 1.1760\n",
"Epoch 5/10\n",
"\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 330ms/step - answer_output_accuracy: 0.9400 - answer_output_loss: 3.4683 - loss: 9.6920 - ner_output_accuracy: 0.7800 - ner_output_loss: 1.6792 - question_output_accuracy: 0.0600 - question_output_loss: 3.7188 - question_type_output_accuracy: 1.0000 - question_type_output_loss: 0.8258 - val_answer_output_accuracy: 0.9800 - val_answer_output_loss: 3.6742 - val_loss: 10.7083 - val_ner_output_accuracy: 0.8600 - val_ner_output_loss: 1.8475 - val_question_output_accuracy: 0.0000e+00 - val_question_output_loss: 3.9535 - val_question_type_output_accuracy: 0.0000e+00 - val_question_type_output_loss: 1.2331\n",
"Epoch 6/10\n",
"\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 344ms/step - answer_output_accuracy: 0.9400 - answer_output_loss: 2.9986 - loss: 8.6406 - ner_output_accuracy: 0.7800 - ner_output_loss: 1.3997 - question_output_accuracy: 0.0400 - question_output_loss: 3.5829 - question_type_output_accuracy: 1.0000 - question_type_output_loss: 0.6593 - val_answer_output_accuracy: 0.9800 - val_answer_output_loss: 3.4580 - val_loss: 10.4731 - val_ner_output_accuracy: 0.8600 - val_ner_output_loss: 1.7102 - val_question_output_accuracy: 0.0000e+00 - val_question_output_loss: 3.9628 - val_question_type_output_accuracy: 0.0000e+00 - val_question_type_output_loss: 1.3420\n",
"Epoch 7/10\n",
"\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 319ms/step - answer_output_accuracy: 0.9400 - answer_output_loss: 1.9364 - loss: 6.7078 - ner_output_accuracy: 0.7800 - ner_output_loss: 1.0844 - question_output_accuracy: 0.0400 - question_output_loss: 3.3048 - question_type_output_accuracy: 1.0000 - question_type_output_loss: 0.3822 - val_answer_output_accuracy: 0.9800 - val_answer_output_loss: 2.9410 - val_loss: 10.0188 - val_ner_output_accuracy: 0.8600 - val_ner_output_loss: 1.5038 - val_question_output_accuracy: 0.0000e+00 - val_question_output_loss: 3.9871 - val_question_type_output_accuracy: 0.0000e+00 - val_question_type_output_loss: 1.5870\n",
"Epoch 8/10\n",
"\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 318ms/step - answer_output_accuracy: 0.9600 - answer_output_loss: 0.9184 - loss: 4.9883 - ner_output_accuracy: 0.7800 - ner_output_loss: 1.3771 - question_output_accuracy: 0.0400 - question_output_loss: 2.6239 - question_type_output_accuracy: 1.0000 - question_type_output_loss: 0.0690 - val_answer_output_accuracy: 0.9800 - val_answer_output_loss: 1.7714 - val_loss: 9.6522 - val_ner_output_accuracy: 0.8600 - val_ner_output_loss: 1.4667 - val_question_output_accuracy: 0.0000e+00 - val_question_output_loss: 4.0805 - val_question_type_output_accuracy: 0.0000e+00 - val_question_type_output_loss: 2.3336\n",
"Epoch 9/10\n",
"\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 286ms/step - answer_output_accuracy: 0.9600 - answer_output_loss: 0.4511 - loss: 3.5983 - ner_output_accuracy: 0.7800 - ner_output_loss: 1.3641 - question_output_accuracy: 0.0400 - question_output_loss: 1.7815 - question_type_output_accuracy: 1.0000 - question_type_output_loss: 0.0015 - val_answer_output_accuracy: 0.9800 - val_answer_output_loss: 0.8089 - val_loss: 11.4588 - val_ner_output_accuracy: 0.8600 - val_ner_output_loss: 1.5131 - val_question_output_accuracy: 0.0000e+00 - val_question_output_loss: 4.6062 - val_question_type_output_accuracy: 0.0000e+00 - val_question_type_output_loss: 4.5306\n",
"Epoch 10/10\n",
"\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 304ms/step - answer_output_accuracy: 0.9400 - answer_output_loss: 0.3244 - loss: 2.9690 - ner_output_accuracy: 0.7800 - ner_output_loss: 1.1538 - question_output_accuracy: 0.0600 - question_output_loss: 1.4906 - question_type_output_accuracy: 1.0000 - question_type_output_loss: 1.7498e-04 - val_answer_output_accuracy: 0.9800 - val_answer_output_loss: 0.3998 - val_loss: 16.2049 - val_ner_output_accuracy: 0.8600 - val_ner_output_loss: 1.5880 - val_question_output_accuracy: 0.0000e+00 - val_question_output_loss: 6.0197 - val_question_type_output_accuracy: 0.0000e+00 - val_question_type_output_loss: 8.1974\n"
]
}
],
"source": [
"# ----------------------------\n",
"# Parameter Model\n",
"# ----------------------------\n",
"VOCAB_SIZE = len(tokenizer.word_index) + 1\n",
"EMBEDDING_DIM = 300\n",
"LSTM_UNITS = 256\n",
"BATCH_SIZE = 16\n",
"EPOCHS = 10\n",
"NUM_NER_TAGS = len(ner2idx)\n",
"\n",
"# ----------------------------\n",
"# Arsitektur Model Multi-Output\n",
"# ----------------------------\n",
"\n",
"# Encoder: Input context\n",
"context_input = Input(shape=(MAX_LENGTH,), name=\"context_input\")\n",
"context_embedding = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, mask_zero=True, name=\"context_embedding\")(context_input)\n",
"encoder_lstm = LSTM(LSTM_UNITS, return_state=True, name=\"encoder_lstm\")\n",
"encoder_output, state_h, state_c = encoder_lstm(context_embedding)\n",
"\n",
"# Branch untuk pembuatan soal (Question Decoder)\n",
"question_decoder_input = Input(shape=(MAX_LENGTH,), name=\"question_decoder_input\")\n",
"question_embedding = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, mask_zero=True, name=\"question_embedding\")(question_decoder_input)\n",
"question_lstm = LSTM(LSTM_UNITS, return_sequences=True, return_state=True, name=\"question_lstm\")\n",
"question_output, _, _ = question_lstm(question_embedding, initial_state=[state_h, state_c])\n",
"question_dense = Dense(VOCAB_SIZE, activation=\"softmax\", name=\"question_output\")(question_output)\n",
"\n",
"# Branch untuk pembuatan jawaban (Answer Decoder)\n",
"answer_lstm = LSTM(LSTM_UNITS, return_sequences=True, return_state=True, name=\"answer_lstm\")\n",
"answer_output, _, _ = answer_lstm(context_embedding, initial_state=[state_h, state_c])\n",
"answer_dense = Dense(VOCAB_SIZE, activation=\"softmax\", name=\"answer_output\")(answer_output)\n",
"\n",
"# Branch untuk klasifikasi tipe soal\n",
"type_dense = Dense(128, activation=\"relu\")(encoder_output)\n",
"question_type_output = Dense(3, activation=\"softmax\", name=\"question_type_output\")(type_dense)\n",
"\n",
"# Branch untuk NER: Menggunakan context_embedding untuk melakukan sequence tagging\n",
"ner_lstm = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True, recurrent_dropout=0.1), name=\"ner_lstm\")(context_embedding)\n",
"ner_output = TimeDistributed(Dense(NUM_NER_TAGS, activation=\"softmax\"), name=\"ner_output\")(ner_lstm)\n",
"\n",
"# Gabungkan semua branch dalam satu model multi-output\n",
"model = Model(\n",
" inputs=[context_input, question_decoder_input],\n",
" outputs=[question_dense, answer_dense, question_type_output, ner_output]\n",
")\n",
"\n",
"model.compile(\n",
" optimizer=\"adam\",\n",
" loss={\n",
" \"question_output\": \"sparse_categorical_crossentropy\",\n",
" \"answer_output\": \"sparse_categorical_crossentropy\",\n",
" \"question_type_output\": \"sparse_categorical_crossentropy\",\n",
" \"ner_output\": \"sparse_categorical_crossentropy\"\n",
" },\n",
" metrics={\n",
" \"question_output\": [\"accuracy\"],\n",
" \"answer_output\": [\"accuracy\"],\n",
" \"question_type_output\": [\"accuracy\"],\n",
" \"ner_output\": [\"accuracy\"]\n",
" }\n",
")\n",
"\n",
"model.summary()\n",
"\n",
"# ----------------------------\n",
"# Training Model\n",
"# ----------------------------\n",
"model.fit(\n",
" [context_train, question_train],\n",
" {\n",
" \"question_output\": question_train,\n",
" \"answer_output\": answer_train,\n",
" \"question_type_output\": qtype_train,\n",
" \"ner_output\": ner_train\n",
" },\n",
" batch_size=BATCH_SIZE,\n",
" epochs=EPOCHS,\n",
" validation_data=(\n",
" [context_val, question_val],\n",
" {\n",
" \"question_output\": question_val,\n",
" \"answer_output\": answer_val,\n",
" \"question_type_output\": qtype_val,\n",
" \"ner_output\": ner_val\n",
" }\n",
" )\n",
")\n",
"\n",
"# Simpan model dan tokenizer bila diperlukan\n",
"model.save(\"lstm_multi_output_ner_model.keras\")\n",
"with open(\"tokenizer.pkl\", \"wb\") as handle:\n",
" pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "myenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

Binary file not shown.

File diff suppressed because one or more lines are too long

uji.py (286 lines changed)

@@ -1,163 +1,163 @@
-import numpy as np
-import pickle
-import tensorflow as tf
-from tensorflow.keras.preprocessing.sequence import pad_sequences
-import nltk
-import random
-import string
-import re
-from nltk.tokenize import word_tokenize
-from nltk.corpus import stopwords
+# import numpy as np
+# import pickle
+# import tensorflow as tf
+# from tensorflow.keras.preprocessing.sequence import pad_sequences
+# import nltk
+# import random
+# import string
+# import re
+# from nltk.tokenize import word_tokenize
+# from nltk.corpus import stopwords
-# Ensure NLTK resources are available
-nltk.download("punkt")
-nltk.download("stopwords")
+# # Ensure NLTK resources are available
+# nltk.download("punkt")
+# nltk.download("stopwords")
-class QuestionGenerator:
-    def __init__(
-        self, model_path="lstm_multi_output_model.keras", tokenizer_path="tokenizer.pkl"
-    ):
-        """
-        Initializes the QuestionGenerator by loading the trained model and tokenizer.
-        """
-        # Load trained model
-        self.model = tf.keras.models.load_model(model_path)
+# class QuestionGenerator:
+#     def __init__(
+#         self, model_path="lstm_multi_output_model.keras", tokenizer_path="tokenizer.pkl"
+#     ):
+#         """
+#         Initializes the QuestionGenerator by loading the trained model and tokenizer.
+#         """
+#         # Load trained model
+#         self.model = tf.keras.models.load_model(model_path)
-        # Load tokenizer
-        with open(tokenizer_path, "rb") as handle:
-            self.tokenizer = pickle.load(handle)
+#         # Load tokenizer
+#         with open(tokenizer_path, "rb") as handle:
+#             self.tokenizer = pickle.load(handle)
-        # Define question type mapping
-        self.question_type_dict = {
-            0: "fill_in_the_blank",
-            1: "true_false",
-            2: "multiple_choice",
-        }
+#         # Define question type mapping
+#         self.question_type_dict = {
+#             0: "fill_in_the_blank",
+#             1: "true_false",
+#             2: "multiple_choice",
+#         }
-        # Load Indonesian stopwords
-        self.stop_words = set(stopwords.words("indonesian"))
+#         # Load Indonesian stopwords
+#         self.stop_words = set(stopwords.words("indonesian"))
-        # Custom word normalization dictionary
-        self.normalization_dict = {
-            "yg": "yang",
-            "gokil": "kocak",
-            "kalo": "kalau",
-            "gue": "saya",
-            "elo": "kamu",
-            "nih": "ini",
-            "trs": "terus",
-            "tdk": "tidak",
-            "gmna": "bagaimana",
-            "tp": "tapi",
-            "jd": "jadi",
-            "aja": "saja",
-            "krn": "karena",
-            "blm": "belum",
-            "dgn": "dengan",
-            "skrg": "sekarang",
-            "msh": "masih",
-            "lg": "lagi",
-            "sy": "saya",
-            "sm": "sama",
-            "bgt": "banget",
-            "dr": "dari",
-            "kpn": "kapan",
-            "hrs": "harus",
-            "cm": "cuma",
-            "sbnrnya": "sebenarnya",
-        }
+#         # Custom word normalization dictionary
+#         self.normalization_dict = {
+#             "yg": "yang",
+#             "gokil": "kocak",
+#             "kalo": "kalau",
+#             "gue": "saya",
+#             "elo": "kamu",
+#             "nih": "ini",
+#             "trs": "terus",
+#             "tdk": "tidak",
+#             "gmna": "bagaimana",
+#             "tp": "tapi",
+#             "jd": "jadi",
+#             "aja": "saja",
+#             "krn": "karena",
+#             "blm": "belum",
+#             "dgn": "dengan",
+#             "skrg": "sekarang",
+#             "msh": "masih",
+#             "lg": "lagi",
+#             "sy": "saya",
+#             "sm": "sama",
+#             "bgt": "banget",
+#             "dr": "dari",
+#             "kpn": "kapan",
+#             "hrs": "harus",
+#             "cm": "cuma",
+#             "sbnrnya": "sebenarnya",
+#         }
-    def preprocess_text(self, text):
-        """
-        Preprocesses the input text by:
-        - Converting to lowercase
-        - Removing punctuation
-        - Tokenizing
-        - Normalizing words
-        - Removing stopwords
-        """
-        text = text.lower()
-        text = text.translate(
-            str.maketrans("", "", string.punctuation)
-        )  # Remove punctuation
-        text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
-        tokens = word_tokenize(text)  # Tokenization
-        tokens = [
-            self.normalization_dict.get(word, word) for word in tokens
-        ]  # Normalize words
-        tokens = [
-            word for word in tokens if word not in self.stop_words
-        ]  # Remove stopwords
-        return " ".join(tokens)
+#     def preprocess_text(self, text):
+#         """
+#         Preprocesses the input text by:
+#         - Converting to lowercase
+#         - Removing punctuation
+#         - Tokenizing
+#         - Normalizing words
+#         - Removing stopwords
+#         """
+#         text = text.lower()
+#         text = text.translate(
+#             str.maketrans("", "", string.punctuation)
+#         )  # Remove punctuation
+#         text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
+#         tokens = word_tokenize(text)  # Tokenization
+#         tokens = [
+#             self.normalization_dict.get(word, word) for word in tokens
+#         ]  # Normalize words
+#         tokens = [
+#             word for word in tokens if word not in self.stop_words
+#         ]  # Remove stopwords
+#         return " ".join(tokens)
-    def sequence_to_text(self, sequence):
-        """
-        Converts a tokenized sequence back into readable text.
-        """
-        return " ".join(
-            [
-                self.tokenizer.index_word.get(idx, "<OOV>")
-                for idx in sequence
-                if idx != 0
-            ]
-        )
+#     def sequence_to_text(self, sequence):
+#         """
+#         Converts a tokenized sequence back into readable text.
+#         """
+#         return " ".join(
+#             [
+#                 self.tokenizer.index_word.get(idx, "<OOV>")
+#                 for idx in sequence
+#                 if idx != 0
+#             ]
+#         )
-    def generate_qa_from_paragraph(self, paragraph):
-        """
-        Generates a question, answer, and question type from the given paragraph.
-        If it's a multiple-choice question, it also returns answer options.
-        """
-        # Preprocess the input paragraph
-        processed_paragraph = self.preprocess_text(paragraph)
+#     def generate_qa_from_paragraph(self, paragraph):
+#         """
+#         Generates a question, answer, and question type from the given paragraph.
+#         If it's a multiple-choice question, it also returns answer options.
+#         """
+#         # Preprocess the input paragraph
+#         processed_paragraph = self.preprocess_text(paragraph)
-        # Convert text to sequence
-        input_seq = self.tokenizer.texts_to_sequences([processed_paragraph])
-        input_seq = pad_sequences(input_seq, maxlen=100, padding="post")
+#         # Convert text to sequence
+#         input_seq = self.tokenizer.texts_to_sequences([processed_paragraph])
+#         input_seq = pad_sequences(input_seq, maxlen=100, padding="post")
-        # Predict question, answer, and type
-        pred_question, pred_answer, pred_qtype = self.model.predict(
-            [input_seq, input_seq]
-        )
+#         # Predict question, answer, and type
+#         pred_question, pred_answer, pred_qtype = self.model.predict(
+#             [input_seq, input_seq]
+#         )
-        # Decode predictions
-        generated_question = self.sequence_to_text(np.argmax(pred_question[0], axis=-1))
-        generated_answer = self.sequence_to_text(np.argmax(pred_answer[0], axis=-1))
-        question_type_index = np.argmax(pred_qtype[0])
-        generated_qtype = self.question_type_dict[question_type_index]
+#         # Decode predictions
+#         generated_question = self.sequence_to_text(np.argmax(pred_question[0], axis=-1))
+#         generated_answer = self.sequence_to_text(np.argmax(pred_answer[0], axis=-1))
+#         question_type_index = np.argmax(pred_qtype[0])
+#         generated_qtype = self.question_type_dict[question_type_index]
-        # Handle multiple-choice options
-        options = None
-        if generated_qtype == "multiple_choice":
-            words = processed_paragraph.split()
-            random.shuffle(words)
-            distractors = [
-                word for word in words if word.lower() != generated_answer.lower()
-            ]
-            options = [generated_answer] + distractors[:3]
-            random.shuffle(options)  # Shuffle options
+#         # Handle multiple-choice options
+#         options = None
+#         if generated_qtype == "multiple_choice":
+#             words = processed_paragraph.split()
+#             random.shuffle(words)
+#             distractors = [
+#                 word for word in words if word.lower() != generated_answer.lower()
+#             ]
+#             options = [generated_answer] + distractors[:3]
+#             random.shuffle(options)  # Shuffle options
-        # Return the generated data
-        return {
-            "generated_question": generated_question,
-            "generated_answer": generated_answer,
-            "question_type": generated_qtype,
-            "options": options if generated_qtype == "multiple_choice" else None,
-        }
+#         # Return the generated data
+#         return {
+#             "generated_question": generated_question,
+#             "generated_answer": generated_answer,
+#             "question_type": generated_qtype,
+#             "options": options if generated_qtype == "multiple_choice" else None,
+#         }
-# Initialize the question generator
-qg = QuestionGenerator()
+# # Initialize the question generator
+# qg = QuestionGenerator()
-# Example input paragraph
-sample_paragraph = "Samudra Pasifik adalah yang terbesar dan terdalam di antara divisi samudra di Bumi. Samudra ini membentang dari Samudra Arktik di utara hingga Samudra Selatan di selatan dan berbatasan dengan Asia dan Australia di barat serta Amerika di timur."
+# # Example input paragraph
+# sample_paragraph = "Samudra Pasifik adalah yang terbesar dan terdalam di antara divisi samudra di Bumi. Samudra ini membentang dari Samudra Arktik di utara hingga Samudra Selatan di selatan dan berbatasan dengan Asia dan Australia di barat serta Amerika di timur."
-# Generate question, answer, and type
-generated_result = qg.generate_qa_from_paragraph(sample_paragraph)
+# # Generate question, answer, and type
+# generated_result = qg.generate_qa_from_paragraph(sample_paragraph)
-# Print output
-print("Generated Question:", generated_result["generated_question"])
-print("Generated Answer:", generated_result["generated_answer"])
-print("Question Type:", generated_result["question_type"])
-if generated_result["options"]:
-    print("Options:", generated_result["options"])
+# # Print output
+# print("Generated Question:", generated_result["generated_question"])
+# print("Generated Answer:", generated_result["generated_answer"])
+# print("Question Type:", generated_result["question_type"])
+# if generated_result["options"]:
+#     print("Options:", generated_result["options"])