Compare commits

...

10 Commits

89 changed files with 371059 additions and 1210 deletions

.gitignore vendored (3 changed lines)

@@ -3,5 +3,8 @@ myenv
*keras*
**/*keras*
*h5*
**/*h5*
# Ignore all files with the .pkl extension
*.pkl

Binary file not shown.

Image diff: 66 KiB (before) → 62 KiB (after)

File diff suppressed because one or more lines are too long


@@ -8,20 +8,97 @@ from pathlib import Path
# Valid NER labels (adjust as needed)
VALID_NER_LABELS = {
"O",
"B-LOC", "I-LOC",
"B-PER", "I-PER",
"B-ORG", "I-ORG",
"B-DATE", "I-DATE",
"B-TIME", "I-TIME",
"B-EVENT", "I-EVENT",
"LOC",
"PER",
"ORG",
"DATE",
"TIME",
"EVENT",
"MISC",
}
# Valid NER labels (adjust as needed); this plain-label set overrides the BIO set above
VALID_NER_LABELS = {"O", "LOC", "PER", "ORG", "DATE", "TIME", "EVENT", "MISC"}
# Valid SRL labels
VALID_SRL_LABELS = {
"ARG0",
"ARG1",
"ARG2",
"ARG3",
"ARGM-TMP",
"ARGM-LOC",
"ARGM-CAU",
"ARGM-MNR",
"ARGM-MOD",
"ARGM-NEG",
"V",
"O",
}
# def json_to_tsv(json_path: str | Path, tsv_path: str | Path) -> None:
# with open(json_path, encoding="utf-8") as f:
# records = json.load(f)
# seen_sentences: set[tuple[str, ...]] = set()
# with open(tsv_path, "w", encoding="utf-8", newline="") as f_out:
# writer = csv.writer(f_out, delimiter="\t", lineterminator="\n")
# for idx, rec in enumerate(records):
# context = rec.get("context")
# tokens = rec.get("tokens")
# ner_tags = rec.get("ner")
# srl_tags = rec.get("srl")
# if not (len(tokens) == len(ner_tags) == len(srl_tags)):
# raise ValueError(
# f"❌ Panjang tidak sama di record index {idx}:\n"
# f" context ({len(contexxt)}): {contexxt}\n"
# f" tokens ({len(tokens)}): {tokens}\n"
# f" ner ({len(ner_tags)}): {ner_tags}\n"
# f" srl ({len(srl_tags)}): {srl_tags}\n"
# )
# # Validate NER labels
# for i, ner_label in enumerate(ner_tags):
# if ner_label not in VALID_NER_LABELS:
# raise ValueError(
# f"❌ Label NER tidak valid di record index {idx}, token ke-{i} ('{tokens[i]}'):\n"
# f" ner_label: {ner_label}\n"
# f" value: {tokens}"
# )
# # Validate SRL labels
# for i, srl_label in enumerate(srl_tags):
# if srl_label not in VALID_SRL_LABELS:
# raise ValueError(
# f"❌ Label SRL tidak valid di record index {idx}, token ke-{i} ('{tokens[i]}'):\n"
# f" srl_label: {srl_label}\n"
# f" value: {tokens}"
# )
# key = tuple(tokens)
# if key in seen_sentences:
# continue
# seen_sentences.add(key)
# for tok, ner, srl in zip(tokens, ner_tags, srl_tags):
# writer.writerow([tok, ner, srl])
# writer.writerow([])
# print(f"✔️ TSV selesai, simpan di: {tsv_path}")
def json_to_tsv(json_path: str | Path, tsv_path: str | Path) -> None:
"""
Konversi data JSON (field: tokens, ner, srl, ) TSV token\tNER\tSRL.
Kalimat duplikat (urutan tokens persis sama) otomatis dilewati.
Jika ada record yang tokens, ner, dan srl tidak sama panjang, atau ada label NER tidak valid, akan diberi info error lengkap.
"""
with open(json_path, encoding="utf-8") as f:
records = json.load(f)
@@ -31,26 +108,46 @@ def json_to_tsv(json_path: str | Path, tsv_path: str | Path) -> None:
writer = csv.writer(f_out, delimiter="\t", lineterminator="\n")
for idx, rec in enumerate(records):
context = rec.get("context")
tokens = rec.get("tokens")
ner_tags = rec.get("ner")
srl_tags = rec.get("srl")
if not (len(tokens) == len(ner_tags) == len(srl_tags)):
# raise ValueError(
print(
f"❌ Length mismatch at record index {idx}:\n"
f" context: {context}\n"
f" tokens ({len(tokens)}): {tokens}\n"
f" ner ({len(ner_tags)}): {ner_tags}\n"
f" srl ({len(srl_tags)}): {srl_tags}\n"
)
continue
# Validate NER labels
invalid_ner = False
for i, ner_label in enumerate(ner_tags):
if ner_label not in VALID_NER_LABELS:
# raise ValueError(
print(
f"❌ Invalid NER label at record index {idx}, token {i} ('{tokens[i]}'):\n"
f" ner_label: {ner_label}\n"
f" value: {tokens}"
)
invalid_ner = True
break
if invalid_ner:
continue
invalid_srl = False
for i, srl_label in enumerate(srl_tags):
if srl_label not in VALID_SRL_LABELS:
print(
f"❌ Label SRL tidak valid di record index {idx}, token ke-{i} ('{tokens[i]}'):\n"
f" srl_label: {srl_label}\n"
f" value: {tokens}"
)
invalid_srl = True
break
if invalid_srl:
continue
key = tuple(tokens)
if key in seen_sentences:
@@ -118,4 +215,4 @@ def json_to_tsv(json_path: str | Path, tsv_path: str | Path) -> None:
# EXAMPLE USAGE
# ---------------------------------------------------------------------------
if __name__ == "__main__":
json_to_tsv("QC/normalize_dataset.json", "QC/new_LNS.tsv")
json_to_tsv("../dataset/stable_qg_qa_train_dataset.json", "new_LNS_2.tsv")

Binary file not shown.

Image diff: 66 KiB (before) → 55 KiB (after)

NER_SRL/new_LNS_2.tsv Normal file (6782 lines)

File diff suppressed because it is too large

Binary file not shown.

Binary file not shown.


@@ -1,8 +1,8 @@
import json
import numpy as np
import pickle
from tensorflow.keras.models import load_model  # type: ignore
from tensorflow.keras.preprocessing.sequence import pad_sequences  # type: ignore
import re
# -----------------------------
@@ -50,6 +50,6 @@ def predict_sentence(sentence: str) -> dict:
# 3. Demo
# -----------------------------
if __name__ == "__main__":
sample = "batu bata terbuat dari material tanah liat"
sample = "ngaben adalan acara pembakaran jenazah masyarakat suku bali"
result = predict_sentence(sample)
print(json.dumps(result, ensure_ascii=False, indent=2))
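The hunk above shows only the edges of this script; the body of predict_sentence is not part of the diff. Below is a minimal sketch of what such a function typically looks like, assuming the model and tokenizers saved by the training notebook; the file names, the tokenizers.pkl keys, the 20-token maxlen, and the zero-filled NER/SRL inputs are all assumptions, not the repository's actual code.

import pickle
import re

import numpy as np
from tensorflow.keras.models import load_model  # type: ignore
from tensorflow.keras.preprocessing.sequence import pad_sequences  # type: ignore

model = load_model("new_model_lstm_qg.keras")  # assumed artifact name
with open("tokenizers.pkl", "rb") as f:
    toks = pickle.load(f)

def predict_sentence(sentence: str) -> dict:
    # Lowercase and word-split, mirroring the training-side preprocessing.
    words = re.findall(r"\w+", sentence.lower())
    seq = pad_sequences(toks["token"].texts_to_sequences([words]), maxlen=20, padding="post")
    # The real script may tag NER/SRL with separate models; zeros stand in here.
    zeros = np.zeros_like(seq)
    q_pred, a_pred, t_pred = model.predict([seq, zeros, zeros], verbose=0)

    def decode(probs, tok):
        # index 0 is padding and has no entry in index_word, so it is skipped
        return [tok.index_word.get(int(i), "UNK")
                for i in np.argmax(probs[0], axis=-1) if int(i) != 0]

    return {
        "question": decode(q_pred, toks["question"]),
        "answer": decode(a_pred, toks["answer"]),
        "type": toks["type"].index_word.get(int(np.argmax(t_pred[0])) + 1, "?"),
    }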

Binary file not shown.

Binary file not shown.


@@ -1,554 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"id": "9bf2159a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-05-02 15:16:40.916818: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
"2025-05-02 15:16:40.923426: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
"2025-05-02 15:16:40.983217: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
"2025-05-02 15:16:41.024477: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
"E0000 00:00:1746173801.069646 9825 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"E0000 00:00:1746173801.081087 9825 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"W0000 00:00:1746173801.169376 9825 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1746173801.169393 9825 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1746173801.169395 9825 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1746173801.169396 9825 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"2025-05-02 15:16:41.179508: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
]
}
],
"source": [
"import json\n",
"import numpy as np\n",
"from pathlib import Path\n",
"from sklearn.model_selection import train_test_split\n",
"from tensorflow.keras.preprocessing.text import Tokenizer\n",
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
"from tensorflow.keras.utils import to_categorical\n",
"\n",
"from tensorflow.keras.models import Model\n",
"from tensorflow.keras.layers import (\n",
" Input,\n",
" Embedding,\n",
" LSTM,\n",
" Concatenate,\n",
" Dense,\n",
" TimeDistributed,\n",
")\n",
"from tensorflow.keras.callbacks import EarlyStopping\n",
"from sklearn.metrics import classification_report\n",
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "50118278",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
" Jumlah data valid: 321 / 321\n",
" Jumlah data tidak valid: 0\n",
"Counter({'ftb': 235, 'tof': 45, 'none': 41})\n"
]
}
],
"source": [
"# Load raw data\n",
"with open(\"normalize_dataset.json\", encoding=\"utf-8\") as f:\n",
" raw_data = json.load(f)\n",
"\n",
"# Validasi lengkap\n",
"required_keys = {\"tokens\", \"ner\", \"srl\", \"question\", \"answer\", \"type\"}\n",
"valid_data = []\n",
"invalid_data = []\n",
"\n",
"for idx, item in enumerate(raw_data):\n",
" error_messages = []\n",
"\n",
" if not isinstance(item, dict):\n",
" error_messages.append(\"bukan dictionary\")\n",
"\n",
" missing_keys = required_keys - item.keys()\n",
" if missing_keys:\n",
" error_messages.append(f\"missing keys: {missing_keys}\")\n",
"\n",
" if not error_messages:\n",
" # Cek tipe data dan None\n",
" if (not isinstance(item[\"tokens\"], list) or\n",
" not isinstance(item[\"ner\"], list) or\n",
" not isinstance(item[\"srl\"], list) or\n",
" not isinstance(item[\"question\"], list) or\n",
" not isinstance(item[\"answer\"], list) or\n",
" not isinstance(item[\"type\"], str)):\n",
" error_messages.append(\"field type tidak sesuai\")\n",
" \n",
" if error_messages:\n",
" print(f\"\\n Index {idx} | Masalah: {', '.join(error_messages)}\")\n",
" print(json.dumps(item, indent=2, ensure_ascii=False))\n",
" invalid_data.append(item)\n",
" continue\n",
"\n",
" valid_data.append(item)\n",
"\n",
"# Statistik\n",
"print(f\"\\n Jumlah data valid: {len(valid_data)} / {len(raw_data)}\")\n",
"print(f\" Jumlah data tidak valid: {len(invalid_data)}\")\n",
"\n",
"# Proses data valid\n",
"tokens = [[t.lower().strip() for t in item[\"tokens\"]] for item in valid_data]\n",
"ner_tags = [item[\"ner\"] for item in valid_data]\n",
"srl_tags = [item[\"srl\"] for item in valid_data]\n",
"questions = [[token.lower().strip() for token in item[\"question\"]] for item in valid_data]\n",
"answers = [[token.lower().strip() for token in item[\"answer\"]] for item in valid_data]\n",
"types = [item[\"type\"] for item in valid_data]\n",
"\n",
"type_counts = Counter(types)\n",
"\n",
"print(type_counts)\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "4e3a0088",
"metadata": {},
"outputs": [],
"source": [
"# tokenize\n",
"token_tok = Tokenizer(lower=False, oov_token=\"UNK\")\n",
"token_ner = Tokenizer(lower=False)\n",
"token_srl = Tokenizer(lower=False)\n",
"token_q = Tokenizer(lower=False)\n",
"token_a = Tokenizer(lower=False)\n",
"token_type = Tokenizer(lower=False)\n",
"\n",
"token_tok.fit_on_texts(tokens)\n",
"token_ner.fit_on_texts(ner_tags)\n",
"token_srl.fit_on_texts(srl_tags)\n",
"token_q.fit_on_texts(questions)\n",
"token_a.fit_on_texts(answers)\n",
"token_type.fit_on_texts(types)\n",
"\n",
"\n",
"maxlen = 20"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "555f9e22",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'ftb', 'tof', 'none'}\n"
]
}
],
"source": [
"\n",
"X_tok = pad_sequences(\n",
" token_tok.texts_to_sequences(tokens), padding=\"post\", maxlen=maxlen\n",
")\n",
"X_ner = pad_sequences(\n",
" token_ner.texts_to_sequences(ner_tags), padding=\"post\", maxlen=maxlen\n",
")\n",
"X_srl = pad_sequences(\n",
" token_srl.texts_to_sequences(srl_tags), padding=\"post\", maxlen=maxlen\n",
")\n",
"y_q = pad_sequences(token_q.texts_to_sequences(questions), padding=\"post\", maxlen=maxlen)\n",
"y_a = pad_sequences(token_a.texts_to_sequences(answers), padding=\"post\", maxlen=maxlen)\n",
"\n",
"print(set(types))\n",
"\n",
"y_type = [seq[0] for seq in token_type.texts_to_sequences(types)] # list of int\n",
"y_type = to_categorical(np.array(y_type) - 1, num_classes=len(token_type.word_index))\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "f530cfe7",
"metadata": {},
"outputs": [],
"source": [
"X_tok_train, X_tok_test, X_ner_train, X_ner_test, X_srl_train, X_srl_test, \\\n",
"y_q_train, y_q_test, y_a_train, y_a_test, y_type_train, y_type_test = train_test_split(\n",
" X_tok, X_ner, X_srl, y_q, y_a, y_type, test_size=0.2, random_state=42\n",
")\n",
"\n",
"X_train = [X_tok_train, X_ner_train, X_srl_train]\n",
"X_test = [X_tok_test, X_ner_test, X_srl_test]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "255e2a9a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-04-29 19:13:22.481835: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)\n"
]
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">Model: \"functional\"</span>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1mModel: \"functional\"\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
"┃<span style=\"font-weight: bold\"> Layer (type) </span>┃<span style=\"font-weight: bold\"> Output Shape </span>┃<span style=\"font-weight: bold\"> Param # </span>┃<span style=\"font-weight: bold\"> Connected to </span>┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
"│ tok_input │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ - │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">InputLayer</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ ner_input │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ - │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">InputLayer</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ srl_input │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ - │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">InputLayer</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ embedding │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">128</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">126,080</span> │ tok_input[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ embedding_1 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">16</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">352</span> │ ner_input[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ embedding_2 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">16</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">432</span> │ srl_input[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ concatenate │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">160</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ embedding[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>], │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Concatenate</span>) │ │ │ embedding_1[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>… │\n",
"│ │ │ │ embedding_2[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ lstm (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">LSTM</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">427,008</span> │ concatenate[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ get_item (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">GetItem</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ question_output │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">473</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">121,561</span> │ lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">TimeDistributed</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ answer_output │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">383</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">98,431</span> │ lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">TimeDistributed</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ type_output (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">3</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">771</span> │ get_item[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
"└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n",
"</pre>\n"
],
"text/plain": [
"┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
"┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mConnected to \u001b[0m\u001b[1m \u001b[0m┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
"│ tok_input │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
"│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ ner_input │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
"│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ srl_input │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
"│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ embedding │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m126,080\u001b[0m │ tok_input[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ (\u001b[38;5;33mEmbedding\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ embedding_1 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m16\u001b[0m) │ \u001b[38;5;34m352\u001b[0m │ ner_input[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ (\u001b[38;5;33mEmbedding\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ embedding_2 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m16\u001b[0m) │ \u001b[38;5;34m432\u001b[0m │ srl_input[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ (\u001b[38;5;33mEmbedding\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ concatenate │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m160\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ embedding[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m], │\n",
"│ (\u001b[38;5;33mConcatenate\u001b[0m) │ │ │ embedding_1[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m… │\n",
"│ │ │ │ embedding_2[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ lstm (\u001b[38;5;33mLSTM\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m427,008\u001b[0m │ concatenate[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ get_item (\u001b[38;5;33mGetItem\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ question_output │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m473\u001b[0m) │ \u001b[38;5;34m121,561\u001b[0m │ lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ (\u001b[38;5;33mTimeDistributed\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ answer_output │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m383\u001b[0m) │ \u001b[38;5;34m98,431\u001b[0m │ lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ (\u001b[38;5;33mTimeDistributed\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ type_output (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m3\u001b[0m) │ \u001b[38;5;34m771\u001b[0m │ get_item[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Total params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">774,635</span> (2.95 MB)\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1m Total params: \u001b[0m\u001b[38;5;34m774,635\u001b[0m (2.95 MB)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">774,635</span> (2.95 MB)\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m774,635\u001b[0m (2.95 MB)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Non-trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> (0.00 B)\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/30\n",
"\u001b[1m7/7\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 101ms/step - answer_output_accuracy: 0.5626 - answer_output_loss: 5.7629 - loss: 12.9112 - question_output_accuracy: 0.3867 - question_output_loss: 6.0185 - type_output_accuracy: 0.5290 - type_output_loss: 1.0943 - val_answer_output_accuracy: 0.9261 - val_answer_output_loss: 3.9036 - val_loss: 9.5865 - val_question_output_accuracy: 0.7500 - val_question_output_loss: 4.5947 - val_type_output_accuracy: 0.5652 - val_type_output_loss: 1.0883\n",
"Epoch 2/30\n",
"\u001b[1m7/7\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 32ms/step - answer_output_accuracy: 0.8791 - answer_output_loss: 2.9526 - loss: 7.7800 - question_output_accuracy: 0.6837 - question_output_loss: 3.7162 - type_output_accuracy: 0.7148 - type_output_loss: 1.0672 - val_answer_output_accuracy: 0.9261 - val_answer_output_loss: 1.1139 - val_loss: 4.1230 - val_question_output_accuracy: 0.7500 - val_question_output_loss: 1.9489 - val_type_output_accuracy: 0.5652 - val_type_output_loss: 1.0601\n",
"Epoch 3/30\n",
"\u001b[1m7/7\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 34ms/step - answer_output_accuracy: 0.8726 - answer_output_loss: 1.2047 - loss: 4.4213 - question_output_accuracy: 0.6797 - question_output_loss: 2.2016 - type_output_accuracy: 0.7251 - type_output_loss: 1.0092 - val_answer_output_accuracy: 0.9261 - val_answer_output_loss: 0.7679 - val_loss: 3.7423 - val_question_output_accuracy: 0.7500 - val_question_output_loss: 1.9604 - val_type_output_accuracy: 0.5652 - val_type_output_loss: 1.0140\n",
"Epoch 4/30\n",
"\u001b[1m7/7\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 32ms/step - answer_output_accuracy: 0.8633 - answer_output_loss: 1.1478 - loss: 4.4374 - question_output_accuracy: 0.6639 - question_output_loss: 2.3671 - type_output_accuracy: 0.7490 - type_output_loss: 0.9088 - val_answer_output_accuracy: 0.9261 - val_answer_output_loss: 0.7059 - val_loss: 3.6255 - val_question_output_accuracy: 0.7500 - val_question_output_loss: 1.9356 - val_type_output_accuracy: 0.5652 - val_type_output_loss: 0.9840\n",
"Epoch 5/30\n",
"\u001b[1m7/7\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 33ms/step - answer_output_accuracy: 0.8783 - answer_output_loss: 1.0187 - loss: 4.0230 - question_output_accuracy: 0.6760 - question_output_loss: 2.1959 - type_output_accuracy: 0.7563 - type_output_loss: 0.8131 - val_answer_output_accuracy: 0.9261 - val_answer_output_loss: 0.6848 - val_loss: 3.5743 - val_question_output_accuracy: 0.7500 - val_question_output_loss: 1.9039 - val_type_output_accuracy: 0.5652 - val_type_output_loss: 0.9857\n",
"Epoch 6/30\n",
"\u001b[1m7/7\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 33ms/step - answer_output_accuracy: 0.8800 - answer_output_loss: 0.9845 - loss: 3.8171 - question_output_accuracy: 0.6878 - question_output_loss: 2.0357 - type_output_accuracy: 0.7328 - type_output_loss: 0.7942 - val_answer_output_accuracy: 0.9261 - val_answer_output_loss: 0.6742 - val_loss: 3.5592 - val_question_output_accuracy: 0.7500 - val_question_output_loss: 1.8777 - val_type_output_accuracy: 0.5652 - val_type_output_loss: 1.0074\n",
"Epoch 7/30\n",
"\u001b[1m7/7\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 32ms/step - answer_output_accuracy: 0.8768 - answer_output_loss: 0.9756 - loss: 3.8569 - question_output_accuracy: 0.6743 - question_output_loss: 2.0795 - type_output_accuracy: 0.7030 - type_output_loss: 0.8039 - val_answer_output_accuracy: 0.9261 - val_answer_output_loss: 0.6769 - val_loss: 3.5671 - val_question_output_accuracy: 0.7500 - val_question_output_loss: 1.8631 - val_type_output_accuracy: 0.5652 - val_type_output_loss: 1.0272\n",
"Epoch 8/30\n",
"\u001b[1m7/7\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 32ms/step - answer_output_accuracy: 0.8814 - answer_output_loss: 0.9217 - loss: 3.7726 - question_output_accuracy: 0.6798 - question_output_loss: 2.0253 - type_output_accuracy: 0.6785 - type_output_loss: 0.8194 - val_answer_output_accuracy: 0.9261 - val_answer_output_loss: 0.6900 - val_loss: 3.5722 - val_question_output_accuracy: 0.7500 - val_question_output_loss: 1.8469 - val_type_output_accuracy: 0.5652 - val_type_output_loss: 1.0354\n",
"Epoch 9/30\n",
"\u001b[1m7/7\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 31ms/step - answer_output_accuracy: 0.8703 - answer_output_loss: 0.9799 - loss: 3.6985 - question_output_accuracy: 0.6843 - question_output_loss: 1.9755 - type_output_accuracy: 0.7160 - type_output_loss: 0.7474 - val_answer_output_accuracy: 0.9261 - val_answer_output_loss: 0.6958 - val_loss: 3.5849 - val_question_output_accuracy: 0.7500 - val_question_output_loss: 1.8401 - val_type_output_accuracy: 0.5652 - val_type_output_loss: 1.0490\n"
]
}
],
"source": [
"\n",
"inp_tok = Input(shape=(None,), name=\"tok_input\")\n",
"inp_ner = Input(shape=(None,), name=\"ner_input\")\n",
"inp_srl = Input(shape=(None,), name=\"srl_input\")\n",
"\n",
"emb_tok = Embedding(input_dim=len(token_tok.word_index) + 1, output_dim=128)(inp_tok)\n",
"emb_ner = Embedding(input_dim=len(token_ner.word_index) + 1, output_dim=16)(inp_ner)\n",
"emb_srl = Embedding(input_dim=len(token_srl.word_index) + 1, output_dim=16)(inp_srl)\n",
"\n",
"# emb_tok = Embedding(input_dim=..., output_dim=..., mask_zero=True)(inp_tok)\n",
"# emb_ner = Embedding(input_dim=..., output_dim=..., mask_zero=True)(inp_ner)\n",
"# emb_srl = Embedding(input_dim=..., output_dim=..., mask_zero=True)(inp_srl)\n",
"\n",
"merged = Concatenate()([emb_tok, emb_ner, emb_srl])\n",
"\n",
"x = LSTM(256, return_sequences=True)(merged)\n",
"\n",
"out_question = TimeDistributed(Dense(len(token_q.word_index) + 1, activation=\"softmax\"), name=\"question_output\")(x)\n",
"out_answer = TimeDistributed(Dense(len(token_a.word_index) + 1, activation=\"softmax\"), name=\"answer_output\")(x)\n",
"out_type = Dense(len(token_type.word_index), activation=\"softmax\", name=\"type_output\")(\n",
" x[:, 0, :]\n",
") # gunakan step pertama\n",
"\n",
"model = Model(\n",
" inputs=[inp_tok, inp_ner, inp_srl], outputs=[out_question, out_answer, out_type]\n",
")\n",
"model.compile(\n",
" optimizer=\"adam\",\n",
" loss={\n",
" \"question_output\": \"sparse_categorical_crossentropy\",\n",
" \"answer_output\": \"sparse_categorical_crossentropy\",\n",
" \"type_output\": \"categorical_crossentropy\",\n",
" },\n",
" metrics={\n",
" \"question_output\": \"accuracy\",\n",
" \"answer_output\": \"accuracy\",\n",
" \"type_output\": \"accuracy\",\n",
" },\n",
")\n",
"\n",
"model.summary()\n",
"\n",
"# ----------------------------------------------------------------------------\n",
"# 5. TRAINING\n",
"# ----------------------------------------------------------------------------\n",
"model.fit(\n",
" X_train,\n",
" {\n",
" \"question_output\": np.expand_dims(y_q_train, -1),\n",
" \"answer_output\": np.expand_dims(y_a_train, -1),\n",
" \"type_output\": y_type_train,\n",
" },\n",
" batch_size=32,\n",
" epochs=30,\n",
" validation_split=0.1,\n",
" callbacks=[EarlyStopping(patience=3, restore_best_weights=True)],\n",
")\n",
"\n",
"import pickle\n",
"\n",
"\n",
"model.save(\"new_model_lstm_qg.keras\")\n",
"with open(\"tokenizers.pkl\", \"wb\") as f:\n",
" pickle.dump({\n",
" \"token\": token_tok,\n",
" \"ner\": token_ner,\n",
" \"srl\": token_srl,\n",
" \"question\": token_q,\n",
" \"answer\": token_a,\n",
" \"type\": token_type\n",
" }, f)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "06fd86c7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 236ms/step\n",
"\n",
"=== Akurasi Detail ===\n",
"Question Accuracy (Token-level): 0.0000\n",
"Answer Accuracy (Token-level) : 0.0000\n",
"Type Accuracy (Class-level) : 0.68\n"
]
}
],
"source": [
"\n",
"def token_level_accuracy(y_true, y_pred):\n",
" correct = 0\n",
" total = 0\n",
" for true_seq, pred_seq in zip(y_true, y_pred):\n",
" for t, p in zip(true_seq, pred_seq):\n",
" if t != 0: # ignore padding\n",
" total += 1\n",
" if t == p:\n",
" correct += 1\n",
" return correct / total if total > 0 else 0\n",
"\n",
"\n",
"# Predict on test set\n",
"y_pred_q, y_pred_a, y_pred_type = model.predict(X_test)\n",
"\n",
"# Decode predictions to class indices\n",
"y_pred_q = np.argmax(y_pred_q, axis=-1)\n",
"y_pred_a = np.argmax(y_pred_a, axis=-1)\n",
"y_pred_type = np.argmax(y_pred_type, axis=-1)\n",
"y_true_type = np.argmax(y_type_test, axis=-1)\n",
"\n",
"# Calculate token-level accuracy\n",
"acc_q = token_level_accuracy(y_q_test, y_pred_q)\n",
"acc_a = token_level_accuracy(y_a_test, y_pred_a)\n",
"\n",
"# Type classification report\n",
"report_type = classification_report(y_true_type, y_pred_type, zero_division=0)\n",
"\n",
"# Print Results\n",
"print(\"\\n=== Akurasi Detail ===\")\n",
"print(f\"Question Accuracy (Token-level): {acc_q:.4f}\")\n",
"print(f\"Answer Accuracy (Token-level) : {acc_a:.4f}\")\n",
"print(f\"Type Accuracy (Class-level) : {np.mean(y_true_type == y_pred_type):.2f}\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d5ed106c",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# flat_true_a, flat_pred_a = flatten_valid(y_a_test, y_pred_a_class)\n",
"# print(\"\\n=== Classification Report: ANSWER ===\")\n",
"# print(classification_report(flat_true_a, flat_pred_a))\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "aa3860de",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# print(\"\\n=== Classification Report: TYPE ===\")\n",
"# print(classification_report(y_true_type_class, y_pred_type_class))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "myenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
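One step the deleted evaluation cell stops short of is mapping the predicted index sequences back to words, which makes the 0.0000 token-level scores much easier to inspect by eye. A small sketch using the Keras Tokenizer's index_word map; the variable names follow the notebook above:

def decode(seq, tokenizer):
    # index 0 is padding and has no entry in index_word, so skip it
    return [tokenizer.index_word.get(int(i), "UNK") for i in seq if int(i) != 0]

# Compare a few gold questions against the model's argmax decodes.
for true_seq, pred_seq in list(zip(y_q_test, y_pred_q))[:3]:
    print("true:", decode(true_seq, token_q))
    print("pred:", decode(pred_seq, token_q))
    print()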

Binary file not shown.

File diff suppressed because it is too large

dataset/dev_dataset_qg.json Normal file (43404 lines)

File diff suppressed because it is too large

dataset/invalid_data.json Normal file (3785 lines)

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

dataset/new_dataset.json Normal file (10951 lines)

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

dataset/out/test_dts.tsv Normal file (1007 lines)

File diff suppressed because it is too large

File diff suppressed because it is too large

dataset/py_cleanup.py Normal file (174 lines)

@@ -0,0 +1,174 @@
import json
def validate_and_sort_data(input_data, valid_output_file='valid_data.json', invalid_output_file='invalid_data.json'):
"""
Memvalidasi dan mengurutkan data berdasarkan konsistensi panjang token, NER, dan SRL
Args:
input_data: List data yang akan divalidasi
valid_output_file: Nama file untuk data yang valid
invalid_output_file: Nama file untuk data yang tidak valid
"""
valid_data = []
invalid_data = []
for i, item in enumerate(input_data):
# Check that all required fields are present
required_fields = ['context', 'tokens', 'ner', 'srl']
missing_fields = [field for field in required_fields if field not in item]
if missing_fields:
item['validation_error'] = f"Missing fields: {missing_fields}"
invalid_data.append(item)
continue
# Check length consistency
tokens_len = len(item['tokens'])
ner_len = len(item['ner'])
srl_len = len(item['srl'])
# Validate lengths
if tokens_len == ner_len == srl_len:
# Valid record
item['validation_status'] = 'valid'
item['token_count'] = tokens_len
valid_data.append(item)
else:
# Invalid record
item['validation_status'] = 'invalid'
item['validation_error'] = {
'tokens_length': tokens_len,
'ner_length': ner_len,
'srl_length': srl_len,
'issue': 'Length mismatch between tokens, NER, and SRL'
}
invalid_data.append(item)
# Sort valid records by token count (ascending)
valid_data.sort(key=lambda x: x['token_count'])
# Save to JSON files
with open(valid_output_file, 'w', encoding='utf-8') as f:
json.dump(valid_data, f, ensure_ascii=False, indent=2)
with open(invalid_output_file, 'w', encoding='utf-8') as f:
json.dump(invalid_data, f, ensure_ascii=False, indent=2)
# Print statistics
print("=== DATA VALIDATION RESULTS ===")
print(f"Total records: {len(input_data)}")
print(f"Valid records: {len(valid_data)}")
print(f"Invalid records: {len(invalid_data)}")
print("\nOutput files:")
print(f"- Valid records: {valid_output_file}")
print(f"- Invalid records: {invalid_output_file}")
if invalid_data:
print(f"\n=== DETAIL DATA TIDAK VALID ===")
for i, item in enumerate(invalid_data):
if 'validation_error' in item:
if isinstance(item['validation_error'], dict):
error = item['validation_error']
print(f"Data {i+1}: {error['issue']}")
print(f" - Tokens: {error['tokens_length']}")
print(f" - NER: {error['ner_length']}")
print(f" - SRL: {error['srl_length']}")
else:
print(f"Data {i+1}: {item['validation_error']}")
print()
return valid_data, invalid_data
def load_data_from_file(file_path):
"""
Memuat data dari file JSON
Args:
file_path: Path ke file JSON
Returns:
List data
"""
try:
with open(file_path, 'r', encoding='utf-8') as f:
return json.load(f)
except FileNotFoundError:
print(f"File {file_path} tidak ditemukan!")
return []
except json.JSONDecodeError:
print(f"Error parsing JSON dari file {file_path}")
return []
# Example usage
if __name__ == "__main__":
# Sample data from your input
sample_data = [
{
"context": "raden ajeng kartini lahir pada 21 april 1879 di jepara",
"tokens": [
"raden", "ajeng", "kartini", "lahir", "pada", "21",
"april", "1879", "di", "jepara"
],
"ner": ["PER", "PER", "PER", "O", "O", "DATE", "DATE", "DATE", "O", "LOC"],
"srl": [
"ARG0", "ARG0", "ARG0", "V", "O", "ARGM-TMP",
"ARGM-TMP", "ARGM-TMP", "O", "ARGM-LOC"
],
"qas": [
{
"type": "isian",
"question": "Dimana kartini lahir ___",
"answer": "jepara",
"id": "qa_0_q1"
},
{
"type": "true_false",
"question": "Kartini lahir pada tanggal 21 mei 1879 ___",
"options": ["true", "false"],
"answer": "false",
"id": "qa_0_q2"
}
]
},
{
"context": "kerajaan majapahit berdiri pada tahun 1293 di trowulan",
"tokens": [
"kerajaan", "majapahit", "berdiri", "pada",
"tahun", "1293", "di", "trowulan"
],
"ner": ["O", "ORG", "O", "O", "O", "DATE", "O", "LOC"],
"srl": ["ARG1", "ARG1", "V", "O", "O", "ARGM-TMP", "O", "ARGM-LOC"],
"qas": [
{
"type": "opsi",
"question": "Dimana kerajaan majapahit berdiri ___",
"options": ["trowulan", "singasari", "kuta", "banten"],
"answer": "trowulan",
"id": "qa_1_q1"
},
{
"type": "true_false",
"question": "Kerajaan majapahit berdiri pada tahun 1300 ___",
"options": ["true", "false"],
"answer": "false",
"id": "qa_1_q2"
}
]
},
# Example invalid record (mismatched lengths)
{
"context": "contoh data tidak valid",
"tokens": ["contoh", "data", "tidak"],
"ner": ["O", "O"], # Panjang tidak sama dengan tokens
"srl": ["ARG0", "ARG1", "V", "O"], # Panjang tidak sama dengan tokens
"qas": []
}
]
# Run validation
# valid, invalid = validate_and_sort_data(sample_data)
# Or, to load from a file:
data = load_data_from_file('need_clean_dataset.json')
valid, invalid = validate_and_sort_data(data)
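A plausible way the utilities in this commit chain together; the import path and the output file name are assumptions, and json_to_tsv is the converter from the NER_SRL script earlier in this diff:

# Hypothetical pipeline: validate and sort first, then convert the
# surviving records (written to valid_data.json) to TSV.
from py_cleanup import load_data_from_file, validate_and_sort_data

data = load_data_from_file("need_clean_dataset.json")
valid, invalid = validate_and_sort_data(data)
# json_to_tsv("valid_data.json", "cleaned_LNS.tsv")  # assumed output name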

File diff suppressed because it is too large

dataset/valid_data.json Normal file (46460 lines)

File diff suppressed because it is too large

old/QC/cleaned_qg_dataset.json Normal file (34135 lines)

File diff suppressed because it is too large


@@ -0,0 +1,3 @@
[
"B-PER"
]

old/QC/model_tr.ipynb Normal file (329 lines)

@@ -0,0 +1,329 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "94d3889b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-05-10 14:49:40.993078: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
"2025-05-10 14:49:40.996369: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
"2025-05-10 14:49:41.002001: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
"2025-05-10 14:49:41.015917: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
"E0000 00:00:1746863381.035097 166971 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"E0000 00:00:1746863381.038978 166971 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"W0000 00:00:1746863381.049265 166971 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1746863381.049288 166971 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1746863381.049289 166971 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1746863381.049290 166971 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"2025-05-10 14:49:41.052642: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
]
}
],
"source": [
"# -------------------------------------------------\n",
"# 0. Import & Konfigurasi\n",
"# -------------------------------------------------\n",
"import json, pickle\n",
"import numpy as np\n",
"from pathlib import Path\n",
"from collections import Counter\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"import tensorflow as tf\n",
"from tensorflow.keras.preprocessing.text import Tokenizer\n",
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
"from tensorflow.keras.utils import to_categorical\n",
"from tensorflow.keras.layers import (\n",
" Input, Embedding, LSTM, Bidirectional, Dense, Concatenate,\n",
" TimeDistributed\n",
")\n",
"from tensorflow.keras.models import Model\n",
"from tensorflow.keras.callbacks import EarlyStopping\n",
"\n",
"PAD_TOKEN = \"<PAD>\"\n",
"UNK_TOKEN = \"UNK\"\n",
"START_TOKEN = \"<START>\"\n",
"END_TOKEN = \"<END>\"\n",
"MAXLEN_SRC = 100 # Panjang paragraf maksimal\n",
"MAXLEN_TGT = 40 # Panjang pertanyaan/jawaban maksimal\n",
"BATCH = 32\n",
"EPOCHS = 30"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b528b34e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Valid 325 / 325 (invalid index: [])\n"
]
}
],
"source": [
"raw = json.loads(Path(\"normalize_dataset.json\").read_text(encoding=\"utf-8\"))\n",
"\n",
"req = {\"tokens\",\"ner\",\"srl\",\"question\",\"answer\",\"type\"}\n",
"valid, bad = [], []\n",
"for i,item in enumerate(raw):\n",
" if (isinstance(item,dict) and not (req-item.keys())\n",
" and all(isinstance(item[k],list) for k in req-{\"type\"})\n",
" and isinstance(item[\"type\"],str)):\n",
" valid.append(item)\n",
" else:\n",
" bad.append(i)\n",
"\n",
"print(f\"Valid {len(valid)} / {len(raw)} (invalid index: {bad[:10]})\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b18e4617",
"metadata": {},
"outputs": [],
"source": [
"for ex in valid:\n",
" ex[\"question_in\"] = [START_TOKEN] + ex[\"question\"]\n",
" ex[\"question_out\"] = ex[\"question\"] + [END_TOKEN]\n",
"\n",
" ex[\"answer_in\"] = [START_TOKEN] + ex[\"answer\"]\n",
" ex[\"answer_out\"] = ex[\"answer\"] + [END_TOKEN]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "faa30b82",
"metadata": {},
"outputs": [],
"source": [
"tok_token = Tokenizer(oov_token=UNK_TOKEN, filters=\"\")\n",
"tok_ner = Tokenizer(lower=False, filters=\"\")\n",
"tok_srl = Tokenizer(lower=False, filters=\"\")\n",
"tok_q = Tokenizer(oov_token=UNK_TOKEN, filters=\"\")\n",
"tok_a = Tokenizer(oov_token=UNK_TOKEN, filters=\"\")\n",
"tok_type = Tokenizer(lower=False, filters=\"\")\n",
"\n",
"tok_token.fit_on_texts([ex[\"tokens\"] for ex in valid])\n",
"tok_ner.fit_on_texts([ex[\"ner\"] for ex in valid])\n",
"tok_srl.fit_on_texts([ex[\"srl\"] for ex in valid])\n",
"tok_q.fit_on_texts([ex[\"question_in\"]+ex[\"question_out\"] for ex in valid])\n",
"tok_a.fit_on_texts([ex[\"answer_in\"]+ex[\"answer_out\"] for ex in valid])\n",
"tok_type.fit_on_texts([ex[\"type\"] for ex in valid])\n",
"\n",
"# +1 utk padding\n",
"vocab_token = len(tok_token.word_index)+1\n",
"vocab_ner = len(tok_ner.word_index)+1\n",
"vocab_srl = len(tok_srl.word_index)+1\n",
"vocab_q = len(tok_q.word_index)+1\n",
"vocab_a = len(tok_a.word_index)+1\n",
"vocab_type = len(tok_type.word_index)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "c83ce734",
"metadata": {},
"outputs": [],
"source": [
"def seqs(field, tok, maxlen):\n",
" return pad_sequences(\n",
" tok.texts_to_sequences([ex[field] for ex in valid]),\n",
" maxlen=maxlen, padding=\"post\"\n",
" )\n",
"\n",
"X_tok = seqs(\"tokens\", tok_token, MAXLEN_SRC)\n",
"X_ner = seqs(\"ner\", tok_ner, MAXLEN_SRC)\n",
"X_srl = seqs(\"srl\", tok_srl, MAXLEN_SRC)\n",
"\n",
"Q_in = seqs(\"question_in\", tok_q, MAXLEN_TGT)\n",
"Q_out = seqs(\"question_out\", tok_q, MAXLEN_TGT)\n",
"A_in = seqs(\"answer_in\", tok_a, MAXLEN_TGT)\n",
"A_out = seqs(\"answer_out\", tok_a, MAXLEN_TGT)\n",
"\n",
"y_type = to_categorical(\n",
" np.array([seq[0]-1 for seq in tok_type.texts_to_sequences([ex[\"type\"] for ex in valid])]),\n",
" num_classes=vocab_type\n",
")\n",
"\n",
"# Expand dims → (batch, seq, 1) agar cocok dgn sparse_cce\n",
"Q_out = np.expand_dims(Q_out, -1)\n",
"A_out = np.expand_dims(A_out, -1)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "ad3fe7f2",
"metadata": {},
"outputs": [],
"source": [
"(X_tok_tr, X_tok_te,\n",
" X_ner_tr, X_ner_te,\n",
" X_srl_tr, X_srl_te,\n",
" Q_in_tr, Q_in_te,\n",
" Q_out_tr, Q_out_te,\n",
" A_in_tr, A_in_te,\n",
" A_out_tr, A_out_te,\n",
" y_type_tr,y_type_te) = train_test_split(\n",
" X_tok, X_ner, X_srl, Q_in, Q_out, A_in, A_out, y_type,\n",
" test_size=0.2, random_state=42\n",
" )\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "f20abfb5",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-05-10 14:49:43.127764: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)\n"
]
},
{
"ename": "ValueError",
"evalue": "too many values to unpack (expected 3)",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[7], line 10\u001b[0m\n\u001b[1;32m 7\u001b[0m emb_srl \u001b[38;5;241m=\u001b[39m Embedding(vocab_srl, \u001b[38;5;241m16\u001b[39m, mask_zero\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)(enc_srl)\n\u001b[1;32m 9\u001b[0m enc_cat \u001b[38;5;241m=\u001b[39m Concatenate()([emb_tok, emb_ner, emb_srl])\n\u001b[0;32m---> 10\u001b[0m enc_out, state_h, state_c \u001b[38;5;241m=\u001b[39m Bidirectional(\n\u001b[1;32m 11\u001b[0m LSTM(\u001b[38;5;241m256\u001b[39m, return_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, return_sequences\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 12\u001b[0m )(enc_cat)\n\u001b[1;32m 14\u001b[0m \u001b[38;5;66;03m# ---------- Klasifikasi tipe ----------\u001b[39;00m\n\u001b[1;32m 15\u001b[0m type_out \u001b[38;5;241m=\u001b[39m Dense(vocab_type, activation\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msoftmax\u001b[39m\u001b[38;5;124m\"\u001b[39m, name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtype_output\u001b[39m\u001b[38;5;124m\"\u001b[39m)(enc_out)\n",
"\u001b[0;31mValueError\u001b[0m: too many values to unpack (expected 3)"
]
}
],
"source": [
"enc_tok = Input(shape=(None,), name=\"enc_tok\")\n",
"enc_ner = Input(shape=(None,), name=\"enc_ner\")\n",
"enc_srl = Input(shape=(None,), name=\"enc_srl\")\n",
"\n",
"emb_tok = Embedding(vocab_token, 128, mask_zero=True)(enc_tok)\n",
"emb_ner = Embedding(vocab_ner, 16, mask_zero=True)(enc_ner)\n",
"emb_srl = Embedding(vocab_srl, 16, mask_zero=True)(enc_srl)\n",
"\n",
"enc_cat = Concatenate()([emb_tok, emb_ner, emb_srl])\n",
"enc_out, state_h, state_c = Bidirectional(\n",
" LSTM(256, return_state=True, return_sequences=False)\n",
")(enc_cat)\n",
"\n",
"# ---------- Klasifikasi tipe ----------\n",
"type_out = Dense(vocab_type, activation=\"softmax\", name=\"type_output\")(enc_out)\n",
"\n",
"# ---------- Decoder QUESTION ----------\n",
"dec_q_in = Input(shape=(None,), name=\"dec_q_in\")\n",
"dec_q_emb = Embedding(vocab_q, 128, mask_zero=True)(dec_q_in)\n",
"dec_q_lstm = LSTM(256, return_sequences=True)\n",
"dec_q_out = dec_q_lstm(dec_q_emb, initial_state=[state_h, state_c])\n",
"q_out = TimeDistributed(Dense(vocab_q, activation=\"softmax\"), name=\"question_output\")(dec_q_out)\n",
"\n",
"# ---------- Decoder ANSWER ----------\n",
"dec_a_in = Input(shape=(None,), name=\"dec_a_in\")\n",
"dec_a_emb = Embedding(vocab_a, 128, mask_zero=True)(dec_a_in)\n",
"dec_a_lstm = LSTM(256, return_sequences=True)\n",
"dec_a_out = dec_a_lstm(dec_a_emb, initial_state=[state_h, state_c])\n",
"a_out = TimeDistributed(Dense(vocab_a, activation=\"softmax\"), name=\"answer_output\")(dec_a_out)\n",
"\n",
"# ---------- Build & compile ----------\n",
"model = Model(\n",
" inputs=[enc_tok, enc_ner, enc_srl, dec_q_in, dec_a_in],\n",
" outputs=[q_out, a_out, type_out]\n",
")\n",
"\n",
"model.compile(\n",
" optimizer=\"adam\",\n",
" loss={\n",
" \"question_output\": \"sparse_categorical_crossentropy\",\n",
" \"answer_output\" : \"sparse_categorical_crossentropy\",\n",
" \"type_output\" : \"categorical_crossentropy\"\n",
" },\n",
" loss_weights={\n",
" \"question_output\": 1.0,\n",
" \"answer_output\" : 1.0,\n",
" \"type_output\" : 0.3\n",
" },\n",
" metrics={\n",
" \"question_output\": \"accuracy\",\n",
" \"answer_output\" : \"accuracy\",\n",
" \"type_output\" : \"accuracy\"\n",
" }\n",
")\n",
"\n",
"model.summary()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c348406e",
"metadata": {},
"outputs": [],
"source": [
"early = EarlyStopping(patience=3, restore_best_weights=True)\n",
"\n",
"model.fit(\n",
" [X_tok_tr, X_ner_tr, X_srl_tr, Q_in_tr, A_in_tr],\n",
" {\"question_output\": Q_out_tr,\n",
" \"answer_output\" : A_out_tr,\n",
" \"type_output\" : y_type_tr},\n",
" batch_size=BATCH,\n",
" epochs=EPOCHS,\n",
" validation_split=0.1,\n",
" callbacks=[early]\n",
")\n",
"\n",
"# -------------------------------------------------\n",
"# 8. Simpan model & tokenizer\n",
"# -------------------------------------------------\n",
"model.save(\"qg_multitask.keras\")\n",
"with open(\"tokenizers.pkl\", \"wb\") as f:\n",
" pickle.dump({\n",
" \"token\": tok_token,\n",
" \"ner\" : tok_ner,\n",
" \"srl\" : tok_srl,\n",
" \"q\" : tok_q,\n",
" \"a\" : tok_a,\n",
" \"type\" : tok_type\n",
" }, f)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "myenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
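The ValueError captured in the encoder cell comes from the unpacking: with return_state=True, Keras' Bidirectional(LSTM(...)) returns five tensors (the merged output plus forward and backward h/c states), not three. A minimal fix sketch, not the author's code: concatenating the directional states doubles their width, so the decoder LSTMs would need 512 units to accept them.

enc_out, fwd_h, fwd_c, bwd_h, bwd_c = Bidirectional(
    LSTM(256, return_state=True, return_sequences=False)
)(enc_cat)
# Merge the two directions' states so they can seed the decoders.
state_h = Concatenate()([fwd_h, bwd_h])  # shape (None, 512)
state_c = Concatenate()([fwd_c, bwd_c])  # shape (None, 512)
dec_q_lstm = LSTM(512, return_sequences=True)  # was 256; must match state width
dec_a_lstm = LSTM(512, return_sequences=True)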


@@ -13716,10 +13716,10 @@
"type": "tof"
},
{
"tokens": ["Indonesia", "terletak", "di", "Benua", "Afrika", "."],
"tokens": ["Indonesia", "terletak", "di", "Benua", "asia", "."],
"ner": ["B-LOC", "O", "O", "O", "B-LOC", "O"],
"srl": ["ARG1", "V", "ARGM-LOC", "ARGM-LOC", "ARGM-LOC", "O"],
"question": ["Indonesia", "terletak", "di", "Benua", "Afrika", "."],
"question": ["Indonesia", "terletak", "di", "Benua", "asia", "."],
"answer": ["false"],
"type": "tof"
}

old/QC/qg_dataset.json Normal file (1914 lines)

File diff suppressed because it is too large

703
old/QC/qg_train.ipynb Normal file
View File

@ -0,0 +1,703 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 20,
"id": "9bf2159a",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import numpy as np\n",
"from pathlib import Path\n",
"from sklearn.model_selection import train_test_split\n",
"from tensorflow.keras.preprocessing.text import Tokenizer\n",
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
"from tensorflow.keras.utils import to_categorical\n",
"\n",
"from tensorflow.keras.models import Model\n",
"from tensorflow.keras.layers import (\n",
" Input,\n",
" Embedding,\n",
" LSTM,\n",
" Concatenate,\n",
" Dense,\n",
" TimeDistributed,\n",
")\n",
"from tensorflow.keras.callbacks import EarlyStopping\n",
"from sklearn.metrics import classification_report\n",
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "50118278",
"metadata": {},
"outputs": [],
"source": [
"# # Load raw data\n",
"# with open(\"qg_dataset.json\", encoding=\"utf-8\") as f:\n",
"# raw_data = json.load(f)\n",
"\n",
"# # Validasi lengkap\n",
"# required_keys = {\"tokens\", \"ner\", \"srl\", \"question\", \"answer\", \"type\"}\n",
"# valid_data = []\n",
"# invalid_data = []\n",
"\n",
"# for idx, item in enumerate(raw_data):\n",
"# error_messages = []\n",
"\n",
"# if not isinstance(item, dict):\n",
"# error_messages.append(\"bukan dictionary\")\n",
"\n",
"# missing_keys = required_keys - item.keys()\n",
"# if missing_keys:\n",
"# error_messages.append(f\"missing keys: {missing_keys}\")\n",
"\n",
"# if not error_messages:\n",
"# # Cek tipe data dan None\n",
"# if (not isinstance(item[\"tokens\"], list) or\n",
"# not isinstance(item[\"ner\"], list) or\n",
"# not isinstance(item[\"srl\"], list) or\n",
"# not isinstance(item[\"question\"], list) or\n",
"# not isinstance(item[\"answer\"], list) or\n",
"# not isinstance(item[\"type\"], str)):\n",
"# error_messages.append(\"field type tidak sesuai\")\n",
" \n",
"# if error_messages:\n",
"# print(f\"\\n Index {idx} | Masalah: {', '.join(error_messages)}\")\n",
"# print(json.dumps(item, indent=2, ensure_ascii=False))\n",
"# invalid_data.append(item)\n",
"# continue\n",
"\n",
"# valid_data.append(item)\n",
"\n",
"# # Statistik\n",
"# print(f\"\\n Jumlah data valid: {len(valid_data)} / {len(raw_data)}\")\n",
"# print(f\" Jumlah data tidak valid: {len(invalid_data)}\")\n",
"\n",
"# # Proses data valid\n",
"# tokens = [[t.lower().strip() for t in item[\"tokens\"]] for item in valid_data]\n",
"# ner_tags = [item[\"ner\"] for item in valid_data]\n",
"# srl_tags = [item[\"srl\"] for item in valid_data]\n",
"# questions = [[token.lower().strip() for token in item[\"question\"]] for item in valid_data]\n",
"# answers = [[token.lower().strip() for token in item[\"answer\"]] for item in valid_data]\n",
"# types = [item[\"type\"] for item in valid_data]\n",
"\n",
"# type_counts = Counter(types)\n",
"\n",
"# print(type_counts)\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "970867e2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Jumlah data valid: 396 / 397\n",
"Jumlah data tidak valid: 1\n",
"\n",
"Distribusi Tipe Soal:\n",
"- isian: 390\n",
"- opsi: 4\n",
"- true_false: 2\n"
]
}
],
"source": [
"import json\n",
"from collections import Counter\n",
"\n",
"# Load raw data\n",
"with open(\"../../dataset/dev_dataset_qg.json\", encoding=\"utf-8\") as f:\n",
" raw_data = json.load(f)\n",
"\n",
"# Validasi lengkap\n",
"required_keys = {\"tokens\", \"ner\", \"srl\", \"quiz_possibility\"}\n",
"valid_data = []\n",
"invalid_data = []\n",
"\n",
"for idx, item in enumerate(raw_data):\n",
" error_messages = []\n",
"\n",
" if not isinstance(item, dict):\n",
" error_messages.append(\"bukan dictionary\")\n",
" invalid_data.append(item)\n",
" continue\n",
"\n",
" missing_keys = required_keys - item.keys()\n",
" if missing_keys:\n",
" error_messages.append(f\"missing keys: {missing_keys}\")\n",
"\n",
" if not error_messages:\n",
" # Cek tipe data utama\n",
" if (not isinstance(item[\"tokens\"], list) or\n",
" not isinstance(item[\"ner\"], list) or\n",
" not isinstance(item[\"srl\"], list) or\n",
" not isinstance(item[\"quiz_possibility\"], list)):\n",
" error_messages.append(\"field type tidak sesuai di level utama\")\n",
"\n",
" # Validasi quiz_possibility\n",
" if not error_messages:\n",
" if not item[\"quiz_possibility\"]:\n",
" error_messages.append(\"quiz_possibility kosong\")\n",
" else:\n",
" quiz_item = item[\"quiz_possibility\"][0]\n",
"\n",
" # Validasi kunci di dalam quiz_possibility[0]\n",
" expected_quiz_keys = {\"type\", \"question\", \"answer\"}\n",
" missing_quiz_keys = expected_quiz_keys - quiz_item.keys()\n",
"\n",
" if missing_quiz_keys:\n",
" error_messages.append(f\"missing keys di quiz_possibility[0]: {missing_quiz_keys}\")\n",
" else:\n",
" # Cek tipe data di quiz_possibility[0]\n",
" if (not isinstance(quiz_item[\"type\"], str) or\n",
" not isinstance(quiz_item[\"question\"], list) or\n",
" not isinstance(quiz_item[\"answer\"], list)):\n",
" error_messages.append(\"field type tidak sesuai di quiz_possibility[0]\")\n",
" else:\n",
" # Flatten ke struktur lama untuk konsistensi\n",
" item[\"type\"] = quiz_item[\"type\"]\n",
" item[\"question\"] = quiz_item[\"question\"]\n",
" item[\"answer\"] = quiz_item[\"answer\"]\n",
"\n",
" if error_messages:\n",
" print(f\"\\nIndex {idx} | Masalah: {', '.join(error_messages)}\")\n",
" print(json.dumps(item, indent=2, ensure_ascii=False))\n",
" invalid_data.append(item)\n",
" continue\n",
"\n",
" valid_data.append(item)\n",
"\n",
"# Statistik\n",
"print(f\"\\nJumlah data valid: {len(valid_data)} / {len(raw_data)}\")\n",
"print(f\"Jumlah data tidak valid: {len(invalid_data)}\")\n",
"\n",
"# Proses data valid\n",
"tokens = [[t.lower().strip() for t in item[\"tokens\"]] for item in valid_data]\n",
"ner_tags = [item[\"ner\"] for item in valid_data]\n",
"srl_tags = [item[\"srl\"] for item in valid_data]\n",
"questions = [[token.lower().strip() for token in item[\"question\"]] for item in valid_data]\n",
"answers = [[token.lower().strip() for token in item[\"answer\"]] for item in valid_data]\n",
"types = [item[\"type\"].lower().strip() for item in valid_data] # Konsistensi lowercase untuk tipe\n",
"\n",
"# Statistik tipe soal\n",
"type_counts = Counter(types)\n",
"print(\"\\nDistribusi Tipe Soal:\")\n",
"for t, count in type_counts.items():\n",
" print(f\"- {t}: {count}\")\n",
"\n",
"# (Opsional) Simpan data valid\n",
"with open(\"cleaned_qg_dataset.json\", \"w\", encoding=\"utf-8\") as f:\n",
" json.dump(valid_data, f, ensure_ascii=False, indent=2)\n",
"\n",
"# (Opsional) Simpan data tidak valid untuk analisa\n",
"with open(\"invalid_qg_dataset.json\", \"w\", encoding=\"utf-8\") as f:\n",
" json.dump(invalid_data, f, ensure_ascii=False, indent=2)\n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "4e3a0088",
"metadata": {},
"outputs": [],
"source": [
"# tokenize\n",
"token_tok = Tokenizer(lower=False, oov_token=\"UNK\")\n",
"token_ner = Tokenizer(lower=False)\n",
"token_srl = Tokenizer(lower=False)\n",
"token_q = Tokenizer(lower=False)\n",
"token_a = Tokenizer(lower=False)\n",
"token_type = Tokenizer(lower=False)\n",
"\n",
"token_tok.fit_on_texts(tokens)\n",
"token_ner.fit_on_texts(ner_tags)\n",
"token_srl.fit_on_texts(srl_tags)\n",
"token_q.fit_on_texts(questions)\n",
"token_a.fit_on_texts(answers)\n",
"token_type.fit_on_texts(types)\n",
"\n",
"\n",
"maxlen = 20"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "555f9e22",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'opsi', 'isian', 'true_false'}\n"
]
}
],
"source": [
"\n",
"X_tok = pad_sequences(\n",
" token_tok.texts_to_sequences(tokens), padding=\"post\", maxlen=maxlen\n",
")\n",
"X_ner = pad_sequences(\n",
" token_ner.texts_to_sequences(ner_tags), padding=\"post\", maxlen=maxlen\n",
")\n",
"X_srl = pad_sequences(\n",
" token_srl.texts_to_sequences(srl_tags), padding=\"post\", maxlen=maxlen\n",
")\n",
"y_q = pad_sequences(token_q.texts_to_sequences(questions), padding=\"post\", maxlen=maxlen)\n",
"y_a = pad_sequences(token_a.texts_to_sequences(answers), padding=\"post\", maxlen=maxlen)\n",
"\n",
"print(set(types))\n",
"\n",
"y_type = [seq[0] for seq in token_type.texts_to_sequences(types)] # list of int\n",
"y_type = to_categorical(np.array(y_type) - 1, num_classes=len(token_type.word_index))\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "f530cfe7",
"metadata": {},
"outputs": [],
"source": [
"X_tok_train, X_tok_test, X_ner_train, X_ner_test, X_srl_train, X_srl_test, \\\n",
"y_q_train, y_q_test, y_a_train, y_a_test, y_type_train, y_type_test = train_test_split(\n",
" X_tok, X_ner, X_srl, y_q, y_a, y_type, test_size=0.2, random_state=42\n",
")\n",
"\n",
"X_train = [X_tok_train, X_ner_train, X_srl_train]\n",
"X_test = [X_tok_test, X_ner_test, X_srl_test]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "255e2a9a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">Model: \"functional_1\"</span>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1mModel: \"functional_1\"\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
"┃<span style=\"font-weight: bold\"> Layer (type) </span>┃<span style=\"font-weight: bold\"> Output Shape </span>┃<span style=\"font-weight: bold\"> Param # </span>┃<span style=\"font-weight: bold\"> Connected to </span>┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
"│ tok_input │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ - │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">InputLayer</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ ner_input │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ - │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">InputLayer</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ srl_input │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ - │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">InputLayer</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ embedding_3 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">128</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">116,992</span> │ tok_input[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ embedding_4 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">16</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">704</span> │ ner_input[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ embedding_5 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">16</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">336</span> │ srl_input[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ concatenate_1 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">160</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ embedding_3[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>… │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Concatenate</span>) │ │ │ embedding_4[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>… │\n",
"│ │ │ │ embedding_5[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ lstm_1 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">LSTM</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">427,008</span> │ concatenate_1[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>]… │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ get_item_1 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ lstm_1[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">GetItem</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ question_output │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">479</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">123,103</span> │ lstm_1[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">TimeDistributed</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ answer_output │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">308</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">79,156</span> │ lstm_1[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">TimeDistributed</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ type_output (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">4</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">1,028</span> │ get_item_1[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
"└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n",
"</pre>\n"
],
"text/plain": [
"┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
"┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mConnected to \u001b[0m\u001b[1m \u001b[0m┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
"│ tok_input │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
"│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ ner_input │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
"│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ srl_input │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
"│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ embedding_3 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m116,992\u001b[0m │ tok_input[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ (\u001b[38;5;33mEmbedding\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ embedding_4 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m16\u001b[0m) │ \u001b[38;5;34m704\u001b[0m │ ner_input[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ (\u001b[38;5;33mEmbedding\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ embedding_5 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m16\u001b[0m) │ \u001b[38;5;34m336\u001b[0m │ srl_input[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ (\u001b[38;5;33mEmbedding\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ concatenate_1 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m160\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ embedding_3[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m… │\n",
"│ (\u001b[38;5;33mConcatenate\u001b[0m) │ │ │ embedding_4[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m… │\n",
"│ │ │ │ embedding_5[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ lstm_1 (\u001b[38;5;33mLSTM\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m427,008\u001b[0m │ concatenate_1[\u001b[38;5;34m0\u001b[0m]… │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ get_item_1 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ lstm_1[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ (\u001b[38;5;33mGetItem\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ question_output │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m479\u001b[0m) │ \u001b[38;5;34m123,103\u001b[0m │ lstm_1[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ (\u001b[38;5;33mTimeDistributed\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ answer_output │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m308\u001b[0m) │ \u001b[38;5;34m79,156\u001b[0m │ lstm_1[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ (\u001b[38;5;33mTimeDistributed\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ type_output (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m4\u001b[0m) │ \u001b[38;5;34m1,028\u001b[0m │ get_item_1[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Total params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">748,327</span> (2.85 MB)\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1m Total params: \u001b[0m\u001b[38;5;34m748,327\u001b[0m (2.85 MB)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">748,327</span> (2.85 MB)\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m748,327\u001b[0m (2.85 MB)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Non-trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> (0.00 B)\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 176ms/step - answer_output_accuracy: 0.4544 - answer_output_loss: 5.6455 - loss: 13.1436 - question_output_accuracy: 0.3565 - question_output_loss: 6.1017 - type_output_accuracy: 0.6386 - type_output_loss: 1.3766 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 5.0547 - val_loss: 12.0109 - val_question_output_accuracy: 0.6844 - val_question_output_loss: 5.6110 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 1.3453\n",
"Epoch 2/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 54ms/step - answer_output_accuracy: 0.9145 - answer_output_loss: 4.3849 - loss: 10.8584 - question_output_accuracy: 0.6760 - question_output_loss: 5.0255 - type_output_accuracy: 0.9758 - type_output_loss: 1.3371 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 2.1055 - val_loss: 6.1782 - val_question_output_accuracy: 0.6844 - val_question_output_loss: 2.7704 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 1.3023\n",
"Epoch 3/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 55ms/step - answer_output_accuracy: 0.9095 - answer_output_loss: 1.7129 - loss: 5.4664 - question_output_accuracy: 0.6777 - question_output_loss: 2.4346 - type_output_accuracy: 0.9795 - type_output_loss: 1.2889 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 1.0023 - val_loss: 4.2358 - val_question_output_accuracy: 0.6844 - val_question_output_loss: 2.0019 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 1.2316\n",
"Epoch 4/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 54ms/step - answer_output_accuracy: 0.9140 - answer_output_loss: 0.9210 - loss: 4.2240 - question_output_accuracy: 0.6804 - question_output_loss: 2.1028 - type_output_accuracy: 0.9812 - type_output_loss: 1.2037 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.7526 - val_loss: 4.0127 - val_question_output_accuracy: 0.6844 - val_question_output_loss: 2.1652 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 1.0949\n",
"Epoch 5/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 52ms/step - answer_output_accuracy: 0.9131 - answer_output_loss: 0.7388 - loss: 4.0409 - question_output_accuracy: 0.6753 - question_output_loss: 2.2497 - type_output_accuracy: 0.9832 - type_output_loss: 1.0455 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.6789 - val_loss: 3.6821 - val_question_output_accuracy: 0.6844 - val_question_output_loss: 2.1028 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.9003\n",
"Epoch 6/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 52ms/step - answer_output_accuracy: 0.9190 - answer_output_loss: 0.6585 - loss: 3.5809 - question_output_accuracy: 0.6788 - question_output_loss: 2.0865 - type_output_accuracy: 0.9797 - type_output_loss: 0.8341 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.6491 - val_loss: 3.3418 - val_question_output_accuracy: 0.6844 - val_question_output_loss: 2.0148 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.6779\n",
"Epoch 7/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 54ms/step - answer_output_accuracy: 0.9165 - answer_output_loss: 0.6312 - loss: 3.2776 - question_output_accuracy: 0.6763 - question_output_loss: 2.0259 - type_output_accuracy: 0.9695 - type_output_loss: 0.6233 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.6313 - val_loss: 3.1431 - val_question_output_accuracy: 0.6844 - val_question_output_loss: 2.0432 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.4687\n",
"Epoch 8/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 53ms/step - answer_output_accuracy: 0.9148 - answer_output_loss: 0.6209 - loss: 3.0631 - question_output_accuracy: 0.6762 - question_output_loss: 2.0136 - type_output_accuracy: 0.9708 - type_output_loss: 0.4301 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.6193 - val_loss: 2.9071 - val_question_output_accuracy: 0.6844 - val_question_output_loss: 1.9849 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.3029\n",
"Epoch 9/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 54ms/step - answer_output_accuracy: 0.9155 - answer_output_loss: 0.6067 - loss: 2.7923 - question_output_accuracy: 0.6799 - question_output_loss: 1.9057 - type_output_accuracy: 0.9747 - type_output_loss: 0.2789 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.6109 - val_loss: 2.7805 - val_question_output_accuracy: 0.6844 - val_question_output_loss: 1.9768 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.1928\n",
"Epoch 10/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 54ms/step - answer_output_accuracy: 0.9160 - answer_output_loss: 0.5715 - loss: 2.6738 - question_output_accuracy: 0.6770 - question_output_loss: 1.9091 - type_output_accuracy: 0.9784 - type_output_loss: 0.1873 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.6033 - val_loss: 2.6801 - val_question_output_accuracy: 0.6844 - val_question_output_loss: 1.9506 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.1262\n",
"Epoch 11/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 55ms/step - answer_output_accuracy: 0.9159 - answer_output_loss: 0.5691 - loss: 2.5854 - question_output_accuracy: 0.6791 - question_output_loss: 1.8621 - type_output_accuracy: 0.9743 - type_output_loss: 0.1495 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.5962 - val_loss: 2.5971 - val_question_output_accuracy: 0.7031 - val_question_output_loss: 1.9119 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0890\n",
"Epoch 12/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 54ms/step - answer_output_accuracy: 0.9151 - answer_output_loss: 0.5528 - loss: 2.4857 - question_output_accuracy: 0.6954 - question_output_loss: 1.8064 - type_output_accuracy: 0.9765 - type_output_loss: 0.1240 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.5907 - val_loss: 2.5231 - val_question_output_accuracy: 0.7031 - val_question_output_loss: 1.8654 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0670\n",
"Epoch 13/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 55ms/step - answer_output_accuracy: 0.9116 - answer_output_loss: 0.5741 - loss: 2.4910 - question_output_accuracy: 0.6913 - question_output_loss: 1.7912 - type_output_accuracy: 0.9721 - type_output_loss: 0.1279 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.5874 - val_loss: 2.4624 - val_question_output_accuracy: 0.7031 - val_question_output_loss: 1.8207 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0543\n",
"Epoch 14/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 59ms/step - answer_output_accuracy: 0.9142 - answer_output_loss: 0.5370 - loss: 2.4278 - question_output_accuracy: 0.6900 - question_output_loss: 1.7686 - type_output_accuracy: 0.9730 - type_output_loss: 0.1186 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.5837 - val_loss: 2.4136 - val_question_output_accuracy: 0.7031 - val_question_output_loss: 1.7833 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0466\n",
"Epoch 15/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 55ms/step - answer_output_accuracy: 0.9160 - answer_output_loss: 0.5186 - loss: 2.3183 - question_output_accuracy: 0.6898 - question_output_loss: 1.7028 - type_output_accuracy: 0.9784 - type_output_loss: 0.1001 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.5794 - val_loss: 2.3714 - val_question_output_accuracy: 0.7109 - val_question_output_loss: 1.7506 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0414\n",
"Epoch 16/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 54ms/step - answer_output_accuracy: 0.9171 - answer_output_loss: 0.5077 - loss: 2.2275 - question_output_accuracy: 0.7036 - question_output_loss: 1.6393 - type_output_accuracy: 0.9791 - type_output_loss: 0.0876 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.5748 - val_loss: 2.3340 - val_question_output_accuracy: 0.7172 - val_question_output_loss: 1.7214 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0379\n",
"Epoch 17/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 55ms/step - answer_output_accuracy: 0.9137 - answer_output_loss: 0.5248 - loss: 2.2290 - question_output_accuracy: 0.7070 - question_output_loss: 1.6285 - type_output_accuracy: 0.9828 - type_output_loss: 0.0771 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5716 - val_loss: 2.3017 - val_question_output_accuracy: 0.7172 - val_question_output_loss: 1.6946 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0355\n",
"Epoch 18/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 54ms/step - answer_output_accuracy: 0.9233 - answer_output_loss: 0.5080 - loss: 2.2392 - question_output_accuracy: 0.7059 - question_output_loss: 1.6139 - type_output_accuracy: 0.9678 - type_output_loss: 0.1205 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5676 - val_loss: 2.2777 - val_question_output_accuracy: 0.7219 - val_question_output_loss: 1.6760 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0341\n",
"Epoch 19/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 53ms/step - answer_output_accuracy: 0.9221 - answer_output_loss: 0.5038 - loss: 2.1188 - question_output_accuracy: 0.7131 - question_output_loss: 1.5706 - type_output_accuracy: 0.9854 - type_output_loss: 0.0616 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5639 - val_loss: 2.2545 - val_question_output_accuracy: 0.7203 - val_question_output_loss: 1.6580 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0326\n",
"Epoch 20/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 54ms/step - answer_output_accuracy: 0.9175 - answer_output_loss: 0.5233 - loss: 2.1645 - question_output_accuracy: 0.7128 - question_output_loss: 1.5526 - type_output_accuracy: 0.9775 - type_output_loss: 0.0858 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5603 - val_loss: 2.2376 - val_question_output_accuracy: 0.7234 - val_question_output_loss: 1.6450 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0323\n",
"Epoch 21/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 54ms/step - answer_output_accuracy: 0.9193 - answer_output_loss: 0.5090 - loss: 2.1288 - question_output_accuracy: 0.7118 - question_output_loss: 1.5447 - type_output_accuracy: 0.9828 - type_output_loss: 0.0644 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5568 - val_loss: 2.2206 - val_question_output_accuracy: 0.7219 - val_question_output_loss: 1.6317 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0321\n",
"Epoch 22/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 54ms/step - answer_output_accuracy: 0.9204 - answer_output_loss: 0.4971 - loss: 2.0726 - question_output_accuracy: 0.7128 - question_output_loss: 1.5100 - type_output_accuracy: 0.9817 - type_output_loss: 0.0626 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5535 - val_loss: 2.2055 - val_question_output_accuracy: 0.7359 - val_question_output_loss: 1.6200 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0320\n",
"Epoch 23/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 52ms/step - answer_output_accuracy: 0.9191 - answer_output_loss: 0.5003 - loss: 2.1218 - question_output_accuracy: 0.7108 - question_output_loss: 1.5310 - type_output_accuracy: 0.9762 - type_output_loss: 0.0771 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5517 - val_loss: 2.1920 - val_question_output_accuracy: 0.7234 - val_question_output_loss: 1.6081 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0322\n",
"Epoch 24/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 53ms/step - answer_output_accuracy: 0.9220 - answer_output_loss: 0.4808 - loss: 2.0044 - question_output_accuracy: 0.7175 - question_output_loss: 1.4722 - type_output_accuracy: 0.9810 - type_output_loss: 0.0608 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5494 - val_loss: 2.1723 - val_question_output_accuracy: 0.7312 - val_question_output_loss: 1.5905 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0323\n",
"Epoch 25/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 58ms/step - answer_output_accuracy: 0.9183 - answer_output_loss: 0.4965 - loss: 2.0500 - question_output_accuracy: 0.7174 - question_output_loss: 1.4835 - type_output_accuracy: 0.9775 - type_output_loss: 0.0676 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5473 - val_loss: 2.1609 - val_question_output_accuracy: 0.7328 - val_question_output_loss: 1.5810 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0326\n",
"Epoch 26/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 52ms/step - answer_output_accuracy: 0.9236 - answer_output_loss: 0.4672 - loss: 1.9620 - question_output_accuracy: 0.7220 - question_output_loss: 1.4313 - type_output_accuracy: 0.9780 - type_output_loss: 0.0672 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5454 - val_loss: 2.1488 - val_question_output_accuracy: 0.7344 - val_question_output_loss: 1.5705 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0328\n",
"Epoch 27/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 70ms/step - answer_output_accuracy: 0.9219 - answer_output_loss: 0.4671 - loss: 1.9415 - question_output_accuracy: 0.7288 - question_output_loss: 1.4130 - type_output_accuracy: 0.9765 - type_output_loss: 0.0605 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5440 - val_loss: 2.1382 - val_question_output_accuracy: 0.7359 - val_question_output_loss: 1.5615 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0327\n",
"Epoch 28/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 52ms/step - answer_output_accuracy: 0.9212 - answer_output_loss: 0.4676 - loss: 1.9277 - question_output_accuracy: 0.7271 - question_output_loss: 1.4106 - type_output_accuracy: 0.9823 - type_output_loss: 0.0526 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5435 - val_loss: 2.1317 - val_question_output_accuracy: 0.7422 - val_question_output_loss: 1.5559 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0323\n",
"Epoch 29/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 57ms/step - answer_output_accuracy: 0.9228 - answer_output_loss: 0.4658 - loss: 1.8773 - question_output_accuracy: 0.7397 - question_output_loss: 1.3683 - type_output_accuracy: 0.9823 - type_output_loss: 0.0487 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5428 - val_loss: 2.1239 - val_question_output_accuracy: 0.7437 - val_question_output_loss: 1.5493 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0319\n",
"Epoch 30/30\n",
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 56ms/step - answer_output_accuracy: 0.9207 - answer_output_loss: 0.4658 - loss: 1.9146 - question_output_accuracy: 0.7355 - question_output_loss: 1.3799 - type_output_accuracy: 0.9795 - type_output_loss: 0.0563 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5421 - val_loss: 2.1174 - val_question_output_accuracy: 0.7437 - val_question_output_loss: 1.5436 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0317\n"
]
}
],
"source": [
"\n",
"inp_tok = Input(shape=(None,), name=\"tok_input\")\n",
"inp_ner = Input(shape=(None,), name=\"ner_input\")\n",
"inp_srl = Input(shape=(None,), name=\"srl_input\")\n",
"\n",
"emb_tok = Embedding(input_dim=len(token_tok.word_index) + 1, output_dim=128)(inp_tok)\n",
"emb_ner = Embedding(input_dim=len(token_ner.word_index) + 1, output_dim=16)(inp_ner)\n",
"emb_srl = Embedding(input_dim=len(token_srl.word_index) + 1, output_dim=16)(inp_srl)\n",
"\n",
"# emb_tok = Embedding(input_dim=..., output_dim=..., mask_zero=True)(inp_tok)\n",
"# emb_ner = Embedding(input_dim=..., output_dim=..., mask_zero=True)(inp_ner)\n",
"# emb_srl = Embedding(input_dim=..., output_dim=..., mask_zero=True)(inp_srl)\n",
"\n",
"merged = Concatenate()([emb_tok, emb_ner, emb_srl])\n",
"\n",
"x = LSTM(256, return_sequences=True)(merged)\n",
"\n",
"out_question = TimeDistributed(Dense(len(token_q.word_index) + 1, activation=\"softmax\"), name=\"question_output\")(x)\n",
"out_answer = TimeDistributed(Dense(len(token_a.word_index) + 1, activation=\"softmax\"), name=\"answer_output\")(x)\n",
"out_type = Dense(len(token_type.word_index), activation=\"softmax\", name=\"type_output\")(\n",
" x[:, 0, :]\n",
") # gunakan step pertama\n",
"\n",
"model = Model(\n",
" inputs=[inp_tok, inp_ner, inp_srl], outputs=[out_question, out_answer, out_type]\n",
")\n",
"model.compile(\n",
" optimizer=\"adam\",\n",
" loss={\n",
" \"question_output\": \"sparse_categorical_crossentropy\",\n",
" \"answer_output\": \"sparse_categorical_crossentropy\",\n",
" \"type_output\": \"categorical_crossentropy\",\n",
" },\n",
" metrics={\n",
" \"question_output\": \"accuracy\",\n",
" \"answer_output\": \"accuracy\",\n",
" \"type_output\": \"accuracy\",\n",
" },\n",
")\n",
"\n",
"model.summary()\n",
"\n",
"# ----------------------------------------------------------------------------\n",
"# 5. TRAINING\n",
"# ----------------------------------------------------------------------------\n",
"model.fit(\n",
" X_train,\n",
" {\n",
" \"question_output\": np.expand_dims(y_q_train, -1),\n",
" \"answer_output\": np.expand_dims(y_a_train, -1),\n",
" \"type_output\": y_type_train,\n",
" },\n",
" batch_size=64,\n",
" epochs=30,\n",
" validation_split=0.1,\n",
" callbacks=[EarlyStopping(patience=3, restore_best_weights=True)],\n",
")\n",
"\n",
"import pickle\n",
"\n",
"\n",
"model.save(\"new_model_lstm_qg.keras\")\n",
"with open(\"tokenizers.pkl\", \"wb\") as f:\n",
" pickle.dump({\n",
" \"token\": token_tok,\n",
" \"ner\": token_ner,\n",
" \"srl\": token_srl,\n",
" \"question\": token_q,\n",
" \"answer\": token_a,\n",
" \"type\": token_type\n",
" }, f)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "06fd86c7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m3/3\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 137ms/step\n",
"\n",
"=== Akurasi Detail ===\n",
"Question Accuracy (Token-level): 0.1519\n",
"Answer Accuracy (Token-level) : 0.0638\n",
"Type Accuracy (Class-level) : 1.00\n"
]
}
],
"source": [
"\n",
"def token_level_accuracy(y_true, y_pred):\n",
" correct = 0\n",
" total = 0\n",
" for true_seq, pred_seq in zip(y_true, y_pred):\n",
" for t, p in zip(true_seq, pred_seq):\n",
" if t != 0: # ignore padding\n",
" total += 1\n",
" if t == p:\n",
" correct += 1\n",
" return correct / total if total > 0 else 0\n",
"\n",
"\n",
"# Predict on test set\n",
"y_pred_q, y_pred_a, y_pred_type = model.predict(X_test)\n",
"\n",
"# Decode predictions to class indices\n",
"y_pred_q = np.argmax(y_pred_q, axis=-1)\n",
"y_pred_a = np.argmax(y_pred_a, axis=-1)\n",
"y_pred_type = np.argmax(y_pred_type, axis=-1)\n",
"y_true_type = np.argmax(y_type_test, axis=-1)\n",
"\n",
"# Calculate token-level accuracy\n",
"acc_q = token_level_accuracy(y_q_test, y_pred_q)\n",
"acc_a = token_level_accuracy(y_a_test, y_pred_a)\n",
"\n",
"# Type classification report\n",
"report_type = classification_report(y_true_type, y_pred_type, zero_division=0)\n",
"\n",
"# Print Results\n",
"print(\"\\n=== Akurasi Detail ===\")\n",
"print(f\"Question Accuracy (Token-level): {acc_q:.4f}\")\n",
"print(f\"Answer Accuracy (Token-level) : {acc_a:.4f}\")\n",
"print(f\"Type Accuracy (Class-level) : {np.mean(y_true_type == y_pred_type):.2f}\")"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "b17b6470",
"metadata": {},
"outputs": [],
"source": [
"# import sacrebleu\n",
"# from sacrebleu.metrics import BLEU # optional kalau mau smoothing/effective_order\n",
"\n",
"# idx2tok = {v:k for k,v in word2idx.items()}\n",
"# PAD_ID = word2idx[\"PAD\"]\n",
"# SOS_ID = word2idx.get(\"SOS\", None)\n",
"# EOS_ID = word2idx.get(\"EOS\", None)\n",
"\n",
"# def seq2str(seq):\n",
"# \"\"\"Konversi list index -> kalimat string, sambil buang token spesial.\"\"\"\n",
"# toks = [idx2tok[i] for i in seq\n",
"# if i not in {PAD_ID, SOS_ID, EOS_ID}]\n",
"# return \" \".join(toks).strip().lower()\n",
"\n",
"# bleu_metric = BLEU(effective_order=True) # lebih stabil utk kalimat pendek\n",
"\n",
"# def bleu_corpus(pred_seqs, true_seqs):\n",
"# preds = [seq2str(p) for p in pred_seqs]\n",
"# refs = [[seq2str(t)] for t in true_seqs] # listoflist, satu ref/kalimat\n",
"# return bleu_metric.corpus_score(preds, refs).score\n"
]
},
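{
"cell_type": "code",
"execution_count": null,
"id": "bleu-eval-sketch",
"metadata": {},
"outputs": [],
"source": [
"# A runnable sketch of the BLEU evaluation the commented-out cell above aims at,\n",
"# rewritten around this notebook's own objects (token_q, y_pred_q, y_q_test)\n",
"# instead of the undefined word2idx. Assumes the sacrebleu package is installed.\n",
"from sacrebleu.metrics import BLEU\n",
"\n",
"def seq2str(seq, tok=token_q):\n",
"    # drop padding (id 0) and map ids back to words\n",
"    return \" \".join(tok.index_word.get(int(i), \"UNK\") for i in seq if int(i) != 0)\n",
"\n",
"bleu_metric = BLEU(effective_order=True)  # effective_order is more stable for short sentences\n",
"preds = [seq2str(p) for p in y_pred_q]\n",
"refs = [[seq2str(t) for t in y_q_test]]  # a single reference stream, as sacrebleu expects\n",
"print(\"Question BLEU:\", bleu_metric.corpus_score(preds, refs).score)\n"
]
},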
{
"cell_type": "code",
"execution_count": 29,
"id": "d5ed106c",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# flat_true_a, flat_pred_a = flatten_valid(y_a_test, y_pred_a_class)\n",
"# print(\"\\n=== Classification Report: ANSWER ===\")\n",
"# print(classification_report(flat_true_a, flat_pred_a))\n"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "aa3860de",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# print(\"\\n=== Classification Report: TYPE ===\")\n",
"# print(classification_report(y_true_type_class, y_pred_type_class))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "myenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -6,10 +6,10 @@ import numpy as np
def infer_from_input(input_data, maxlen=50):
with open("QC/tokenizers.pkl", "rb") as f:
with open("tokenizers.pkl", "rb") as f:
tokenizers = pickle.load(f)
model = load_model("QC/new_model_lstm_qg.keras")
model = load_model("new_model_lstm_qg.keras")
tok_token = tokenizers["token"]
tok_ner = tokenizers["ner"]
@ -63,42 +63,34 @@ if __name__ == "__main__":
# Example input
input_data = {
"tokens": [
"Ki",
"Hajar",
"Dewantara",
"lahir",
"pada",
"2",
"Mei",
"1889",
"di",
"Yogyakarta",
"Mars",
"disebut",
"juga",
"sebagai",
"planet",
"merah",
"karena",
"permukaannya",
"banyak",
"mengandung",
"zat",
"besi",
".",
],
"ner": [
"B-PER",
"I-PER",
"I-PER",
"O",
"O",
"B-DATE",
"I-DATE",
"I-DATE",
"O",
"B-LOC",
"O",
],
"ner": ["B-LOC", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"],
"srl": [
"ARG0",
"ARG0",
"ARG0",
"V",
"O",
"ARGM-TMP",
"ARGM-TMP",
"ARGM-TMP",
"O",
"ARGM-LOC",
"ARG1",
"ARG1",
"ARGM-CAU",
"ARG1",
"ARGM-MNR",
"ARGM-MNR",
"ARG1",
"ARG1",
"O",
],
}

View File

@ -0,0 +1,332 @@
import numpy as np
import pandas as pd
import json
import random
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
Input,
LSTM,
Dense,
Embedding,
Bidirectional,
Concatenate,
Attention,
Dropout,
)
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re
import string
from collections import Counter
with open("../dataset/stable_qg_qa_train_dataset.json", "r") as f:
data = json.load(f)
# Preprocessing function
def preprocess_text(text):
"""Melakukan preprocessing teks dasar"""
text = text.lower()
text = re.sub(r"\s+", " ", text).strip()
return text
# Prepare the data for the model
def prepare_data(data):
"""Prepare the data for the model"""
contexts = []
tokens_list = []
ner_list = []
srl_list = []
questions = []
answers = []
q_types = []
for item in data:
for qa in item["qas"]:
contexts.append(preprocess_text(item["context"]))
tokens_list.append(item["tokens"])
ner_list.append(item["ner"])
srl_list.append(item["srl"])
questions.append(preprocess_text(qa["question"]))
answers.append(qa["answer"])
q_types.append(qa["type"])
return contexts, tokens_list, ner_list, srl_list, questions, answers, q_types
# Prepare the data
contexts, tokens_list, ner_list, srl_list, questions, answers, q_types = prepare_data(
data
)
# Tokenizer for text (context and question)
max_vocab_size = 10000
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(contexts + questions + [" ".join(item) for item in tokens_list])
vocab_size = len(tokenizer.word_index) + 1
# Encoding for NER tags
ner_tokenizer = Tokenizer(oov_token="<OOV>")
ner_tokenizer.fit_on_texts([" ".join(ner) for ner in ner_list])
ner_vocab_size = len(ner_tokenizer.word_index) + 1
# Encoding for SRL tags
srl_tokenizer = Tokenizer(oov_token="<OOV>")
srl_tokenizer.fit_on_texts([" ".join(srl) for srl in srl_list])
srl_vocab_size = len(srl_tokenizer.word_index) + 1
# Encoding for question types
q_type_tokenizer = Tokenizer()
q_type_tokenizer.fit_on_texts(q_types)
q_type_vocab_size = len(q_type_tokenizer.word_index) + 1
# Convert tokens, NER, and SRL to sequences
def tokens_to_sequences(tokens, ner, srl):
"""Convert tokens, NER, and SRL tags to integer sequences"""
token_seqs = [tokenizer.texts_to_sequences([" ".join(t)])[0] for t in tokens]
ner_seqs = [ner_tokenizer.texts_to_sequences([" ".join(n)])[0] for n in ner]
srl_seqs = [srl_tokenizer.texts_to_sequences([" ".join(s)])[0] for s in srl]
return token_seqs, ner_seqs, srl_seqs
# Determine the maximum lengths for padding
context_seqs = tokenizer.texts_to_sequences(contexts)
question_seqs = tokenizer.texts_to_sequences(questions)
token_seqs, ner_seqs, srl_seqs = tokens_to_sequences(tokens_list, ner_list, srl_list)
max_context_len = max([len(seq) for seq in context_seqs])
max_question_len = max([len(seq) for seq in question_seqs])
max_token_len = max([len(seq) for seq in token_seqs])
# Pad sequences so every input has the same length
def pad_all_sequences(context_seqs, question_seqs, token_seqs, ner_seqs, srl_seqs):
"""Pad all sequences"""
context_padded = pad_sequences(context_seqs, maxlen=max_context_len, padding="post")
question_padded = pad_sequences(
question_seqs, maxlen=max_question_len, padding="post"
)
token_padded = pad_sequences(token_seqs, maxlen=max_token_len, padding="post")
ner_padded = pad_sequences(ner_seqs, maxlen=max_token_len, padding="post")
srl_padded = pad_sequences(srl_seqs, maxlen=max_token_len, padding="post")
return context_padded, question_padded, token_padded, ner_padded, srl_padded
# Prepare the answer encoder
answer_tokenizer = Tokenizer(oov_token="<OOV>")
answer_tokenizer.fit_on_texts(answers)
answer_vocab_size = len(answer_tokenizer.word_index) + 1
# Encode question types - FIX - use the index directly instead of a sequence
q_type_indices = []
for q_type in q_types:
# Look up the question-type index (word_index starts at 1; 0 marks unknown types)
q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
q_type_indices.append(q_type_idx)
# Convert to a numpy array
q_type_indices = np.array(q_type_indices)
# One-hot encode the question types
q_type_categorical = tf.keras.utils.to_categorical(
q_type_indices, num_classes=q_type_vocab_size
)
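# q_type_categorical has shape (num_examples, q_type_vocab_size)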
# Pad sequences
context_padded, question_padded, token_padded, ner_padded, srl_padded = (
pad_all_sequences(context_seqs, question_seqs, token_seqs, ner_seqs, srl_seqs)
)
# Encode the answers
answer_seqs = answer_tokenizer.texts_to_sequences(answers)
max_answer_len = max([len(seq) for seq in answer_seqs])
answer_padded = pad_sequences(answer_seqs, maxlen=max_answer_len, padding="post")
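# NOTE: answers are padded to max_answer_len, but only the first answer token is used as the label below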
# Split the data into train and test sets
indices = list(range(len(context_padded)))
train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)
# Helper to take a subset of the data by indices
def get_subset(data, indices):
return np.array([data[i] for i in indices])
# Train data
train_context = get_subset(context_padded, train_indices)
train_question = get_subset(question_padded, train_indices)
train_token = get_subset(token_padded, train_indices)
train_ner = get_subset(ner_padded, train_indices)
train_srl = get_subset(srl_padded, train_indices)
train_q_type = get_subset(q_type_categorical, train_indices)
train_answer = get_subset(answer_padded, train_indices)
# Test data
test_context = get_subset(context_padded, test_indices)
test_question = get_subset(question_padded, test_indices)
test_token = get_subset(token_padded, test_indices)
test_ner = get_subset(ner_padded, test_indices)
test_srl = get_subset(srl_padded, test_indices)
test_q_type = get_subset(q_type_categorical, test_indices)
test_answer = get_subset(answer_padded, test_indices)
# Hyperparameters
embedding_dim = 100
lstm_units = 128
ner_embedding_dim = 50
srl_embedding_dim = 50
dropout_rate = 0.3
# Function to build the model
def create_qa_model():
# Input layers
context_input = Input(shape=(max_context_len,), name="context_input")
question_input = Input(shape=(max_question_len,), name="question_input")
token_input = Input(shape=(max_token_len,), name="token_input")
ner_input = Input(shape=(max_token_len,), name="ner_input")
srl_input = Input(shape=(max_token_len,), name="srl_input")
q_type_input = Input(shape=(q_type_vocab_size,), name="q_type_input")
# Shared embedding layer for text
text_embedding = Embedding(vocab_size, embedding_dim, name="text_embedding")
# Embeddings for NER and SRL
ner_embedding = Embedding(ner_vocab_size, ner_embedding_dim, name="ner_embedding")(
ner_input
)
srl_embedding = Embedding(srl_vocab_size, srl_embedding_dim, name="srl_embedding")(
srl_input
)
# Apply embeddings
context_embed = text_embedding(context_input)
question_embed = text_embedding(question_input)
token_embed = text_embedding(token_input)
# Bidirectional LSTMs for context and token-level features
context_lstm = Bidirectional(
LSTM(lstm_units, return_sequences=True, name="context_lstm")
)(context_embed)
question_lstm = Bidirectional(
LSTM(lstm_units, return_sequences=True, name="question_lstm")
)(question_embed)
# Concat token features (tokens, NER, SRL)
token_features = Concatenate(name="token_features")(
[token_embed, ner_embedding, srl_embedding]
)
token_lstm = Bidirectional(
LSTM(lstm_units, return_sequences=True, name="token_lstm")
)(token_features)
# Attention over the context, conditioned on the question
context_attention = tf.keras.layers.Attention(name="context_attention")(
[context_lstm, question_lstm]
)
# Pool attention outputs
context_att_pool = tf.keras.layers.GlobalMaxPooling1D(name="context_att_pool")(
context_attention
)
question_pool = tf.keras.layers.GlobalMaxPooling1D(name="question_pool")(
question_lstm
)
token_pool = tf.keras.layers.GlobalMaxPooling1D(name="token_pool")(token_lstm)
# Concat all features
all_features = Concatenate(name="all_features")(
[context_att_pool, question_pool, token_pool, q_type_input]
)
# Dense layers
x = Dense(256, activation="relu", name="dense_1")(all_features)
x = Dropout(dropout_rate)(x)
x = Dense(128, activation="relu", name="dense_2")(x)
x = Dropout(dropout_rate)(x)
# Output layer for the answer
answer_output = Dense(
answer_vocab_size, activation="softmax", name="answer_output"
)(x)
# Create model
model = Model(
inputs=[
context_input,
question_input,
token_input,
ner_input,
srl_input,
q_type_input,
],
outputs=answer_output,
)
# Compile model
model.compile(
optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
return model
# Build the model
model = create_qa_model()
model.summary()
# Callback to save the best model
checkpoint = ModelCheckpoint(
"qa_lstm_model.h5", monitor="val_accuracy", save_best_only=True, verbose=1
)
early_stop = EarlyStopping(monitor="val_accuracy", patience=5, verbose=1)
# Training
batch_size = 8
epochs = 50
# Reshape the answers for sparse categorical crossentropy
train_answer_labels = train_answer[:, 0] # take the first answer token only, so the model predicts single-token answers
test_answer_labels = test_answer[:, 0]
# Train model
history = model.fit(
[train_context, train_question, train_token, train_ner, train_srl, train_q_type],
train_answer_labels,
batch_size=batch_size,
epochs=epochs,
validation_data=(
[test_context, test_question, test_token, test_ner, test_srl, test_q_type],
test_answer_labels,
),
callbacks=[checkpoint, early_stop],
)
# Save the model and tokenizers
model.save("qa_lstm_model_final.keras")  # .keras format, matching the inference script's load_model call
# Save the tokenizers
tokenizer_data = {
"word_tokenizer": tokenizer.to_json(),
"ner_tokenizer": ner_tokenizer.to_json(),
"srl_tokenizer": srl_tokenizer.to_json(),
"answer_tokenizer": answer_tokenizer.to_json(),
"q_type_tokenizer": q_type_tokenizer.to_json(),
"max_context_len": max_context_len,
"max_question_len": max_question_len,
"max_token_len": max_token_len,
}
with open("qa_tokenizers.json", "w") as f:
json.dump(tokenizer_data, f)
print("Model dan tokenizer berhasil disimpan!")

View File

@ -0,0 +1,161 @@
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import tokenizer_from_json
import re
import random
# Load tokenizers and model configurations
with open("qa_tokenizers.json", "r") as f:
tokenizer_data = json.load(f)
tokenizer = tokenizer_from_json(tokenizer_data["word_tokenizer"])
ner_tokenizer = tokenizer_from_json(tokenizer_data["ner_tokenizer"])
srl_tokenizer = tokenizer_from_json(tokenizer_data["srl_tokenizer"])
answer_tokenizer = tokenizer_from_json(tokenizer_data["answer_tokenizer"])
q_type_tokenizer = tokenizer_from_json(tokenizer_data["q_type_tokenizer"])
max_context_len = tokenizer_data["max_context_len"]
max_question_len = tokenizer_data["max_question_len"]
max_token_len = tokenizer_data["max_token_len"]
q_type_vocab_size = len(q_type_tokenizer.word_index) + 1
# Load trained model
model = load_model("qa_lstm_model_final.keras")
def preprocess_text(text):
text = text.lower()
text = re.sub(r"\s+", " ", text).strip()
return text
def predict_answer(context, question, tokens, ner, srl, q_type):
context_seq = tokenizer.texts_to_sequences([preprocess_text(context)])
question_seq = tokenizer.texts_to_sequences([preprocess_text(question)])
token_seq = [tokenizer.texts_to_sequences([" ".join(tokens)])[0]]
ner_seq = [ner_tokenizer.texts_to_sequences([" ".join(ner)])[0]]
srl_seq = [srl_tokenizer.texts_to_sequences([" ".join(srl)])[0]]
q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
q_type_cat = tf.keras.utils.to_categorical(
[q_type_idx], num_classes=q_type_vocab_size
)
# Pad sequences
context_pad = pad_sequences(context_seq, maxlen=max_context_len, padding="post")
question_pad = pad_sequences(question_seq, maxlen=max_question_len, padding="post")
token_pad = pad_sequences(token_seq, maxlen=max_token_len, padding="post")
ner_pad = pad_sequences(ner_seq, maxlen=max_token_len, padding="post")
srl_pad = pad_sequences(srl_seq, maxlen=max_token_len, padding="post")
# Predict
prediction = model.predict(
[context_pad, question_pad, token_pad, ner_pad, srl_pad, q_type_cat], verbose=0
)
answer_idx = np.argmax(prediction[0])
# Retrieve predicted answer word
for word, idx in answer_tokenizer.word_index.items():
if idx == answer_idx:
return word
return "Unknown"
def generate_question_answer(context, tokens, ner, srl, question_type="isian"):
entities = {}
predicate = ""
for i, token in enumerate(tokens):
if ner[i] != "O":
entities.setdefault(ner[i], []).append(token)
if srl[i] == "V":
predicate = token
elif srl[i].startswith("ARG"):
entities.setdefault(srl[i], []).append(token)
subject = " ".join(entities.get("ARG0", [""]))
if question_type == "isian":
if "LOC" in entities:
location = " ".join(entities["LOC"])
return f"Dimana {subject} {predicate} ___", location
elif "DATE" in entities:
date = " ".join(entities["DATE"])
return f"Kapan {subject} {predicate} ___", date
elif question_type == "true_false":
if "DATE" in entities:
original_date = " ".join(entities["DATE"])
try:
modified_year = str(int(entities["DATE"][-1]) + random.randint(1, 5))
modified_date = (
f"{entities['DATE'][0]} {entities['DATE'][1]} {modified_year}"
)
            except (ValueError, IndexError):
modified_date = original_date # Fallback if parsing fails
return f"{subject} {predicate} pada {modified_date} ___", "false"
elif question_type == "opsi":
if "LOC" in entities:
correct_location = " ".join(entities["LOC"])
distractors = ["singasari", "kuta", "banten", "kediri", "makassar"]
distractors = [d for d in distractors if d != correct_location]
options = random.sample(distractors, 3) + [correct_location]
random.shuffle(options)
return f"Dimana {subject} {predicate} ___", options, correct_location
return "Apa yang terjadi dalam teks ini ___", context
# ✅ Example Usage with Random Sampling
if __name__ == "__main__":
with open("../dataset/stable_qg_qa_train_dataset.json", "r") as f:
data = json.load(f)
# Randomly select an example for testing
test_item = random.choice(data)
test_qa = random.choice(test_item["qas"])
predicted_answer = predict_answer(
test_item["context"],
test_qa["question"],
test_item["tokens"],
test_item["ner"],
test_item["srl"],
test_qa["type"],
)
print(f"Context: {test_item['context']}")
print(f"Question: {test_qa['question']}")
print(f"True Answer: {test_qa['answer']}")
print(f"Predicted Answer: {predicted_answer}")
# Generate Random Question Example
example_context = test_item["context"]
example_tokens = test_item["tokens"]
example_ner = test_item["ner"]
example_srl = test_item["srl"]
random_question_type = random.choice(["isian", "true_false", "opsi"])
result = generate_question_answer(
example_context, example_tokens, example_ner, example_srl, random_question_type
)
print("\nGenerated Question Example:")
print(f"Context: {example_context}")
print(f"Question Type: {random_question_type}")
if random_question_type == "opsi":
question, options, correct_answer = result
print(f"Generated Question: {question}")
print(f"Options: {options}")
print(f"Correct Answer: {correct_answer}")
else:
question, answer = result
print(f"Generated Question: {question}")
print(f"Answer: {answer}")

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,54 @@
import json
import re
from collections import OrderedDict
def normalize_question(text):
text = re.sub(r'\s+([?.!,])', r'\1', text)
return text.capitalize()
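# For example, normalize_question("dimana kartini lahir ?") returns
# "Dimana kartini lahir?" (whitespace before '?' removed, first letter capitalized).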
# Load data
with open('../dataset/stable_qg_qa_train_dataset.json', 'r', encoding='utf-8') as file:
data = json.load(file)
processed_data = []
for idx_entry, entry in enumerate(data):
if not isinstance(entry, dict):
continue
if "context" not in entry:
entry["context"] = " ".join(entry.get("tokens", []))
    # Update NER tags: change 'V' to 'O'
ner_tags = entry.get("ner", [])
entry["ner"] = ["O" if tag == "V" else tag for tag in ner_tags]
for idx_qa, qa in enumerate(entry.get("qas", [])):
if "id" not in qa:
qa["id"] = f"qa_{idx_entry}_q{idx_qa + 1}"
answer = qa.get("answer")
if isinstance(answer, list):
qa["answer"] = " ".join(answer)
question = qa.get("question")
if isinstance(question, list):
question_str = " ".join(question)
qa["question"] = normalize_question(question_str)
    # Reorder fields: context first, then the rest
ordered_entry = OrderedDict()
if "context" in entry:
ordered_entry["context"] = entry.pop("context")
# Add remaining fields in their original order
for key, value in entry.items():
ordered_entry[key] = value
processed_data.append(ordered_entry)
# Save result
with open('data_converted.json', 'w', encoding='utf-8') as file:
json.dump(processed_data, file, indent=2, ensure_ascii=False)
# Optional: Print first 2 entries for quick verification
print(json.dumps(processed_data[:2], indent=2, ensure_ascii=False))

16768
question_generation/data.json Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,3 @@
BLEU Score: 0.0585
Validation Accuracy: 0.6740
Validation Loss: 1.8080

View File

View File

@ -0,0 +1,178 @@
[
{
"context": "raden ajeng kartini lahir pada 21 april 1879 di jepara",
"tokens": [
"raden",
"ajeng",
"kartini",
"lahir",
"pada",
"21",
"april",
"1879",
"di",
"jepara"
],
"ner": [
"PER",
"PER",
"PER",
"O",
"O",
"DATE",
"DATE",
"DATE",
"O",
"LOC"
],
"srl": [
"ARG0",
"ARG0",
"ARG0",
"V",
"O",
"ARGM-TMP",
"ARGM-TMP",
"ARGM-TMP",
"O",
"ARGM-LOC"
],
"qas": [
{
"type": "isian",
"question": "Dimana kartini lahir ___",
"answer": "jepara",
"id": "qa_0_q1"
},
{
"type": "true_false",
"question": "Kartini lahir pada tanggal 21 mei 1879 ___",
"options": [
"true",
"false"
],
"answer": "false",
"id": "qa_0_q2"
}
]
},
{
"context": "kerajaan majapahit berdiri pada tahun 1293 di trowulan",
"tokens": [
"kerajaan",
"majapahit",
"berdiri",
"pada",
"tahun",
"1293",
"di",
"trowulan"
],
"ner": [
"O",
"ORG",
"O",
"O",
"O",
"DATE",
"O",
"LOC"
],
"srl": [
"ARG1",
"ARG1",
"V",
"O",
"O",
"ARGM-TMP",
"O",
"ARGM-LOC"
],
"qas": [
{
"type": "opsi",
"question": "Dimana kerajaan majapahit berdiri ___",
"options": [
"trowulan",
"singasari",
"kuta",
"banten"
],
"answer": "trowulan",
"id": "qa_1_q1"
},
{
"type": "true_false",
"question": "Kerajaan majapahit berdiri pada tahun 1300 ___",
"options": [
"true",
"false"
],
"answer": "false",
"id": "qa_1_q2"
}
]
},
{
"context": "soekarno dan mohammad hatta memproklamasikan kemerdekaan indonesia pada 17 agustus 1945",
"tokens": [
"soekarno",
"dan",
"mohammad",
"hatta",
"memproklamasikan",
"kemerdekaan",
"indonesia",
"pada",
"17",
"agustus",
"1945"
],
"ner": [
"PER",
"O",
"PER",
"PER",
"O",
"O",
"LOC",
"O",
"DATE",
"DATE",
"DATE"
],
"srl": [
"ARG0",
"O",
"ARG0",
"ARG0",
"V",
"ARG1",
"ARGM-LOC",
"O",
"ARGM-TMP",
"ARGM-TMP",
"ARGM-TMP"
],
"qas": [
{
"type": "isian",
"question": "Pada tanggal berapa kemerdekaan indonesia diproklamasikan ___",
"answer": "17 agustus 1945",
"id": "qa_2_q1"
},
{
"type": "opsi",
"question": "Siapa yang memproklamasikan kemerdekaan indonesia ___",
"options": [
"soekarno",
"mohammad hatta",
"sudirman",
"ahmad yani"
],
"answer": "soekarno mohammad hatta",
"id": "qa_2_q2"
}
]
}
]

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,490 @@
import numpy as np
import pandas as pd
import json
import random
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
Input,
LSTM,
Dense,
Embedding,
Bidirectional,
Concatenate,
Attention,
Dropout,
)
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re
with open("data_converted.json", "r") as f:
data = json.load(f)
# Preprocessing function
def preprocess_text(text):
"""Melakukan preprocessing teks dasar"""
text = text.lower()
text = re.sub(r"\s+", " ", text).strip()
return text
# Prepare the data for the model
def prepare_data(data):
    """Prepare the data for the model"""
contexts = []
tokens_list = []
ner_list = []
srl_list = []
questions = []
answers = []
q_types = []
for item in data:
for qa in item["qas"]:
contexts.append(preprocess_text(item["context"]))
tokens_list.append(item["tokens"])
ner_list.append(item["ner"])
srl_list.append(item["srl"])
questions.append(preprocess_text(qa["question"]))
answers.append(qa["answer"])
q_types.append(qa["type"])
return contexts, tokens_list, ner_list, srl_list, questions, answers, q_types
contexts, tokens_list, ner_list, srl_list, questions, answers, q_types = prepare_data(
data
)
max_vocab_size = 10000
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(contexts + questions + [" ".join(item) for item in tokens_list])
vocab_size = len(tokenizer.word_index) + 1
# Encoding for NER
ner_tokenizer = Tokenizer(oov_token="<OOV>")
ner_tokenizer.fit_on_texts([" ".join(ner) for ner in ner_list])
ner_vocab_size = len(ner_tokenizer.word_index) + 1
# Encoding for SRL
srl_tokenizer = Tokenizer(oov_token="<OOV>")
srl_tokenizer.fit_on_texts([" ".join(srl) for srl in srl_list])
srl_vocab_size = len(srl_tokenizer.word_index) + 1
# Encoding for question types
q_type_tokenizer = Tokenizer()
q_type_tokenizer.fit_on_texts(q_types)
q_type_vocab_size = len(q_type_tokenizer.word_index) + 1
# Convert tokens, NER, and SRL to sequences
def tokens_to_sequences(tokens, ner, srl):
    """Convert tokens, NER, and SRL tags to integer sequences"""
token_seqs = [tokenizer.texts_to_sequences([" ".join(t)])[0] for t in tokens]
ner_seqs = [ner_tokenizer.texts_to_sequences([" ".join(n)])[0] for n in ner]
srl_seqs = [srl_tokenizer.texts_to_sequences([" ".join(s)])[0] for s in srl]
return token_seqs, ner_seqs, srl_seqs
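# Caveat (observation, not a fix): texts_to_sequences re-tokenizes the joined
# string with Keras' default filters, which strip punctuation and lowercase the
# text. If any token contains punctuation, the resulting sequence can come out
# shorter than the token list and drift out of alignment with the per-token
# NER/SRL sequences.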
# Determine the maximum lengths for padding
context_seqs = tokenizer.texts_to_sequences(contexts)
question_seqs = tokenizer.texts_to_sequences(questions)
token_seqs, ner_seqs, srl_seqs = tokens_to_sequences(tokens_list, ner_list, srl_list)
max_context_len = max([len(seq) for seq in context_seqs])
max_question_len = max([len(seq) for seq in question_seqs])
max_token_len = max([len(seq) for seq in token_seqs])
# Pad sequences so every input has the same length
def pad_all_sequences(context_seqs, question_seqs, token_seqs, ner_seqs, srl_seqs):
    """Pad all sequences"""
context_padded = pad_sequences(context_seqs, maxlen=max_context_len, padding="post")
question_padded = pad_sequences(
question_seqs, maxlen=max_question_len, padding="post"
)
token_padded = pad_sequences(token_seqs, maxlen=max_token_len, padding="post")
ner_padded = pad_sequences(ner_seqs, maxlen=max_token_len, padding="post")
srl_padded = pad_sequences(srl_seqs, maxlen=max_token_len, padding="post")
return context_padded, question_padded, token_padded, ner_padded, srl_padded
# Prepare the answer encoder
answer_tokenizer = Tokenizer(oov_token="<OOV>")
answer_tokenizer.fit_on_texts(answers)
answer_vocab_size = len(answer_tokenizer.word_index) + 1
# Encode question types - FIX: use the word index directly rather than a sequence
q_type_indices = []
for q_type in q_types:
    # Look up the question-type index (0 if the type is unseen)
q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
q_type_indices.append(q_type_idx)
# Convert to a NumPy array
q_type_indices = np.array(q_type_indices)
# One-hot encode the question types
q_type_categorical = tf.keras.utils.to_categorical(
q_type_indices, num_classes=q_type_vocab_size
)
# Pad sequences
context_padded, question_padded, token_padded, ner_padded, srl_padded = (
pad_all_sequences(context_seqs, question_seqs, token_seqs, ner_seqs, srl_seqs)
)
# Encode the answers
answer_seqs = answer_tokenizer.texts_to_sequences(answers)
max_answer_len = max([len(seq) for seq in answer_seqs])
answer_padded = pad_sequences(answer_seqs, maxlen=max_answer_len, padding="post")
# Split the data into train and test sets
indices = list(range(len(context_padded)))
train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)
# Helper to take a subset of the data by indices
def get_subset(data, indices):
return np.array([data[i] for i in indices])
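# Note (equivalent shortcut, assuming NumPy inputs): because the padded arrays
# are already np.ndarray, fancy indexing does the same in one step, e.g.
#   train_context = context_padded[train_indices]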
# Train data
train_context = get_subset(context_padded, train_indices)
train_question = get_subset(question_padded, train_indices)
train_token = get_subset(token_padded, train_indices)
train_ner = get_subset(ner_padded, train_indices)
train_srl = get_subset(srl_padded, train_indices)
train_q_type = get_subset(q_type_categorical, train_indices)
train_answer = get_subset(answer_padded, train_indices)
# Test data
test_context = get_subset(context_padded, test_indices)
test_question = get_subset(question_padded, test_indices)
test_token = get_subset(token_padded, test_indices)
test_ner = get_subset(ner_padded, test_indices)
test_srl = get_subset(srl_padded, test_indices)
test_q_type = get_subset(q_type_categorical, test_indices)
test_answer = get_subset(answer_padded, test_indices)
# Hyperparameters
embedding_dim = 100
lstm_units = 128
ner_embedding_dim = 50
srl_embedding_dim = 50
dropout_rate = 0.3
# Function to build a model with two outputs: question and answer
def create_qa_generator_model():
# Input layers
context_input = Input(shape=(max_context_len,), name="context_input")
token_input = Input(shape=(max_token_len,), name="token_input")
ner_input = Input(shape=(max_token_len,), name="ner_input")
srl_input = Input(shape=(max_token_len,), name="srl_input")
    # question_input and q_type_input are not needed for generation,
    # since they are what the model will produce
# Shared embedding layer for text
text_embedding = Embedding(vocab_size, embedding_dim, name="text_embedding")
    # Embeddings for NER and SRL
ner_embedding = Embedding(ner_vocab_size, ner_embedding_dim, name="ner_embedding")(
ner_input
)
srl_embedding = Embedding(srl_vocab_size, srl_embedding_dim, name="srl_embedding")(
srl_input
)
# Apply embeddings
context_embed = text_embedding(context_input)
token_embed = text_embedding(token_input)
    # Bidirectional LSTMs for the context and token-level features
context_lstm = Bidirectional(
LSTM(lstm_units, return_sequences=True, name="context_lstm")
)(context_embed)
# Concat token features (tokens, NER, SRL)
token_features = Concatenate(name="token_features")(
[token_embed, ner_embedding, srl_embedding]
)
token_lstm = Bidirectional(
LSTM(lstm_units, return_sequences=True, name="token_lstm")
)(token_features)
# Pool outputs
context_pool = tf.keras.layers.GlobalMaxPooling1D(name="context_pool")(context_lstm)
token_pool = tf.keras.layers.GlobalMaxPooling1D(name="token_pool")(token_lstm)
# Concat all features
all_features = Concatenate(name="all_features")([context_pool, token_pool])
# Shared layers
shared = Dense(256, activation="relu", name="shared_dense_1")(all_features)
shared = Dropout(dropout_rate)(shared)
shared = Dense(128, activation="relu", name="shared_dense_2")(shared)
shared = Dropout(dropout_rate)(shared)
    # Question branch
question_branch = Dense(256, activation="relu", name="question_dense")(shared)
question_branch = Dropout(dropout_rate)(question_branch)
    # Answer branch
answer_branch = Dense(256, activation="relu", name="answer_dense")(shared)
answer_branch = Dropout(dropout_rate)(answer_branch)
# Output layers
    # For the question, an LSTM-based decoder generates the sequence of words
    # that forms the question
question_decoder = LSTM(lstm_units, return_sequences=True, name="question_decoder")(
tf.keras.layers.RepeatVector(max_question_len)(question_branch)
)
question_output = Dense(vocab_size, activation="softmax", name="question_output")(
question_decoder
)
    # Output layer for the answer
answer_output = Dense(
answer_vocab_size, activation="softmax", name="answer_output"
)(answer_branch)
# Create model
model = Model(
inputs=[
context_input,
token_input,
ner_input,
srl_input,
],
outputs=[question_output, answer_output],
)
    # Compile the model with a loss function and metrics for both outputs
model.compile(
optimizer="adam",
loss={
"question_output": "categorical_crossentropy",
"answer_output": "sparse_categorical_crossentropy",
},
metrics={"question_output": "accuracy", "answer_output": "accuracy"},
loss_weights={"question_output": 1.0, "answer_output": 1.0},
)
return model
# Prepare the question targets (one-hot encoded)
# For the question we need the categorical format because every word
# in the sequence is predicted simultaneously
def prepare_question_target(question_padded):
question_target = []
for question in question_padded:
        # One-hot encode every token in the sequence
sequence_target = []
for token in question:
            # Build the one-hot vector for this token
token_target = tf.keras.utils.to_categorical(token, num_classes=vocab_size)
sequence_target.append(token_target)
question_target.append(sequence_target)
return np.array(question_target)
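# Memory note (suggested alternative, not what this script does): the one-hot
# targets have shape (num_samples, max_question_len, vocab_size), which grows
# quickly with vocabulary size. Keeping the integer sequences as targets and
# switching question_output to sparse_categorical_crossentropy is equivalent
# and far smaller:
#   model.compile(optimizer="adam",
#                 loss={"question_output": "sparse_categorical_crossentropy",
#                       "answer_output": "sparse_categorical_crossentropy"}, ...)
#   model.fit(..., {"question_output": train_question,
#                   "answer_output": train_answer_labels}, ...)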
# Prepare targets for the question output
train_question_target = prepare_question_target(train_question)
test_question_target = prepare_question_target(test_question)
# Convert the answer format for sparse categorical crossentropy
train_answer_labels = train_answer[:, 0]  # take the first token index of each answer
test_answer_labels = test_answer[:, 0]
# Build the model
model = create_qa_generator_model()
model.summary()
# Callback to save the best model
checkpoint = ModelCheckpoint(
"qa_generator_model.h5",
monitor="val_question_output_accuracy",
save_best_only=True,
verbose=1,
mode="max",
)
early_stop = EarlyStopping(
monitor="val_question_output_accuracy", patience=5, verbose=1, mode="max"
)
# Training
batch_size = 8
epochs = 50
# Train model
history = model.fit(
[train_context, train_token, train_ner, train_srl],
{"question_output": train_question_target, "answer_output": train_answer_labels},
batch_size=batch_size,
epochs=epochs,
validation_data=(
[test_context, test_token, test_ner, test_srl],
{"question_output": test_question_target, "answer_output": test_answer_labels},
),
callbacks=[checkpoint, early_stop],
)
model.save("qa_generator_model_final.keras")
# Save the tokenizers
tokenizer_data = {
"word_tokenizer": tokenizer.to_json(),
"ner_tokenizer": ner_tokenizer.to_json(),
"srl_tokenizer": srl_tokenizer.to_json(),
"answer_tokenizer": answer_tokenizer.to_json(),
"q_type_tokenizer": q_type_tokenizer.to_json(),
"max_context_len": max_context_len,
"max_question_len": max_question_len,
"max_token_len": max_token_len,
}
with open("qa_generator_tokenizers.json", "w") as f:
json.dump(tokenizer_data, f)
# Prediction function
def predict_question_and_answer(model, context, tokens, ner, srl):
    """
    Predict a question and an answer from the context, tokens, NER, and SRL tags
    """
# Preprocess input
context_seq = tokenizer.texts_to_sequences([preprocess_text(context)])
context_padded = pad_sequences(context_seq, maxlen=max_context_len, padding="post")
token_seq = tokenizer.texts_to_sequences([" ".join(tokens)])
token_padded = pad_sequences(token_seq, maxlen=max_token_len, padding="post")
ner_seq = ner_tokenizer.texts_to_sequences([" ".join(ner)])
ner_padded = pad_sequences(ner_seq, maxlen=max_token_len, padding="post")
srl_seq = srl_tokenizer.texts_to_sequences([" ".join(srl)])
srl_padded = pad_sequences(srl_seq, maxlen=max_token_len, padding="post")
    # Predict
question_pred, answer_pred = model.predict(
[context_padded, token_padded, ner_padded, srl_padded]
)
    # Decode the question (take the highest-probability index at each position)
question_indices = np.argmax(question_pred[0], axis=1)
question_words = []
    # Reverse the word index to map indices back to words
word_index = tokenizer.word_index
index_word = {v: k for k, v in word_index.items()}
    # Decode the question
for idx in question_indices:
if idx != 0: # Skip padding (index 0)
word = index_word.get(idx, "<UNK>")
question_words.append(word)
else:
break # Stop at padding
    # Decode the answer
answer_idx = np.argmax(answer_pred[0])
    # Reverse word index for the answers
answer_word_index = answer_tokenizer.word_index
answer_index_word = {v: k for k, v in answer_word_index.items()}
answer = answer_index_word.get(answer_idx, "<UNK>")
    # Assemble the question
question = " ".join(question_words)
return question, answer
# Example usage
# Note: this is only an illustration; actual data is needed in a real run
"""
sample_context = "Selamat pagi, sekarang adalah hari Senin."
sample_tokens = ["selamat", "pagi", "sekarang", "adalah", "hari", "senin"]
sample_ner = ["O", "O", "O", "O", "O", "B-TIME"]
sample_srl = ["B-V", "B-ARG1", "B-ARGM-TMP", "B-ARGM-PRD", "I-ARGM-PRD", "I-ARGM-PRD"]
# Load the trained model
loaded_model = load_model("qa_generator_model_final.keras")
# Predict
question, answer = predict_question_and_answer(
loaded_model, sample_context, sample_tokens, sample_ner, sample_srl
)
print("Konteks:", sample_context)
print("Pertanyaan yang dihasilkan:", question)
print("Jawaban yang dihasilkan:", answer)
"""
sample = {
"context": "kerajaan majapahit berdiri pada tahun 1293 di trowulan",
"tokens": [
"kerajaan",
"majapahit",
"berdiri",
"pada",
"tahun",
"1293",
"di",
"trowulan",
],
"ner": ["O", "ORG", "O", "O", "O", "DATE", "O", "LOC"],
"srl": ["ARG1", "ARG1", "V", "O", "O", "ARGM-TMP", "O", "ARGM-LOC"],
}
question, answer = predict_question_and_answer(
model, sample["context"], sample["tokens"], sample["ner"], sample["srl"]
)
print("Konteks:", sample["context"])
print("Pertanyaan yang dihasilkan:", question)
print("Jawaban yang dihasilkan:", answer)
# Plot the training history
# plt.figure(figsize=(12, 8))
# # Plot loss
# plt.subplot(2, 2, 1)
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('Model Loss')
# plt.ylabel('Loss')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='upper right')
# # Plot question output accuracy
# plt.subplot(2, 2, 2)
# plt.plot(history.history['question_output_accuracy'])
# plt.plot(history.history['val_question_output_accuracy'])
# plt.title('Question Output Accuracy')
# plt.ylabel('Accuracy')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='lower right')
# # Plot answer output accuracy
# plt.subplot(2, 2, 3)
# plt.plot(history.history['answer_output_accuracy'])
# plt.plot(history.history['val_answer_output_accuracy'])
# plt.title('Answer Output Accuracy')
# plt.ylabel('Accuracy')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='lower right')
# plt.tight_layout()
# plt.savefig("training_history.png")
# plt.show()

File diff suppressed because one or more lines are too long

308
question_generation/qg.py Normal file
View File

@ -0,0 +1,308 @@
#!/usr/bin/env python3
# ===============================================================
# Question Generation seq2seq (tokens + NER + SRL → Q/A/type)
# revised version 20250511
# ===============================================================
import json, pickle, random
from pathlib import Path
from itertools import chain
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import (
Input, Embedding, LSTM, Concatenate,
Dense, TimeDistributed
)
from tensorflow.keras.models import Model
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from rouge_score import rouge_scorer, scoring
# -----------------------------------------------------------------
# 0. LOAD & FLATTEN DATA
# -----------------------------------------------------------------
RAW = json.loads(Path("../dataset/dev_dataset_qg.json").read_text())
samples = []
for item in RAW:
for qp in item["quiz_posibility"]:
samples.append({
"tokens" : item["tokens"],
"ner" : item["ner"],
"srl" : item["srl"],
"q_type" : qp["type"], # isian / opsi / benar_salah
"q_toks" : qp["question"] + ["<eos>"],
"a_toks" : (qp["answer"] if isinstance(qp["answer"], list)
else [qp["answer"]]) + ["<eos>"]
})
print("flattened samples :", len(samples))
# -----------------------------------------------------------------
# 1. VOCABULARIES
# -----------------------------------------------------------------
def build_vocab(seq_iter, reserved=("<pad>", "<unk>", "<sos>", "<eos>")):
vocab = {tok: idx for idx, tok in enumerate(reserved)}
for tok in chain.from_iterable(seq_iter):
vocab.setdefault(tok, len(vocab))
return vocab
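# For example, build_vocab([["a", "b"], ["b", "c"]]) yields
# {"<pad>": 0, "<unk>": 1, "<sos>": 2, "<eos>": 3, "a": 4, "b": 5, "c": 6};
# reserved tokens always occupy the first indices.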
vocab_tok = build_vocab((s["tokens"] for s in samples))
vocab_ner = build_vocab((s["ner"] for s in samples), reserved=("<pad>","<unk>"))
vocab_srl = build_vocab((s["srl"] for s in samples), reserved=("<pad>","<unk>"))
vocab_q = build_vocab((s["q_toks"] for s in samples))
vocab_a = build_vocab((s["a_toks"] for s in samples))
vocab_typ = {"isian":0, "opsi":1, "benar_salah":2}
# -----------------------------------------------------------------
# 2. ENCODING & PADDING
# -----------------------------------------------------------------
def enc(seq, v): return [v.get(t, v["<unk>"]) for t in seq]
MAX_SENT = max(len(s["tokens"]) for s in samples)
MAX_Q = max(len(s["q_toks"]) for s in samples)
MAX_A = max(len(s["a_toks"]) for s in samples)
def pad_batch(seqs, vmap, maxlen):
return tf.keras.preprocessing.sequence.pad_sequences(
[enc(s, vmap) for s in seqs], maxlen=maxlen, padding="post"
)
X_tok = pad_batch((s["tokens"] for s in samples), vocab_tok, MAX_SENT)
X_ner = pad_batch((s["ner"] for s in samples), vocab_ner, MAX_SENT)
X_srl = pad_batch((s["srl"] for s in samples), vocab_srl, MAX_SENT)
dec_q_in = pad_batch(
([["<sos>"]+s["q_toks"][:-1] for s in samples]), vocab_q, MAX_Q)
dec_q_out = pad_batch((s["q_toks"] for s in samples), vocab_q, MAX_Q)
dec_a_in = pad_batch(
([["<sos>"]+s["a_toks"][:-1] for s in samples]), vocab_a, MAX_A)
dec_a_out = pad_batch((s["a_toks"] for s in samples), vocab_a, MAX_A)
y_type = np.array([vocab_typ[s["q_type"]] for s in samples])
# -----------------------------------------------------------------
# 3. MODEL
# -----------------------------------------------------------------
d_tok, d_tag, units = 128, 32, 256
pad_tok, pad_q, pad_a = vocab_tok["<pad>"], vocab_q["<pad>"], vocab_a["<pad>"]
# ---- Encoder ----------------------------------------------------
inp_tok = Input((MAX_SENT,), name="tok_in")
inp_ner = Input((MAX_SENT,), name="ner_in")
inp_srl = Input((MAX_SENT,), name="srl_in")
emb_tok = Embedding(len(vocab_tok), d_tok, mask_zero=True, name="emb_tok")(inp_tok)
emb_ner = Embedding(len(vocab_ner), d_tag, mask_zero=True, name="emb_ner")(inp_ner)
emb_srl = Embedding(len(vocab_srl), d_tag, mask_zero=True, name="emb_srl")(inp_srl)
enc_concat = Concatenate()([emb_tok, emb_ner, emb_srl])
enc_out, state_h, state_c = LSTM(units, return_state=True, name="enc_lstm")(enc_concat)
# ---- Decoder : Question ----------------------------------------
dec_q_inp = Input((MAX_Q,), name="dec_q_in")
dec_emb_q = Embedding(len(vocab_q), d_tok, mask_zero=True, name="emb_q")(dec_q_inp)
dec_q_seq, _, _ = LSTM(units, return_sequences=True, return_state=True,
name="lstm_q")(dec_emb_q, initial_state=[state_h, state_c])
q_out = TimeDistributed(Dense(len(vocab_q), activation="softmax"), name="q_out")(dec_q_seq)
# ---- Decoder : Answer ------------------------------------------
dec_a_inp = Input((MAX_A,), name="dec_a_in")
dec_emb_a = Embedding(len(vocab_a), d_tok, mask_zero=True, name="emb_a")(dec_a_inp)
dec_a_seq, _, _ = LSTM(units, return_sequences=True, return_state=True,
name="lstm_a")(dec_emb_a, initial_state=[state_h, state_c])
a_out = TimeDistributed(Dense(len(vocab_a), activation="softmax"), name="a_out")(dec_a_seq)
# ---- Classifier -------------------------------------------------
type_out = Dense(len(vocab_typ), activation="softmax", name="type_out")(enc_out)
model = Model(
[inp_tok, inp_ner, inp_srl, dec_q_inp, dec_a_inp],
[q_out, a_out, type_out]
)
# ---- Masked loss helpers ---------------------------------------
scce = tf.keras.losses.SparseCategoricalCrossentropy(reduction="none")
def masked_loss_factory(pad_id):
def loss(y_true, y_pred):
l = scce(y_true, y_pred)
mask = tf.cast(tf.not_equal(y_true, pad_id), tf.float32)
return tf.reduce_sum(l*mask) / tf.reduce_sum(mask)
return loss
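# Quick sanity check (illustrative, safe to delete): with pad id 0 only the
# non-pad positions contribute, so a uniform prediction over 8 classes gives
# -log(1/8) ≈ 2.0794 no matter how much padding follows.
_y_true = tf.constant([[3, 5, 0, 0]])
_y_pred = tf.fill([1, 4, 8], 1.0 / 8.0)
print("masked-loss demo:", float(masked_loss_factory(0)(_y_true, _y_pred)))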
model.compile(
optimizer="adam",
loss = {"q_out":masked_loss_factory(pad_q),
"a_out":masked_loss_factory(pad_a),
"type_out":"sparse_categorical_crossentropy"},
loss_weights={"q_out":1.0, "a_out":1.0, "type_out":0.3},
metrics={"q_out":"sparse_categorical_accuracy",
"a_out":"sparse_categorical_accuracy",
"type_out":tf.keras.metrics.SparseCategoricalAccuracy(name="type_acc")}
)
model.summary()
# -----------------------------------------------------------------
# 4. TRAIN
# -----------------------------------------------------------------
history = model.fit(
[X_tok, X_ner, X_srl, dec_q_in, dec_a_in],
[dec_q_out, dec_a_out, y_type],
validation_split=0.1,
epochs=30,
batch_size=64,
callbacks=[tf.keras.callbacks.EarlyStopping(patience=4, restore_best_weights=True)],
verbose=2
)
model.save("full_seq2seq.keras")
# -----------------------------------------------------------------
# 5. SAVE VOCABS (.pkl keeps python dict intact)
# -----------------------------------------------------------------
def save_vocab(v, name):
    with open(name, "wb") as f:
        pickle.dump(v, f)
save_vocab(vocab_tok,"vocab_tok.pkl"); save_vocab(vocab_ner,"vocab_ner.pkl")
save_vocab(vocab_srl,"vocab_srl.pkl"); save_vocab(vocab_q, "vocab_q.pkl")
save_vocab(vocab_a, "vocab_a.pkl"); save_vocab(vocab_typ,"vocab_typ.pkl")
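# Loading counterpart (sketch):
#   with open("vocab_tok.pkl", "rb") as f:
#       vocab_tok = pickle.load(f)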
# -----------------------------------------------------------------
# 6. INFERENCE MODELS (encoder & decoders)
# -----------------------------------------------------------------
def build_inference_models(trained):
# encoder
t_in = Input((MAX_SENT,), name="t_in")
n_in = Input((MAX_SENT,), name="n_in")
s_in = Input((MAX_SENT,), name="s_in")
e_t = trained.get_layer("emb_tok")(t_in)
e_n = trained.get_layer("emb_ner")(n_in)
e_s = trained.get_layer("emb_srl")(s_in)
concat = Concatenate()([e_t,e_n,e_s])
    enc_out_inf, h, c = trained.get_layer("enc_lstm")(concat)
enc_model = Model([t_in,n_in,s_in],[h,c])
    # question decoder
dq_in = Input((1,), name="dq_tok")
dh = Input((units,), name="dh"); dc = Input((units,), name="dc")
dq_emb = trained.get_layer("emb_q")(dq_in)
dq_lstm, nh, nc = trained.get_layer("lstm_q")(dq_emb, initial_state=[dh,dc])
dq_out = trained.get_layer("q_out").layer(dq_lstm)
dec_q_model = Model([dq_in, dh, dc], [dq_out, nh, nc])
    # answer decoder
da_in = Input((1,), name="da_tok")
ah = Input((units,), name="ah"); ac = Input((units,), name="ac")
da_emb = trained.get_layer("emb_a")(da_in)
da_lstm, nh2, nc2 = trained.get_layer("lstm_a")(da_emb, initial_state=[ah,ac])
da_out = trained.get_layer("a_out").layer(da_lstm)
dec_a_model = Model([da_in, ah, ac], [da_out, nh2, nc2])
    # type classifier (applied to the encoder output computed above)
    type_dense = trained.get_layer("type_out")
    type_model = Model([t_in, n_in, s_in], type_dense(enc_out_inf))
return enc_model, dec_q_model, dec_a_model, type_model
encoder_model, decoder_q, decoder_a, classifier_model = build_inference_models(model)
inv_q = {v:k for k,v in vocab_q.items()}
inv_a = {v:k for k,v in vocab_a.items()}
def enc_pad(seq, vmap, maxlen):
    x = [vmap.get(t, vmap["<unk>"]) for t in seq][:maxlen]  # truncate overly long inputs
    return x + [vmap["<pad>"]] * (maxlen - len(x))
def greedy_decode(tokens, ner, srl, max_q=20, max_a=10):
et = np.array([enc_pad(tokens, vocab_tok, MAX_SENT)])
en = np.array([enc_pad(ner, vocab_ner, MAX_SENT)])
es = np.array([enc_pad(srl, vocab_srl, MAX_SENT)])
h,c = encoder_model.predict([et,en,es], verbose=0)
# --- question
q_ids = []
tgt = np.array([[vocab_q["<sos>"]]])
for _ in range(max_q):
logits,h,c = decoder_q.predict([tgt,h,c], verbose=0)
nxt = int(logits[0,-1].argmax())
if nxt==vocab_q["<eos>"]: break
q_ids.append(nxt)
tgt = np.array([[nxt]])
# --- answer (reuse fresh h,c)
h,c = encoder_model.predict([et,en,es], verbose=0)
a_ids = []
tgt = np.array([[vocab_a["<sos>"]]])
for _ in range(max_a):
logits,h,c = decoder_a.predict([tgt,h,c], verbose=0)
nxt = int(logits[0,-1].argmax())
if nxt==vocab_a["<eos>"]: break
a_ids.append(nxt)
tgt = np.array([[nxt]])
# --- type
t_id = int(classifier_model.predict([et,en,es], verbose=0).argmax())
return [inv_q[i] for i in q_ids], [inv_a[i] for i in a_ids], \
[k for k,v in vocab_typ.items() if v==t_id][0]
# -----------------------------------------------------------------
# 7. QUICK DEMO
# -----------------------------------------------------------------
test_tokens = ["soekarno","membacakan","teks","proklamasi","pada",
"17","agustus","1945"]
test_ner = ["B-PER","O","O","O","O","B-DATE","I-DATE","I-DATE"]
test_srl = ["ARG0","V","ARG1","ARG1","O","ARGM-TMP","ARGM-TMP","ARGM-TMP"]
q,a,t = greedy_decode(test_tokens,test_ner,test_srl,max_q=MAX_Q,max_a=MAX_A)
print("\nDEMO\n----")
print("Q :", " ".join(q))
print("A :", " ".join(a))
print("T :", t)
# -----------------------------------------------------------------
# 8. EVALUATION (corpus-level BLEU + ROUGE-1/L)
# -----------------------------------------------------------------
smooth = SmoothingFunction().method4
r_scorer = rouge_scorer.RougeScorer(["rouge1","rougeL"], use_stemmer=True)
def strip_special(seq, pad_id, eos_id):
return [x for x in seq if x not in (pad_id, eos_id)]
def ids_to_text(ids, inv):
return " ".join(inv[i] for i in ids)
def evaluate(n=200):
idxs = random.sample(range(len(samples)), n)
refs, hyps = [], []
agg = scoring.BootstrapAggregator()
for i in idxs:
gt_ids = strip_special(dec_q_out[i], pad_q, vocab_q["<eos>"])
ref = ids_to_text(gt_ids, inv_q)
pred = " ".join(greedy_decode(
samples[i]["tokens"],
samples[i]["ner"],
samples[i]["srl"]
)[0])
refs.append([ref.split()])
hyps.append(pred.split())
agg.add_scores(r_scorer.score(ref, pred))
bleu = corpus_bleu(refs, hyps, smoothing_function=smooth)
r1 = agg.aggregate()["rouge1"].mid
rL = agg.aggregate()["rougeL"].mid
print(f"\nEVAL (n={n})")
print(f"BLEU4 : {bleu:.4f}")
print(f"ROUGE1 : P={r1.precision:.3f} R={r1.recall:.3f} F1={r1.fmeasure:.3f}")
print(f"ROUGEL : P={rL.precision:.3f} R={rL.recall:.3f} F1={rL.fmeasure:.3f}")
evaluate(2)  # quick smoke test on 2 samples; raise n (e.g. 150) for a fuller evaluation

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,357 @@
import numpy as np
import pandas as pd
import json
import random
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
Input,
LSTM,
Dense,
Embedding,
Bidirectional,
Concatenate,
Attention,
Dropout,
)
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re
import string
from collections import Counter
# Load data
with open("data_converted.json", "r") as f:
data = json.load(f)
# Preprocessing function
def preprocess_text(text):
"""Melakukan preprocessing teks dasar"""
text = text.lower()
text = re.sub(r"\s+", " ", text).strip()
return text
# Prepare data for the question prediction model
def prepare_question_prediction_data(data):
    """Prepare the data for the question prediction model"""
contexts = []
tokens_list = []
ner_list = []
srl_list = []
questions = []
q_types = []
for item in data:
for qa in item["qas"]:
contexts.append(preprocess_text(item["context"]))
tokens_list.append(item["tokens"])
ner_list.append(item["ner"])
srl_list.append(item["srl"])
questions.append(preprocess_text(qa["question"]))
q_types.append(qa["type"])
return contexts, tokens_list, ner_list, srl_list, questions, q_types
# Prepare the data
contexts, tokens_list, ner_list, srl_list, questions, q_types = (
prepare_question_prediction_data(data)
)
# Tokenizer for text (context, question, answer)
max_vocab_size = 10000
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
all_texts = contexts + questions + [" ".join(item) for item in tokens_list]
tokenizer.fit_on_texts(all_texts)
vocab_size = len(tokenizer.word_index) + 1
# Encoding for NER
ner_tokenizer = Tokenizer(oov_token="<OOV>")
ner_tokenizer.fit_on_texts([" ".join(ner) for ner in ner_list])
ner_vocab_size = len(ner_tokenizer.word_index) + 1
# Encoding for SRL
srl_tokenizer = Tokenizer(oov_token="<OOV>")
srl_tokenizer.fit_on_texts([" ".join(srl) for srl in srl_list])
srl_vocab_size = len(srl_tokenizer.word_index) + 1
# Encoding for question types
q_type_tokenizer = Tokenizer()
q_type_tokenizer.fit_on_texts(q_types)
q_type_vocab_size = len(q_type_tokenizer.word_index) + 1
# Convert tokens, NER, and SRL to sequences
def tokens_to_sequences(tokens, ner, srl):
    """Convert tokens, NER, and SRL tags to integer sequences"""
token_seqs = [tokenizer.texts_to_sequences([" ".join(t)])[0] for t in tokens]
ner_seqs = [ner_tokenizer.texts_to_sequences([" ".join(n)])[0] for n in ner]
srl_seqs = [srl_tokenizer.texts_to_sequences([" ".join(s)])[0] for s in srl]
return token_seqs, ner_seqs, srl_seqs
# Sequences
context_seqs = tokenizer.texts_to_sequences(contexts)
question_seqs = tokenizer.texts_to_sequences(questions)
token_seqs, ner_seqs, srl_seqs = tokens_to_sequences(tokens_list, ner_list, srl_list)
# Determine the maximum lengths for padding
max_context_len = max([len(seq) for seq in context_seqs])
max_question_len = max([len(seq) for seq in question_seqs])
max_token_len = max([len(seq) for seq in token_seqs])
# Pad sequences so every input has the same length
def pad_all_sequences(context_seqs, token_seqs, ner_seqs, srl_seqs, question_seqs):
    """Pad all sequences"""
context_padded = pad_sequences(context_seqs, maxlen=max_context_len, padding="post")
token_padded = pad_sequences(token_seqs, maxlen=max_token_len, padding="post")
ner_padded = pad_sequences(ner_seqs, maxlen=max_token_len, padding="post")
srl_padded = pad_sequences(srl_seqs, maxlen=max_token_len, padding="post")
question_padded = pad_sequences(
question_seqs, maxlen=max_question_len, padding="post"
)
return (
context_padded,
token_padded,
ner_padded,
srl_padded,
question_padded,
)
# Encode the question types
q_type_indices = []
for q_type in q_types:
q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
q_type_indices.append(q_type_idx)
# Convert to a NumPy array
q_type_indices = np.array(q_type_indices)
# One-hot encode the question types
q_type_categorical = tf.keras.utils.to_categorical(
q_type_indices, num_classes=q_type_vocab_size
)
# Pad sequences
context_padded, token_padded, ner_padded, srl_padded, question_padded = (
pad_all_sequences(context_seqs, token_seqs, ner_seqs, srl_seqs, question_seqs)
)
# Split the data into train and test sets
indices = list(range(len(context_padded)))
train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)
# Helper to take a subset of the data by indices
def get_subset(data, indices):
return np.array([data[i] for i in indices])
# Train data
train_context = get_subset(context_padded, train_indices)
train_token = get_subset(token_padded, train_indices)
train_ner = get_subset(ner_padded, train_indices)
train_srl = get_subset(srl_padded, train_indices)
train_q_type = get_subset(q_type_categorical, train_indices)
train_question = get_subset(question_padded, train_indices)
# Test data
test_context = get_subset(context_padded, test_indices)
test_token = get_subset(token_padded, test_indices)
test_ner = get_subset(ner_padded, test_indices)
test_srl = get_subset(srl_padded, test_indices)
test_q_type = get_subset(q_type_categorical, test_indices)
test_question = get_subset(question_padded, test_indices)
# Hyperparameters
embedding_dim = 100
lstm_units = 128
ner_embedding_dim = 50
srl_embedding_dim = 50
dropout_rate = 0.3
# Function to build the question prediction model
def create_question_prediction_model():
# Input layers
context_input = Input(shape=(max_context_len,), name="context_input")
token_input = Input(shape=(max_token_len,), name="token_input")
ner_input = Input(shape=(max_token_len,), name="ner_input")
srl_input = Input(shape=(max_token_len,), name="srl_input")
q_type_input = Input(shape=(q_type_vocab_size,), name="q_type_input")
# Shared embedding layer for text
text_embedding = Embedding(vocab_size, embedding_dim, name="text_embedding")
    # Embeddings for NER and SRL
ner_embedding = Embedding(ner_vocab_size, ner_embedding_dim, name="ner_embedding")(
ner_input
)
srl_embedding = Embedding(srl_vocab_size, srl_embedding_dim, name="srl_embedding")(
srl_input
)
# Apply embeddings
context_embed = text_embedding(context_input)
token_embed = text_embedding(token_input)
    # Bidirectional LSTMs for the context and token-level features
context_lstm = Bidirectional(
LSTM(lstm_units, return_sequences=True, name="context_lstm")
)(context_embed)
# Concat token features (tokens, NER, SRL)
token_features = Concatenate(name="token_features")(
[token_embed, ner_embedding, srl_embedding]
)
token_lstm = Bidirectional(
LSTM(lstm_units, return_sequences=True, name="token_lstm")
)(token_features)
    # Self-attention over the context (Attention expects a [query, value] list)
    context_attention = tf.keras.layers.Attention(name="context_attention")(
        [context_lstm, context_lstm]
    )
# Pool attention outputs
context_att_pool = tf.keras.layers.GlobalMaxPooling1D(name="context_att_pool")(
context_attention
)
token_pool = tf.keras.layers.GlobalMaxPooling1D(name="token_pool")(token_lstm)
# Concat all features
all_features = Concatenate(name="all_features")(
[context_att_pool, token_pool, q_type_input]
)
# Dense layers with expanded capacity for sequence generation
x = Dense(512, activation="relu", name="dense_1")(all_features)
x = Dropout(dropout_rate)(x)
x = Dense(256, activation="relu", name="dense_2")(x)
x = Dropout(dropout_rate)(x)
    # Output projection (Dense) for the sequence decoder
decoder_dense = Dense(vocab_size, activation="softmax", name="decoder_dense")
# Many-to-many architecture for sequence generation
# Decoder LSTM
decoder_lstm = LSTM(lstm_units * 2, return_sequences=True, name="decoder_lstm")
    # Project the pooled features to the decoder dimensionality
    decoder_input = Dense(lstm_units * 2, activation="relu", name="decoder_input")(x)
    # Build the decoder input sequence: the feature vector is simply repeated
    # for every timestep (no teacher forcing is actually applied here)
repeated_vector = tf.keras.layers.RepeatVector(max_question_len)(decoder_input)
# Process through decoder LSTM
decoder_outputs = decoder_lstm(repeated_vector)
# Apply dense layer to each timestep
question_output_seq = tf.keras.layers.TimeDistributed(decoder_dense)(
decoder_outputs
)
# Create model
model = Model(
inputs=[
context_input,
token_input,
ner_input,
srl_input,
q_type_input,
],
outputs=question_output_seq,
)
    # Compile the model with sparse categorical crossentropy for sequence prediction
model.compile(
optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
return model
# Build the model
model = create_question_prediction_model()
model.summary()
# Callback to save the best model
checkpoint = ModelCheckpoint(
"question_prediction_model.h5",
monitor="val_accuracy",
save_best_only=True,
verbose=1,
)
early_stop = EarlyStopping(monitor="val_accuracy", patience=10, verbose=1)
# Reshaping question data for sequence-to-sequence training
# We need to reshape to (samples, max_question_len, 1) for sparse categorical crossentropy
train_question_target = np.expand_dims(train_question, -1)
test_question_target = np.expand_dims(test_question, -1)
# Training parameters
batch_size = 8
epochs = 50
# Train model
history = model.fit(
[train_context, train_token, train_ner, train_srl, train_q_type],
train_question_target,
batch_size=batch_size,
epochs=epochs,
validation_data=(
[test_context, test_token, test_ner, test_srl, test_q_type],
test_question_target,
),
callbacks=[checkpoint, early_stop],
)
# # Plot training history
# plt.figure(figsize=(12, 4))
# plt.subplot(1, 2, 1)
# plt.plot(history.history['accuracy'])
# plt.plot(history.history['val_accuracy'])
# plt.title('Model Accuracy')
# plt.ylabel('Accuracy')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='upper left')
# plt.subplot(1, 2, 2)
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('Model Loss')
# plt.ylabel('Loss')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='upper left')
# plt.tight_layout()
# plt.savefig('question_prediction_training_history.png')
# plt.show()
# Save the model and tokenizers
model.save("question_prediction_model_final.h5")
# Save the tokenizers
tokenizer_data = {
"word_tokenizer": tokenizer.to_json(),
"ner_tokenizer": ner_tokenizer.to_json(),
"srl_tokenizer": srl_tokenizer.to_json(),
"q_type_tokenizer": q_type_tokenizer.to_json(),
"max_context_len": max_context_len,
"max_question_len": max_question_len,
"max_token_len": max_token_len,
}
with open("question_prediction_tokenizers.json", "w") as f:
json.dump(tokenizer_data, f)
print("Model dan tokenizer untuk prediksi pertanyaan berhasil disimpan!")

View File

@ -0,0 +1,473 @@
import numpy as np
import json
import random
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
Input,
LSTM,
Dense,
Embedding,
Bidirectional,
Concatenate,
Dropout,
)
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
# Load data
with open("data_converted.json", "r") as f:
data = json.load(f)
# Preprocessing function
def preprocess_text(text):
"""Melakukan preprocessing teks dasar"""
text = text.lower()
text = re.sub(r"\s+", " ", text).strip()
return text
# Prepare data for the question prediction model
def prepare_question_prediction_data(data):
    """Prepare the data for the question prediction model"""
contexts = []
tokens_list = []
ner_list = []
srl_list = []
questions = []
q_types = []
for item in data:
for qa in item["qas"]:
contexts.append(preprocess_text(item["context"]))
tokens_list.append(item["tokens"])
ner_list.append(item["ner"])
srl_list.append(item["srl"])
questions.append(preprocess_text(qa["question"]))
q_types.append(qa["type"])
            # The answer is deliberately not taken as an input
return contexts, tokens_list, ner_list, srl_list, questions, q_types
# Prepare the data
contexts, tokens_list, ner_list, srl_list, questions, q_types = (
prepare_question_prediction_data(data)
)
# Tokenizer for text (context, question)
max_vocab_size = 10000
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
all_texts = contexts + questions + [" ".join(item) for item in tokens_list]
tokenizer.fit_on_texts(all_texts)
vocab_size = len(tokenizer.word_index) + 1
# Encoding for NER
ner_tokenizer = Tokenizer(oov_token="<OOV>")
ner_tokenizer.fit_on_texts([" ".join(ner) for ner in ner_list])
ner_vocab_size = len(ner_tokenizer.word_index) + 1
# Encoding for SRL
srl_tokenizer = Tokenizer(oov_token="<OOV>")
srl_tokenizer.fit_on_texts([" ".join(srl) for srl in srl_list])
srl_vocab_size = len(srl_tokenizer.word_index) + 1
# Encoding for question types
q_type_tokenizer = Tokenizer()
q_type_tokenizer.fit_on_texts(q_types)
q_type_vocab_size = len(q_type_tokenizer.word_index) + 1
# Convert tokens, NER, and SRL to sequences
def tokens_to_sequences(tokens, ner, srl):
    """Convert tokens, NER, and SRL tags to integer sequences"""
token_seqs = [tokenizer.texts_to_sequences([" ".join(t)])[0] for t in tokens]
ner_seqs = [ner_tokenizer.texts_to_sequences([" ".join(n)])[0] for n in ner]
srl_seqs = [srl_tokenizer.texts_to_sequences([" ".join(s)])[0] for s in srl]
return token_seqs, ner_seqs, srl_seqs
# Sequences
context_seqs = tokenizer.texts_to_sequences(contexts)
question_seqs = tokenizer.texts_to_sequences(questions)
token_seqs, ner_seqs, srl_seqs = tokens_to_sequences(tokens_list, ner_list, srl_list)
# Determine the maximum lengths for padding
max_context_len = max([len(seq) for seq in context_seqs])
max_question_len = max([len(seq) for seq in question_seqs])
max_token_len = max([len(seq) for seq in token_seqs])
# Pad sequences so every input has the same length
def pad_all_sequences(context_seqs, token_seqs, ner_seqs, srl_seqs, question_seqs):
    """Pad all sequences"""
context_padded = pad_sequences(context_seqs, maxlen=max_context_len, padding="post")
token_padded = pad_sequences(token_seqs, maxlen=max_token_len, padding="post")
ner_padded = pad_sequences(ner_seqs, maxlen=max_token_len, padding="post")
srl_padded = pad_sequences(srl_seqs, maxlen=max_token_len, padding="post")
question_padded = pad_sequences(
question_seqs, maxlen=max_question_len, padding="post"
)
return (
context_padded,
token_padded,
ner_padded,
srl_padded,
question_padded,
)
# Encode the question types
q_type_indices = []
for q_type in q_types:
q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
q_type_indices.append(q_type_idx)
# Convert to a NumPy array
q_type_indices = np.array(q_type_indices)
# One-hot encode the question types
q_type_categorical = tf.keras.utils.to_categorical(
q_type_indices, num_classes=q_type_vocab_size
)
# Pad sequences
context_padded, token_padded, ner_padded, srl_padded, question_padded = (
pad_all_sequences(context_seqs, token_seqs, ner_seqs, srl_seqs, question_seqs)
)
# Split the data into train and test sets
indices = list(range(len(context_padded)))
train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)
# Helper to take a subset of the data by indices
def get_subset(data, indices):
return np.array([data[i] for i in indices])
# Train data
train_context = get_subset(context_padded, train_indices)
train_token = get_subset(token_padded, train_indices)
train_ner = get_subset(ner_padded, train_indices)
train_srl = get_subset(srl_padded, train_indices)
train_q_type = get_subset(q_type_categorical, train_indices)
train_question = get_subset(question_padded, train_indices)
# Test data
test_context = get_subset(context_padded, test_indices)
test_token = get_subset(token_padded, test_indices)
test_ner = get_subset(ner_padded, test_indices)
test_srl = get_subset(srl_padded, test_indices)
test_q_type = get_subset(q_type_categorical, test_indices)
test_question = get_subset(question_padded, test_indices)
# Hyperparameters
embedding_dim = 100
lstm_units = 128
ner_embedding_dim = 50
srl_embedding_dim = 50
dropout_rate = 0.3
# Function to build the question prediction model
def create_question_prediction_model():
# Input layers
context_input = Input(shape=(max_context_len,), name="context_input")
token_input = Input(shape=(max_token_len,), name="token_input")
ner_input = Input(shape=(max_token_len,), name="ner_input")
srl_input = Input(shape=(max_token_len,), name="srl_input")
q_type_input = Input(shape=(q_type_vocab_size,), name="q_type_input")
# Shared embedding layer for text
text_embedding = Embedding(vocab_size, embedding_dim, name="text_embedding")
    # Embeddings for NER and SRL
ner_embedding = Embedding(ner_vocab_size, ner_embedding_dim, name="ner_embedding")(
ner_input
)
srl_embedding = Embedding(srl_vocab_size, srl_embedding_dim, name="srl_embedding")(
srl_input
)
# Apply embeddings
context_embed = text_embedding(context_input)
token_embed = text_embedding(token_input)
    # Bidirectional LSTMs for the context and token-level features
context_lstm = Bidirectional(
LSTM(lstm_units, return_sequences=True, name="context_lstm")
)(context_embed)
# Concat token features (tokens, NER, SRL)
token_features = Concatenate(name="token_features")(
[token_embed, ner_embedding, srl_embedding]
)
token_lstm = Bidirectional(
LSTM(lstm_units, return_sequences=True, name="token_lstm")
)(token_features)
# Apply attention to context LSTM
context_attention = tf.keras.layers.Attention(name="context_attention")(
[context_lstm, context_lstm]
)
# Pool attention outputs
context_att_pool = tf.keras.layers.GlobalMaxPooling1D(name="context_att_pool")(
context_attention
)
token_pool = tf.keras.layers.GlobalMaxPooling1D(name="token_pool")(token_lstm)
    # Concat all features (no answer feature)
all_features = Concatenate(name="all_features")(
[context_att_pool, token_pool, q_type_input]
)
# Dense layers with expanded capacity for sequence generation
x = Dense(512, activation="relu", name="dense_1")(all_features)
x = Dropout(dropout_rate)(x)
x = Dense(256, activation="relu", name="dense_2")(x)
x = Dropout(dropout_rate)(x)
    # Output projection (Dense) for the sequence decoder
decoder_dense = Dense(vocab_size, activation="softmax", name="decoder_dense")
# Many-to-many architecture for sequence generation
# Decoder LSTM
decoder_lstm = LSTM(lstm_units * 2, return_sequences=True, name="decoder_lstm")
    # Project the pooled features to the decoder dimensionality
decoder_input = Dense(lstm_units * 2, activation="relu", name="decoder_input")(x)
    # Build the decoder input sequence: the feature vector is simply repeated
    # for every timestep (no teacher forcing is actually applied here)
repeated_vector = tf.keras.layers.RepeatVector(max_question_len)(decoder_input)
# Process through decoder LSTM
decoder_outputs = decoder_lstm(repeated_vector)
# Apply dense layer to each timestep
question_output_seq = tf.keras.layers.TimeDistributed(decoder_dense)(
decoder_outputs
)
# Create model
model = Model(
inputs=[
context_input,
token_input,
ner_input,
srl_input,
q_type_input,
],
outputs=question_output_seq,
)
    # Compile the model with sparse categorical crossentropy for sequence prediction
model.compile(
optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
return model
# Build the model
model = create_question_prediction_model()
model.summary()
# Callback to save the best model
checkpoint = ModelCheckpoint(
"question_prediction_model.h5",
monitor="val_accuracy",
save_best_only=True,
verbose=1,
)
early_stop = EarlyStopping(monitor="val_accuracy", patience=10, verbose=1)
# Reshaping question data for sequence-to-sequence training
# We need to reshape to (samples, max_question_len, 1) for sparse categorical crossentropy
train_question_target = np.expand_dims(train_question, -1)
test_question_target = np.expand_dims(test_question, -1)
# Training parameters
batch_size = 8
epochs = 50
# Train model
history = model.fit(
[train_context, train_token, train_ner, train_srl, train_q_type],
train_question_target,
batch_size=batch_size,
epochs=epochs,
validation_data=(
[test_context, test_token, test_ner, test_srl, test_q_type],
test_question_target,
),
callbacks=[checkpoint, early_stop],
)
# Plot training history
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history["accuracy"])
plt.plot(history.history["val_accuracy"])
plt.title("Model Accuracy")
plt.ylabel("Accuracy")
plt.xlabel("Epoch")
plt.legend(["Train", "Validation"], loc="upper left")
plt.subplot(1, 2, 2)
plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.title("Model Loss")
plt.ylabel("Loss")
plt.xlabel("Epoch")
plt.legend(["Train", "Validation"], loc="upper left")
plt.tight_layout()
plt.savefig("question_prediction_training_history.png")
plt.show()
# Save the model
model.save("question_prediction_model_final.h5")
# Save the tokenizers
tokenizer_data = {
"word_tokenizer": tokenizer.to_json(),
"ner_tokenizer": ner_tokenizer.to_json(),
"srl_tokenizer": srl_tokenizer.to_json(),
"q_type_tokenizer": q_type_tokenizer.to_json(),
"max_context_len": max_context_len,
"max_question_len": max_question_len,
"max_token_len": max_token_len,
}
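# Each tokenizer is serialised with to_json(), so it can be rebuilt later with
# tensorflow.keras.preprocessing.text.tokenizer_from_json (as the inference
# scripts below do).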
with open("question_prediction_tokenizers.json", "w") as f:
json.dump(tokenizer_data, f)
print("Model dan tokenizer untuk prediksi pertanyaan berhasil disimpan!")
# Function to predict a question
def predict_question(context, tokens, ner, srl, q_type):
context = preprocess_text(context)
context_seq = tokenizer.texts_to_sequences([context])[0]
token_seq = tokenizer.texts_to_sequences([" ".join(tokens)])[0]
ner_seq = ner_tokenizer.texts_to_sequences([" ".join(ner)])[0]
srl_seq = srl_tokenizer.texts_to_sequences([" ".join(srl)])[0]
context_padded = pad_sequences(
[context_seq], maxlen=max_context_len, padding="post"
)
token_padded = pad_sequences([token_seq], maxlen=max_token_len, padding="post")
ner_padded = pad_sequences([ner_seq], maxlen=max_token_len, padding="post")
srl_padded = pad_sequences([srl_seq], maxlen=max_token_len, padding="post")
# Q-type one-hot encoding
q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
q_type_one_hot = tf.keras.utils.to_categorical(
[q_type_idx], num_classes=q_type_vocab_size
)
# Predict
pred = model.predict(
[context_padded, token_padded, ner_padded, srl_padded, q_type_one_hot],
verbose=1,
)
# Convert prediction to words
pred_seq = np.argmax(pred[0], axis=1)
# Convert indices to words
reverse_word_map = {v: k for k, v in tokenizer.word_index.items()}
pred_words = [reverse_word_map.get(i, "") for i in pred_seq if i != 0]
return " ".join(pred_words)
def evaluate_model_performance(test_data):
# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
# Lists to store scores
bleu_scores = []
rouge1_scores = []
rouge2_scores = []
rougel_scores = []
# Iterate through test data
for i in range(len(test_data)):
# Get test sample
sample_context = contexts[test_data[i]]
sample_tokens = tokens_list[test_data[i]]
sample_ner = ner_list[test_data[i]]
sample_srl = srl_list[test_data[i]]
sample_q_type = q_types[test_data[i]]
actual_question = questions[test_data[i]]
# Predict question
pred_question = predict_question(
sample_context, sample_tokens, sample_ner, sample_srl, sample_q_type
)
# Tokenize for BLEU score
actual_tokens = actual_question.split()
pred_tokens = pred_question.split()
# Calculate BLEU score
# Using unigram, bigram, trigram, and 4-gram
print("kaliamt aktual", actual_tokens)
print("kaliamt prediksi", pred_tokens)
bleu_score = sentence_bleu([actual_tokens], pred_tokens)
bleu_scores.append(bleu_score)
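# Note (assumption, not in the original): with the default 4-gram weights,
# sentence_bleu often collapses to ~0 for very short hypotheses; nltk's
# SmoothingFunction (from nltk.translate.bleu_score) can stabilise it, e.g.
# sentence_bleu([actual_tokens], pred_tokens,
#               smoothing_function=SmoothingFunction().method1)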
try:
rouge_scores = scorer.score(actual_question, pred_question)
# Extract F1 scores
rouge1_scores.append(rouge_scores["rouge1"].fmeasure)
rouge2_scores.append(rouge_scores["rouge2"].fmeasure)
rougel_scores.append(rouge_scores["rougeL"].fmeasure)
except Exception as e:
print(f"Error calculating ROUGE score: {e}")
# Calculate average scores
results = {
"avg_bleu_score": np.mean(bleu_scores),
"avg_rouge1": np.mean(rouge1_scores),
"avg_rouge2": np.mean(rouge2_scores),
"avg_rougel": np.mean(rougel_scores),
}
return results
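# Reload the saved artifacts as a sanity check; note that evaluate_model_performance
# above still calls the in-memory `model`, not `loaded_model`.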
loaded_model = load_model("question_prediction_model_final.h5")
with open("question_prediction_tokenizers.json", "r") as f:
tokenizer_data = json.load(f)
# Take a random sample from the test data (illustrative; the evaluation below runs over all test indices)
sample_idx = random.randint(0, len(test_indices) - 1)
sample_context = contexts[test_indices[sample_idx]]
sample_tokens = tokens_list[test_indices[sample_idx]]
sample_ner = ner_list[test_indices[sample_idx]]
sample_srl = srl_list[test_indices[sample_idx]]
sample_q_type = q_types[test_indices[sample_idx]]
performance_metrics = evaluate_model_performance(test_indices)
print("\nModel Performance Metrics:")
print(f"Average BLEU Score: {performance_metrics['avg_bleu_score']:.4f}")
print(f"Average ROUGE-1 Score: {performance_metrics['avg_rouge1']:.4f}")
print(f"Average ROUGE-2 Score: {performance_metrics['avg_rouge2']:.4f}")
print(f"Average ROUGE-L Score: {performance_metrics['avg_rougel']:.4f}")


@ -0,0 +1,210 @@
import numpy as np
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import re
class QuestionPredictionModel:
def __init__(self, model_path, tokenizer_path):
"""
Initialize question prediction model with pre-trained model and tokenizers
"""
# Load model
self.model = load_model(model_path)
# Load tokenizers
with open(tokenizer_path, 'r') as f:
tokenizer_data = json.load(f)
# Reconstruct tokenizers
self.word_tokenizer = tokenizer_from_json(tokenizer_data['word_tokenizer'])
self.ner_tokenizer = tokenizer_from_json(tokenizer_data['ner_tokenizer'])
self.srl_tokenizer = tokenizer_from_json(tokenizer_data['srl_tokenizer'])
self.q_type_tokenizer = tokenizer_from_json(tokenizer_data['q_type_tokenizer'])
# Get max lengths
self.max_context_len = tokenizer_data['max_context_len']
self.max_answer_len = tokenizer_data['max_answer_len']
self.max_question_len = tokenizer_data['max_question_len']
self.max_token_len = tokenizer_data['max_token_len']
# Get vocabulary sizes
self.vocab_size = len(self.word_tokenizer.word_index) + 1
self.q_type_vocab_size = len(self.q_type_tokenizer.word_index) + 1
def preprocess_text(self, text):
"""Basic text preprocessing"""
text = text.lower()
text = re.sub(r"\s+", " ", text).strip()
return text
def predict_question(self, context, answer, tokens, ner, srl, q_type):
"""
Predict a question based on given context, answer, tokens, NER, SRL, and question type
Args:
context (str): The context text
answer (str): The answer to generate a question for
tokens (list): List of tokens
ner (list): List of NER tags corresponding to tokens
srl (list): List of SRL tags corresponding to tokens
q_type (str): Question type ('isian', 'opsi', or 'true_false')
Returns:
str: The predicted question
"""
# Preprocess inputs
context = self.preprocess_text(context)
answer = self.preprocess_text(answer)
# Convert to sequences
context_seq = self.word_tokenizer.texts_to_sequences([context])[0]
answer_seq = self.word_tokenizer.texts_to_sequences([answer])[0]
tokens_seq = self.word_tokenizer.texts_to_sequences([" ".join(tokens)])[0]
ner_seq = self.ner_tokenizer.texts_to_sequences([" ".join(ner)])[0]
srl_seq = self.srl_tokenizer.texts_to_sequences([" ".join(srl)])[0]
# Pad sequences
context_padded = pad_sequences([context_seq], maxlen=self.max_context_len, padding="post")
answer_padded = pad_sequences([answer_seq], maxlen=self.max_answer_len, padding="post")
tokens_padded = pad_sequences([tokens_seq], maxlen=self.max_token_len, padding="post")
ner_padded = pad_sequences([ner_seq], maxlen=self.max_token_len, padding="post")
srl_padded = pad_sequences([srl_seq], maxlen=self.max_token_len, padding="post")
# One-hot encode question type
q_type_idx = self.q_type_tokenizer.word_index.get(q_type, 0)
q_type_categorical = tf.keras.utils.to_categorical(
[q_type_idx], num_classes=self.q_type_vocab_size
)
# Make prediction
predicted_seq = self.model.predict(
[context_padded, answer_padded, tokens_padded, ner_padded, srl_padded, q_type_categorical]
)
# Convert predictions to tokens (taking the highest probability token at each position)
predicted_indices = np.argmax(predicted_seq[0], axis=1)
# Create reversed word index for converting indices back to words
reverse_word_index = {v: k for k, v in self.word_tokenizer.word_index.items()}
# Convert indices to words
predicted_words = []
for idx in predicted_indices:
if idx != 0: # Skip padding tokens
predicted_words.append(reverse_word_index.get(idx, ''))
# Form the question
predicted_question = ' '.join(predicted_words)
# Add "___" to the end based on question type convention
if "___" not in predicted_question:
predicted_question += " ___"
return predicted_question
def batch_predict_questions(self, data):
"""
Predict questions for a batch of data
Args:
data (list): List of dictionaries with context, tokens, ner, srl, and answers
Returns:
list: List of predicted questions
"""
results = []
for item in data:
context = item["context"]
tokens = item["tokens"]
ner = item["ner"]
srl = item["srl"]
# If there are Q&A pairs, use them for evaluation
if "qas" in item:
for qa in item["qas"]:
answer = qa["answer"]
q_type = qa["type"]
ground_truth = qa["question"]
predicted_question = self.predict_question(
context, answer, tokens, ner, srl, q_type
)
results.append({
"context": context,
"answer": answer,
"predicted_question": predicted_question,
"ground_truth": ground_truth,
"question_type": q_type
})
else:
# If no Q&A pairs, generate questions for all question types
for q_type in ["isian", "true_false", "opsi"]:
# For demo purposes, use a placeholder answer (would need actual answers in real use)
# In practice, you might extract potential answers from the context
placeholders = {
"isian": "placeholder",
"true_false": "true",
"opsi": "placeholder"
}
predicted_question = self.predict_question(
context, placeholders[q_type], tokens, ner, srl, q_type
)
results.append({
"context": context,
"predicted_question": predicted_question,
"question_type": q_type
})
return results
# Example usage
if __name__ == "__main__":
# Load test data
with open("data_converted.json", "r") as f:
test_data = json.load(f)
# Initialize model
question_predictor = QuestionPredictionModel(
model_path="question_prediction_model_final.h5",
tokenizer_path="question_prediction_tokenizers.json"
)
# Example single prediction
sample = test_data[0]
context = sample["context"]
tokens = sample["tokens"]
ner = sample["ner"]
srl = sample["srl"]
answer = sample["qas"][0]["answer"]
q_type = sample["qas"][0]["type"]
predicted_question = question_predictor.predict_question(
context, answer, tokens, ner, srl, q_type
)
print(f"Context: {context}")
print(f"Answer: {answer}")
print(f"Question Type: {q_type}")
print(f"Predicted Question: {predicted_question}")
print(f"Ground Truth: {sample['qas'][0]['question']}")
# Batch prediction
results = question_predictor.batch_predict_questions(test_data[:3])
print("\nBatch Results:")
for i, result in enumerate(results):
print(f"\nResult {i+1}:")
print(f"Context: {result['context']}")
print(f"Answer: {result.get('answer', 'N/A')}")
print(f"Question Type: {result['question_type']}")
print(f"Predicted Question: {result['predicted_question']}")
if 'ground_truth' in result:
print(f"Ground Truth: {result['ground_truth']}")


@ -0,0 +1,188 @@
import numpy as np
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import re
class QuestionPredictionModel:
def __init__(self, model_path, tokenizer_path):
"""
Initialize question prediction model with pre-trained model and tokenizers
"""
# Load model
self.model = load_model(model_path)
# Load tokenizers
with open(tokenizer_path, "r") as f:
tokenizer_data = json.load(f)
# Reconstruct tokenizers
self.word_tokenizer = tokenizer_from_json(tokenizer_data["word_tokenizer"])
self.ner_tokenizer = tokenizer_from_json(tokenizer_data["ner_tokenizer"])
self.srl_tokenizer = tokenizer_from_json(tokenizer_data["srl_tokenizer"])
self.q_type_tokenizer = tokenizer_from_json(tokenizer_data["q_type_tokenizer"])
# Get max lengths
self.max_context_len = tokenizer_data["max_context_len"]
self.max_question_len = tokenizer_data["max_question_len"]
self.max_token_len = tokenizer_data["max_token_len"]
# Get vocabulary sizes
self.vocab_size = len(self.word_tokenizer.word_index) + 1
self.q_type_vocab_size = len(self.q_type_tokenizer.word_index) + 1
def preprocess_text(self, text):
"""Basic text preprocessing"""
text = text.lower()
text = re.sub(r"\s+", " ", text).strip()
return text
def predict_question(self, context, tokens, ner, srl, q_type):
"""Prediksi pertanyaan berdasarkan konteks dan fitur lainnya"""
# Preprocess
context = self.preprocess_text(context)
# Convert to sequences
context_seq = self.word_tokenizer.texts_to_sequences([context])[0]
token_seq = self.word_tokenizer.texts_to_sequences([" ".join(tokens)])[0]
ner_seq = self.ner_tokenizer.texts_to_sequences([" ".join(ner)])[0]
srl_seq = self.srl_tokenizer.texts_to_sequences([" ".join(srl)])[0]
# Pad sequences
context_padded = pad_sequences(
[context_seq], maxlen=self.max_context_len, padding="post"
)
token_padded = pad_sequences(
[token_seq], maxlen=self.max_token_len, padding="post"
)
ner_padded = pad_sequences([ner_seq], maxlen=self.max_token_len, padding="post")
srl_padded = pad_sequences([srl_seq], maxlen=self.max_token_len, padding="post")
# Q-type one-hot encoding
q_type_idx = self.q_type_tokenizer.word_index.get(q_type, 0)
q_type_one_hot = tf.keras.utils.to_categorical(
[q_type_idx], num_classes=self.q_type_vocab_size
)
# Predict
pred = self.model.predict(
[context_padded, token_padded, ner_padded, srl_padded, q_type_one_hot]
)
# Convert prediction to words
pred_seq = np.argmax(pred[0], axis=1)
# Convert indices to words
reverse_word_map = {v: k for k, v in self.word_tokenizer.word_index.items()}
pred_words = [reverse_word_map.get(i, "") for i in pred_seq if i != 0]
return " ".join(pred_words)
def batch_predict_questions(self, data):
"""
Predict questions for a batch of data
Args:
data (list): List of dictionaries with context, tokens, ner, srl, and answers
Returns:
list: List of predicted questions
"""
results = []
for item in data:
context = item["context"]
tokens = item["tokens"]
ner = item["ner"]
srl = item["srl"]
# If there are Q&A pairs, use them for evaluation
if "qas" in item:
for qa in item["qas"]:
q_type = qa["type"]
ground_truth = qa["question"]
predicted_question = self.predict_question(
context, tokens, ner, srl, q_type
)
results.append(
{
"context": context,
"predicted_question": predicted_question,
"ground_truth": ground_truth,
"question_type": q_type,
}
)
else:
# If no Q&A pairs, generate questions for all question types
for q_type in ["isian", "true_false", "opsi"]:
# This variant's predict_question takes no answer argument, so no
# placeholder answer is needed (unlike the answer-aware model above)
predicted_question = self.predict_question(
context, tokens, ner, srl, q_type
)
results.append(
{
"context": context,
"predicted_question": predicted_question,
"question_type": q_type,
}
)
return results
# Example usage
if __name__ == "__main__":
# Load test data
with open("../dataset/conteks_question.json", "r") as f:
test_data = json.load(f)
# Initialize model
question_predictor = QuestionPredictionModel(
model_path="question_prediction_model_final.h5",
tokenizer_path="question_prediction_tokenizers.json",
)
# Example single prediction
sample = test_data[1]
context = sample["context"]
tokens = sample["tokens"]
ner = sample["ner"]
srl = sample["srl"]
answer = sample["qas"][0]["answer"]
q_type = sample["qas"][0]["type"]
predicted_question = question_predictor.predict_question(
context, tokens, ner, srl, q_type
)
print(f"Context: {context}")
print(f"Answer: {answer}")
print(f"Question Type: {q_type}")
print(f"Predicted Question: {predicted_question}")
print(f"Ground Truth: {sample['qas'][0]['question']}")
# Batch prediction
# results = question_predictor.batch_predict_questions(test_data[:3])
# print("\nBatch Results:")
# for i, result in enumerate(results):
# print(f"\nResult {i+1}:")
# print(f"Context: {result['context']}")
# print(f"Answer: {result.get('answer', 'N/A')}")
# print(f"Question Type: {result['question_type']}")
# print(f"Predicted Question: {result['predicted_question']}")
# if "ground_truth" in result:
# print(f"Ground Truth: {result['ground_truth']}")

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown.  After: 53 KiB

Binary file not shown.  After: 51 KiB

Binary file not shown.  After: 88 KiB

question_generation/uji.py Normal file (389 lines)

@ -0,0 +1,389 @@
# ===============================================================
# Seq2Seq LSTM + Luong Attention for a Question/Answer Generator
# + Greedy & Beam Search decoding + BLEU-4 evaluation
# ===============================================================
# • Encoder embeddings use mask_zero=True (padding is masked);
#   the decoder embeddings use mask_zero=False (see the note at q_emb)
# • Encoder = Bidirectional LSTM (return_sequences=True)
# • Decoder = LSTM + Luong Attention (keras.layers.Attention)
# • Greedy & beam-search inference submodels are built separately
#   (encoder, decoder Q-step, decoder A-step)
# • BLEU score (nltk corpus_bleu) to evaluate questions & answers
# ---------------------------------------------------------------
# USAGE
# 1. pip install nltk
# 2. python seq2seq_qa_attention.py   # train + save the model
# 3. call evaluate_bleu()             # compute BLEU on validation/test
# ===============================================================
import json
from pathlib import Path
from itertools import chain
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (
Input, Embedding, LSTM, Bidirectional, Concatenate,
Dense, TimeDistributed, Attention
)
from tensorflow.keras.models import Model
from nltk.translate.bleu_score import corpus_bleu # pip install nltk
# ----------------------- 1. Load & flatten data ----------------------------
RAW = json.loads(Path("../dataset/dev_dataset_test.json").read_text())
samples = []
for item in RAW:
for qp in item["quiz_posibility"]:
samp = {
"tokens": [t.lower() for t in item["tokens"]],
"ner": item["ner"],
"srl": item["srl"],
"q_type": qp["type"],
"q_toks": [t.lower() for t in qp["question"]] + ["<eos>"],
}
if isinstance(qp["answer"], list):
samp["a_toks"] = [t.lower() for t in qp["answer"]] + ["<eos>"]
else:
samp["a_toks"] = [qp["answer"].lower(), "<eos>"]
samples.append(samp)
print("Total flattened samples:", len(samples))
# ----------------------- 2. Build vocabularies -----------------------------
def build_vocab(seq_iter, reserved=("<pad>", "<unk>", "<sos>", "<eos>")):
vocab = {tok: idx for idx, tok in enumerate(reserved)}
for tok in chain.from_iterable(seq_iter):
if tok not in vocab:
vocab[tok] = len(vocab)
return vocab
v_tok = build_vocab((s["tokens"] for s in samples))
v_ner = build_vocab((s["ner"] for s in samples), reserved=("<pad>", "<unk>"))
v_srl = build_vocab((s["srl"] for s in samples), reserved=("<pad>", "<unk>"))
v_q = build_vocab((s["q_toks"] for s in samples))
v_a = build_vocab((s["a_toks"] for s in samples))
v_typ = {"isian": 0, "opsi": 1, "true_false": 2}
iv_q = {i: t for t, i in v_q.items()}
iv_a = {i: t for t, i in v_a.items()}
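# e.g. build_vocab([["siapa", "nama", "siapa"]]) ->
# {"<pad>": 0, "<unk>": 1, "<sos>": 2, "<eos>": 3, "siapa": 4, "nama": 5}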
# ----------------------- 3. Vectorise + pad -------------------------------
def encode(seq, vmap):
return [vmap.get(tok, vmap["<unk>"]) for tok in seq]
MAX_SENT = max(len(s["tokens"]) for s in samples)
MAX_Q = max(len(s["q_toks"]) for s in samples)
MAX_A = max(len(s["a_toks"]) for s in samples)
X_tok_ids = pad_sequences([encode(s["tokens"], v_tok) for s in samples],
maxlen=MAX_SENT, padding="post")
X_ner_ids = pad_sequences([encode(s["ner"], v_ner) for s in samples],
maxlen=MAX_SENT, padding="post")
X_srl_ids = pad_sequences([encode(s["srl"], v_srl) for s in samples],
maxlen=MAX_SENT, padding="post")
q_in_ids = pad_sequences([[v_q["<sos>"], *encode(s["q_toks"][:-1], v_q)]
for s in samples], maxlen=MAX_Q, padding="post")
q_out_ids = pad_sequences([encode(s["q_toks"], v_q) for s in samples],
maxlen=MAX_Q, padding="post")
a_in_ids = pad_sequences([[v_a["<sos>"], *encode(s["a_toks"][:-1], v_a)]
for s in samples], maxlen=MAX_A, padding="post")
a_out_ids = pad_sequences([encode(s["a_toks"], v_a) for s in samples],
maxlen=MAX_A, padding="post")
y_type_ids = np.array([v_typ[s["q_type"]] for s in samples])
# ----------------------- 4. Hyperparams ----------------------------------
d_tok = 32 # token embedding dim
d_tag = 16 # NER / SRL embedding dim
units = 64 # per direction of BiLSTM
lat_dim = units * 2
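# lat_dim = units * 2 because the BiLSTM's forward and backward states are
# concatenated below; the unidirectional decoder LSTMs must match that width
# to accept [enc_h, enc_c] as their initial state.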
# ----------------------- 5. Build model -----------------------------------
# Encoder ----------------------------------------------------------
tok_in = Input((MAX_SENT,), dtype="int32", name="tok_in")
ner_in = Input((MAX_SENT,), dtype="int32", name="ner_in")
srl_in = Input((MAX_SENT,), dtype="int32", name="srl_in")
emb_tok = Embedding(len(v_tok), d_tok, mask_zero=True, name="emb_tok")(tok_in)
emb_ner = Embedding(len(v_ner), d_tag, mask_zero=True, name="emb_ner")(ner_in)
emb_srl = Embedding(len(v_srl), d_tag, mask_zero=True, name="emb_srl")(srl_in)
enc_concat = Concatenate(name="enc_concat")([emb_tok, emb_ner, emb_srl])
bi_lstm = Bidirectional(LSTM(units, return_sequences=True, return_state=True),
name="encoder_bi_lstm")
enc_seq, f_h, f_c, b_h, b_c = bi_lstm(enc_concat)
enc_h = Concatenate()([f_h, b_h])  # (B, lat_dim)
enc_c = Concatenate()([f_c, b_c])
# Decoder: QUESTION ----------------------------------------------
q_in = Input((MAX_Q,), dtype="int32", name="q_in")
# mask_zero=False so the Attention layer does not clash with the encoder mask
q_emb = Embedding(len(v_q), d_tok, mask_zero=False, name="q_emb")(q_in)
dec_q_lstm = LSTM(lat_dim, return_sequences=True, return_state=True,
name="decoder_q_lstm")
q_seq, q_h, q_c = dec_q_lstm(q_emb, initial_state=[enc_h, enc_c])
enc_proj_q = TimeDistributed(Dense(lat_dim), name="enc_proj_q")(enc_seq)
attn_q = Attention(name="attn_q")([q_seq, enc_proj_q])
q_concat = Concatenate(name="q_concat")([q_seq, attn_q])
q_out = TimeDistributed(Dense(len(v_q), activation="softmax"), name="q_out")(q_concat)
# Decoder: ANSWER -------------------------------------------------
a_in = Input((MAX_A,), dtype="int32", name="a_in")
# also mask_zero=False
a_emb = Embedding(len(v_a), d_tok, mask_zero=False, name="a_emb")(a_in)
dec_a_lstm = LSTM(lat_dim, return_sequences=True, return_state=True,
name="decoder_a_lstm")
a_seq, _, _ = dec_a_lstm(a_emb, initial_state=[q_h, q_c])
enc_proj_a = TimeDistributed(Dense(lat_dim), name="enc_proj_a")(enc_seq)
attn_a = Attention(name="attn_a")([a_seq, enc_proj_a])
a_concat = Concatenate(name="a_concat")([a_seq, attn_a])
a_out = TimeDistributed(Dense(len(v_a), activation="softmax"), name="a_out")(a_concat)
# Classifier -------------------------------------------------------
type_dense = Dense(len(v_typ), activation="softmax", name="type_out")(enc_h)
model = Model(inputs=[tok_in, ner_in, srl_in, q_in, a_in],
outputs=[q_out, a_out, type_dense])
model.summary()
# ----------------------- 6. Compile & train ------------------------------
losses = {
"q_out": "sparse_categorical_crossentropy",
"a_out": "sparse_categorical_crossentropy",
"type_out": "sparse_categorical_crossentropy",
}
loss_weights = {"q_out": 1.0, "a_out": 1.0, "type_out": 0.3}
model.compile(optimizer="adam", loss=losses, loss_weights=loss_weights,
metrics={"q_out": "sparse_categorical_accuracy",
"a_out": "sparse_categorical_accuracy",
"type_out": "accuracy"})
history = model.fit(
[X_tok_ids, X_ner_ids, X_srl_ids, q_in_ids, a_in_ids],
[q_out_ids, a_out_ids, y_type_ids],
validation_split=0.1,
epochs=30,
batch_size=64,
callbacks=[tf.keras.callbacks.EarlyStopping(patience=4, restore_best_weights=True)],
verbose=1,
)
model.save("seq2seq_attn.keras")
print("Model saved to seq2seq_attn.keras")
# ----------------------- 7. Inference submodels --------------------------
# Encoder model
encoder_model = Model([tok_in, ner_in, srl_in], [enc_seq, enc_h, enc_c])
# Question decoder step model ------------------------------------------------
# Inputs
q_token_in = Input((1,), dtype="int32", name="q_token_in")
enc_seq_in = Input((MAX_SENT, lat_dim), name="enc_seq_in")
enc_proj_q_in = Input((MAX_SENT, lat_dim), name="enc_proj_q_in")
state_h_in = Input((lat_dim,), name="state_h_in")
state_c_in = Input((lat_dim,), name="state_c_in")
# Embedding
q_emb_step = model.get_layer("q_emb")(q_token_in)
# LSTM (reuse weights)
q_lstm_step, h_out, c_out = model.get_layer("decoder_q_lstm")(q_emb_step,
initial_state=[state_h_in, state_c_in])
# Attention
attn_step = model.get_layer("attn_q")([q_lstm_step, enc_proj_q_in])
q_concat_step = Concatenate()([q_lstm_step, attn_step])
q_logits_step = model.get_layer("q_out")(q_concat_step)
decoder_q_step = Model([q_token_in, enc_seq_in, enc_proj_q_in, state_h_in, state_c_in],
[q_logits_step, h_out, c_out])
# Answer decoder step model --------------------------------------------------
a_token_in = Input((1,), dtype="int32", name="a_token_in")
enc_proj_a_in = Input((MAX_SENT, lat_dim), name="enc_proj_a_in")
state_h_a_in = Input((lat_dim,), name="state_h_a_in")
state_c_a_in = Input((lat_dim,), name="state_c_a_in")
# Embedding reuse
a_emb_step = model.get_layer("a_emb")(a_token_in)
# LSTM reuse
a_lstm_step, h_a_out, c_a_out = model.get_layer("decoder_a_lstm")(a_emb_step,
initial_state=[state_h_a_in, state_c_a_in])
# Attention reuse
attn_a_step = model.get_layer("attn_a")([a_lstm_step, enc_proj_a_in])
a_concat_step = Concatenate()([a_lstm_step, attn_a_step])
a_logits_step = model.get_layer("a_out")(a_concat_step)
decoder_a_step = Model([a_token_in, enc_proj_a_in, state_h_a_in, state_c_a_in],
[a_logits_step, h_a_out, c_a_out])
# ----------------------- 8. Decoding helpers ------------------------------
def encode_and_pad(seq, vmap, max_len):
ids = encode(seq, vmap)
return ids + [vmap["<pad>"]] * (max_len - len(ids))
def greedy_decode(tokens, ner, srl, max_q=20, max_a=10):
"""Return generated (question_tokens, answer_tokens, q_type_str)"""
# --- encoder ---------------------------------------------------------
enc_tok = np.array([encode_and_pad(tokens, v_tok, MAX_SENT)])
enc_ner = np.array([encode_and_pad(ner, v_ner, MAX_SENT)])
enc_srl = np.array([encode_and_pad(srl, v_srl, MAX_SENT)])
enc_seq_val, h, c = encoder_model.predict([enc_tok, enc_ner, enc_srl], verbose=0)
enc_proj_q_val = model.get_layer("enc_proj_q")(enc_seq_val)
enc_proj_a_val = model.get_layer("enc_proj_a")(enc_seq_val)
# --- greedy Question --------------------------------------------------
q_ids = []
tgt = np.array([[v_q["<sos>"]]])
for _ in range(max_q):
logits, h, c = decoder_q_step.predict([tgt, enc_seq_val, enc_proj_q_val, h, c], verbose=0)
next_id = int(logits[0, 0].argmax())
if next_id == v_q["<eos>"]:
break
q_ids.append(next_id)
tgt = np.array([[next_id]])
# --- Answer decoding ---------------------------------------------------
# Seed the answer decoder with the question decoder's final state (h, c)
a_ids = []
tgt_a = np.array([[v_a["<sos>"]]])
for _ in range(max_a):
logits_a, h, c = decoder_a_step.predict([tgt_a, enc_proj_a_val, h, c], verbose=0)
next_a = int(logits_a[0, 0].argmax())
if next_a == v_a["<eos>"]:
break
a_ids.append(next_a)
tgt_a = np.array([[next_a]])
# Question type: type_out depends only on the encoder state, so zero-filled
# decoder inputs are passed just to satisfy the model's input signature
typ_logits = model.predict([enc_tok, enc_ner, enc_srl, np.zeros((1, MAX_Q)), np.zeros((1, MAX_A))], verbose=0)[2]
typ_id = int(typ_logits.argmax())
q_type = [k for k, v in v_typ.items() if v == typ_id][0]
question = [iv_q.get(i, "<unk>") for i in q_ids]
answer = [iv_a.get(i, "<unk>") for i in a_ids]
return question, answer, q_type
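# A minimal usage sketch (hypothetical, token-aligned sample):
# q, a, t = greedy_decode(
#     ["soekarno", "lahir", "di", "surabaya"],
#     ["PER", "O", "O", "LOC"],
#     ["ARG0", "V", "O", "ARGM-LOC"],
# )
# print(" ".join(q), "|", " ".join(a), "|", t)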
def beam_decode(tokens, ner, srl, beam_width=5, max_q=20, max_a=10):
"""Beam search decoding. Returns best (question_tokens, answer_tokens, q_type)"""
enc_tok = np.array([encode_and_pad(tokens, v_tok, MAX_SENT)])
enc_ner = np.array([encode_and_pad(ner, v_ner, MAX_SENT)])
enc_srl = np.array([encode_and_pad(srl, v_srl, MAX_SENT)])
enc_seq_val, h0, c0 = encoder_model.predict([enc_tok, enc_ner, enc_srl], verbose=0)
enc_proj_q_val = model.get_layer("enc_proj_q")(enc_seq_val)
enc_proj_a_val = model.get_layer("enc_proj_a")(enc_seq_val)
# ----- Beam for Question ----------------------------------------------
Beam = [([v_q["<sos>"]], 0.0, h0, c0)]  # (sequence, logP, h, c)
completed_q = []
for _ in range(max_q):
new_beam = []
for seq, logp, h, c in Beam:
tgt = np.array([[seq[-1]]])
logits, next_h, next_c = decoder_q_step.predict([tgt, enc_seq_val, enc_proj_q_val, h, c], verbose=0)
log_probs = np.log(logits[0, 0] + 1e-8)
top_ids = np.argsort(log_probs)[-beam_width:]
for nid in top_ids:
new_seq = seq + [int(nid)]
new_logp = logp + log_probs[nid]
new_beam.append( (new_seq, new_logp, next_h, next_c) )
# keep best beam_width
Beam = sorted(new_beam, key=lambda x: x[1], reverse=True)[:beam_width]
# split the current beam into finished and still-growing hypotheses
Beam, done = [], Beam
for seq, logp, h, c in done:
if seq[-1] == v_q["<eos>"] or len(seq) >= max_q:
completed_q.append( (seq, logp, h, c) )
else:
Beam.append( (seq, logp, h, c) )
if not Beam:
break
if completed_q:
best_q = max(completed_q, key=lambda x: x[1])
else:
best_q = max(Beam, key=lambda x: x[1])
q_seq_ids, _, h_q, c_q = best_q
q_ids = [i for i in q_seq_ids[1:] if i != v_q["<eos>"]]
# ----- Beam for Answer --------------------------------------------------
Beam = [([v_a["<sos>"]], 0.0, h_q, c_q)]
completed_a = []
for _ in range(max_a):
new_beam = []
for seq, logp, h, c in Beam:
tgt = np.array([[seq[-1]]])
logits, next_h, next_c = decoder_a_step.predict([tgt, enc_proj_a_val, h, c], verbose=0)
log_probs = np.log(logits[0, 0] + 1e-8)
top_ids = np.argsort(log_probs)[-beam_width:]
for nid in top_ids:
new_seq = seq + [int(nid)]
new_logp = logp + log_probs[nid]
new_beam.append( (new_seq, new_logp, next_h, next_c) )
Beam = sorted(new_beam, key=lambda x: x[1], reverse=True)[:beam_width]
Beam, done = [], Beam
for seq, logp, h, c in done:
if seq[-1] == v_a["<eos>"] or len(seq) >= max_a:
completed_a.append( (seq, logp) )
else:
Beam.append( (seq, logp, h, c) )
if not Beam:
break
if completed_a:
best_a_seq, _ = max(completed_a, key=lambda x: x[1])
else:
best_a_seq, _ = max(Beam, key=lambda x: x[1])
a_ids = [i for i in best_a_seq[1:] if i != v_a["<eos>"]]
# Question type classification
typ_logits = model.predict([enc_tok, enc_ner, enc_srl, np.zeros((1, MAX_Q)), np.zeros((1, MAX_A))], verbose=0)[2]
typ_id = int(typ_logits.argmax())
q_type = [k for k, v in v_typ.items() if v == typ_id][0]
question = [iv_q.get(i, "<unk>") for i in q_ids]
answer = [iv_a.get(i, "<unk>") for i in a_ids]
return question, answer, q_type
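# Design note: hypotheses are ranked by summed log-probability, so longer sequences
# accumulate more negative terms; without length normalisation, beam search here
# tends to prefer shorter outputs than greedy decoding.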
# ----------------------- 9. BLEU evaluation -------------------------------
def evaluate_bleu(split_ratio=0.1, beam=False):
"""Compute corpus BLEU4 on holdout split."""
n_total = len(samples)
n_val = int(n_total * split_ratio)
idxs = np.random.choice(n_total, n_val, replace=False)
refs_q, hyps_q = [], []
refs_a, hyps_a = [], []
for i in idxs:
s = samples[i]
question_pred, answer_pred, _ = (beam_decode if beam else greedy_decode)(
s["tokens"], s["ner"], s["srl"],
)
refs_q.append([s["q_toks"][:-1]]) # exclude <eos>
hyps_q.append(question_pred)
refs_a.append([s["a_toks"][:-1]])
hyps_a.append(answer_pred)
bleu_q = corpus_bleu(refs_q, hyps_q)
bleu_a = corpus_bleu(refs_a, hyps_a)
print(f"BLEU4 Question: {bleu_q:.3f}\nBLEU4 Answer : {bleu_a:.3f}")
# Example usage:
evaluate_bleu(beam=False)
evaluate_bleu(beam=True)