Compare commits: a0f68a3c1b...668e659147 (10 commits)

Commits (SHA1): 668e659147, 74b7dd177b, fc640c9017, fd119e4a6a, 6851390ea2,
f0f6f412bb, ad4b6d6137, 1fd5a3dc95, 011b22e262, 3cf689159c
@@ -3,5 +3,8 @@ myenv
-*keras*
+**/*keras*

-*h5*
+**/*h5*
+# Ignore all files with the .pkl extension
+*.pkl
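(Note on the pattern change: a .gitignore pattern without a slash, such as *keras*, already matches at any directory depth, so the **/ prefix is essentially equivalent here and mainly makes the recursive intent explicit.)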
(Image changed: 66 KiB before, 62 KiB after.)
@@ -8,20 +8,97 @@ from pathlib import Path

 # List of valid NER labels (adjustable)
 VALID_NER_LABELS = {
     "O",
-    "B-LOC", "I-LOC",
-    "B-PER", "I-PER",
-    "B-ORG", "I-ORG",
-    "B-DATE", "I-DATE",
-    "B-TIME", "I-TIME",
-    "B-EVENT", "I-EVENT"
+    "LOC",
+    "LOC",
+    "PER",
+    "PER",
+    "ORG",
+    "ORG",
+    "DATE",
+    "DATE",
+    "TIME",
+    "TIME",
+    "EVENT",
+    "EVENT",
+    "MISC",
 }
+
+# List of valid NER labels (adjustable)
+VALID_NER_LABELS = {"O", "LOC", "PER", "ORG", "DATE", "TIME", "EVENT", "MISC"}
+
+# List of valid SRL labels
+VALID_SRL_LABELS = {
+    "ARG0",
+    "ARG1",
+    "ARG2",
+    "ARG3",
+    "ARGM-TMP",
+    "ARGM-LOC",
+    "ARGM-CAU",
+    "ARGM-MNR",
+    "ARGM-MOD",
+    "ARGM-NEG",
+    "V",
+    "O",
+}
+
+# def json_to_tsv(json_path: str | Path, tsv_path: str | Path) -> None:
+#     with open(json_path, encoding="utf-8") as f:
+#         records = json.load(f)
+
+#     seen_sentences: set[tuple[str, ...]] = set()
+
+#     with open(tsv_path, "w", encoding="utf-8", newline="") as f_out:
+#         writer = csv.writer(f_out, delimiter="\t", lineterminator="\n")
+
+#         for idx, rec in enumerate(records):
+#             contexxt = rec.get("context")
+#             tokens = rec.get("tokens")
+#             ner_tags = rec.get("ner")
+#             srl_tags = rec.get("srl")
+
+#             if not (len(tokens) == len(ner_tags) == len(srl_tags)):
+#                 raise ValueError(
+#                     f"❌ Length mismatch at record index {idx}:\n"
+#                     f"   context ({len(contexxt)}): {contexxt}\n"
+#                     f"   tokens ({len(tokens)}): {tokens}\n"
+#                     f"   ner ({len(ner_tags)}): {ner_tags}\n"
+#                     f"   srl ({len(srl_tags)}): {srl_tags}\n"
+#                 )
+
+#             # Validate NER labels
+#             for i, ner_label in enumerate(ner_tags):
+#                 if ner_label not in VALID_NER_LABELS:
+#                     raise ValueError(
+#                         f"❌ Invalid NER label at record index {idx}, token #{i} ('{tokens[i]}'):\n"
+#                         f"   ner_label: {ner_label}\n"
+#                         f"   value: {tokens}"
+#                     )
+
+#             # Validate SRL labels
+#             for i, srl_label in enumerate(srl_tags):
+#                 if srl_label not in VALID_SRL_LABELS:
+#                     raise ValueError(
+#                         f"❌ Invalid SRL label at record index {idx}, token #{i} ('{tokens[i]}'):\n"
+#                         f"   srl_label: {srl_label}\n"
+#                         f"   value: {tokens}"
+#                     )
+
+#             key = tuple(tokens)
+#             if key in seen_sentences:
+#                 continue
+#             seen_sentences.add(key)
+
+#             for tok, ner, srl in zip(tokens, ner_tags, srl_tags):
+#                 writer.writerow([tok, ner, srl])
+#             writer.writerow([])
+
+#     print(f"✔️ TSV done, saved to: {tsv_path}")
+
+
+def json_to_tsv(json_path: str | Path, tsv_path: str | Path) -> None:
+    """
+    Convert JSON records (fields: tokens, ner, srl, …) to token\tNER\tSRL TSV.
+    Duplicate sentences (exactly the same token sequence) are skipped
+    automatically. Records whose tokens, ner, and srl lengths differ, or that
+    carry an invalid NER label, are reported with full error details.
+    """
+    with open(json_path, encoding="utf-8") as f:
+        records = json.load(f)

@@ -31,26 +108,46 @@ def json_to_tsv(json_path: str | Path, tsv_path: str | Path) -> None:
         writer = csv.writer(f_out, delimiter="\t", lineterminator="\n")

         for idx, rec in enumerate(records):
             context = rec.get("context")
             tokens = rec.get("tokens")
             ner_tags = rec.get("ner")
             srl_tags = rec.get("srl")

             if not (len(tokens) == len(ner_tags) == len(srl_tags)):
-                raise ValueError(
+                print(
                     f"❌ Length mismatch at record index {idx}:\n"
                     f"   context: {context}\n"
                     f"   tokens ({len(tokens)}): {tokens}\n"
                     f"   ner ({len(ner_tags)}): {ner_tags}\n"
                     f"   srl ({len(srl_tags)}): {srl_tags}\n"
                 )
+                continue

             # Validate NER labels
+            invalid_ner = False
             for i, ner_label in enumerate(ner_tags):
                 if ner_label not in VALID_NER_LABELS:
-                    raise ValueError(
+                    print(
                         f"❌ Invalid NER label at record index {idx}, token #{i} ('{tokens[i]}'):\n"
                         f"   ner_label: {ner_label}\n"
                         f"   value: {tokens}"
                     )
+                    invalid_ner = True
+                    break
+            if invalid_ner:
+                continue

+            invalid_srl = False
+            for i, srl_label in enumerate(srl_tags):
+                if srl_label not in VALID_SRL_LABELS:
+                    print(
+                        f"❌ Invalid SRL label at record index {idx}, token #{i} ('{tokens[i]}'):\n"
+                        f"   srl_label: {srl_label}\n"
+                        f"   value: {tokens}"
+                    )
+                    invalid_srl = True
+                    break
+            if invalid_srl:
+                continue

             key = tuple(tokens)
             if key in seen_sentences:

@@ -118,4 +215,4 @@ def json_to_tsv(json_path: str | Path, tsv_path: str | Path) -> None:
 # EXAMPLE USAGE
 # ---------------------------------------------------------------------------
 if __name__ == "__main__":
-    json_to_tsv("QC/normalize_dataset.json", "QC/new_LNS.tsv")
+    json_to_tsv("../dataset/stable_qg_qa_train_dataset.json", "new_LNS_2.tsv")
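The change above swaps the fail-fast raise ValueError calls for print-and-continue, so a single bad record is reported and skipped instead of aborting the whole export. As a companion, here is a minimal sketch (not part of the diff) of reading the emitted token\tNER\tSRL file back, assuming blank rows separate sentences as the writer above produces:

    import csv

    def read_tsv(tsv_path):
        """Read a token\tNER\tSRL TSV back into per-sentence triple lists."""
        sentences, current = [], []
        with open(tsv_path, encoding="utf-8") as f:
            for row in csv.reader(f, delimiter="\t"):
                if not row:  # blank row = sentence boundary
                    if current:
                        sentences.append(current)
                        current = []
                    continue
                tok, ner, srl = row
                current.append((tok, ner, srl))
        if current:
            sentences.append(current)
        return sentences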
(Image changed: 66 KiB before, 55 KiB after.)
@@ -50,6 +50,6 @@ def predict_sentence(sentence: str) -> dict:
 # 3. Demo
 # -----------------------------
 if __name__ == "__main__":
-    sample = "batu bata terbuat dari material tanah liat"
+    sample = "ngaben adalan acara pembakaran jenazah masyarakat suku bali"
     result = predict_sentence(sample)
     print(json.dumps(result, ensure_ascii=False, indent=2))
Deleted notebook (@@ -1,554 +0,0 @@). Its code cells, with outputs summarized:

Cell 1: imports.

    import json
    import numpy as np
    from pathlib import Path
    from sklearn.model_selection import train_test_split
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    from tensorflow.keras.utils import to_categorical

    from tensorflow.keras.models import Model
    from tensorflow.keras.layers import (
        Input,
        Embedding,
        LSTM,
        Concatenate,
        Dense,
        TimeDistributed,
    )
    from tensorflow.keras.callbacks import EarlyStopping
    from sklearn.metrics import classification_report
    from collections import Counter

(stderr: the usual TensorFlow oneDNN/CUDA startup notices; no GPU drivers found, CPU only.)

Cell 2: load and validate the raw data.

    # Load raw data
    with open("normalize_dataset.json", encoding="utf-8") as f:
        raw_data = json.load(f)

    # Full validation
    required_keys = {"tokens", "ner", "srl", "question", "answer", "type"}
    valid_data = []
    invalid_data = []

    for idx, item in enumerate(raw_data):
        error_messages = []

        if not isinstance(item, dict):
            error_messages.append("not a dictionary")

        missing_keys = required_keys - item.keys()
        if missing_keys:
            error_messages.append(f"missing keys: {missing_keys}")

        if not error_messages:
            # Check field types and None values
            if (not isinstance(item["tokens"], list) or
                not isinstance(item["ner"], list) or
                not isinstance(item["srl"], list) or
                not isinstance(item["question"], list) or
                not isinstance(item["answer"], list) or
                not isinstance(item["type"], str)):
                error_messages.append("field type mismatch")

        if error_messages:
            print(f"\n Index {idx} | Problems: {', '.join(error_messages)}")
            print(json.dumps(item, indent=2, ensure_ascii=False))
            invalid_data.append(item)
            continue

        valid_data.append(item)

    # Statistics
    print(f"\n Valid records: {len(valid_data)} / {len(raw_data)}")
    print(f" Invalid records: {len(invalid_data)}")

    # Process the valid records
    tokens = [[t.lower().strip() for t in item["tokens"]] for item in valid_data]
    ner_tags = [item["ner"] for item in valid_data]
    srl_tags = [item["srl"] for item in valid_data]
    questions = [[token.lower().strip() for token in item["question"]] for item in valid_data]
    answers = [[token.lower().strip() for token in item["answer"]] for item in valid_data]
    types = [item["type"] for item in valid_data]

    type_counts = Counter(types)

    print(type_counts)

Output: 321 / 321 valid, 0 invalid; Counter({'ftb': 235, 'tof': 45, 'none': 41}).

Cell 3: fit tokenizers.

    # tokenize
    token_tok = Tokenizer(lower=False, oov_token="UNK")
    token_ner = Tokenizer(lower=False)
    token_srl = Tokenizer(lower=False)
    token_q = Tokenizer(lower=False)
    token_a = Tokenizer(lower=False)
    token_type = Tokenizer(lower=False)

    token_tok.fit_on_texts(tokens)
    token_ner.fit_on_texts(ner_tags)
    token_srl.fit_on_texts(srl_tags)
    token_q.fit_on_texts(questions)
    token_a.fit_on_texts(answers)
    token_type.fit_on_texts(types)

    maxlen = 20

Cell 4: pad sequences and build targets.

    X_tok = pad_sequences(
        token_tok.texts_to_sequences(tokens), padding="post", maxlen=maxlen
    )
    X_ner = pad_sequences(
        token_ner.texts_to_sequences(ner_tags), padding="post", maxlen=maxlen
    )
    X_srl = pad_sequences(
        token_srl.texts_to_sequences(srl_tags), padding="post", maxlen=maxlen
    )
    y_q = pad_sequences(token_q.texts_to_sequences(questions), padding="post", maxlen=maxlen)
    y_a = pad_sequences(token_a.texts_to_sequences(answers), padding="post", maxlen=maxlen)

    print(set(types))

    y_type = [seq[0] for seq in token_type.texts_to_sequences(types)]  # list of int
    y_type = to_categorical(np.array(y_type) - 1, num_classes=len(token_type.word_index))

Output: {'ftb', 'tof', 'none'}

Cell 5: train/test split.

    X_tok_train, X_tok_test, X_ner_train, X_ner_test, X_srl_train, X_srl_test, \
    y_q_train, y_q_test, y_a_train, y_a_test, y_type_train, y_type_test = train_test_split(
        X_tok, X_ner, X_srl, y_q, y_a, y_type, test_size=0.2, random_state=42
    )

    X_train = [X_tok_train, X_ner_train, X_srl_train]
    X_test = [X_tok_test, X_ner_test, X_srl_test]

Cell 6: model definition, training, and saving.

    inp_tok = Input(shape=(None,), name="tok_input")
    inp_ner = Input(shape=(None,), name="ner_input")
    inp_srl = Input(shape=(None,), name="srl_input")

    emb_tok = Embedding(input_dim=len(token_tok.word_index) + 1, output_dim=128)(inp_tok)
    emb_ner = Embedding(input_dim=len(token_ner.word_index) + 1, output_dim=16)(inp_ner)
    emb_srl = Embedding(input_dim=len(token_srl.word_index) + 1, output_dim=16)(inp_srl)

    # emb_tok = Embedding(input_dim=..., output_dim=..., mask_zero=True)(inp_tok)
    # emb_ner = Embedding(input_dim=..., output_dim=..., mask_zero=True)(inp_ner)
    # emb_srl = Embedding(input_dim=..., output_dim=..., mask_zero=True)(inp_srl)

    merged = Concatenate()([emb_tok, emb_ner, emb_srl])

    x = LSTM(256, return_sequences=True)(merged)

    out_question = TimeDistributed(Dense(len(token_q.word_index) + 1, activation="softmax"), name="question_output")(x)
    out_answer = TimeDistributed(Dense(len(token_a.word_index) + 1, activation="softmax"), name="answer_output")(x)
    out_type = Dense(len(token_type.word_index), activation="softmax", name="type_output")(
        x[:, 0, :]
    )  # use the first timestep

    model = Model(
        inputs=[inp_tok, inp_ner, inp_srl], outputs=[out_question, out_answer, out_type]
    )
    model.compile(
        optimizer="adam",
        loss={
            "question_output": "sparse_categorical_crossentropy",
            "answer_output": "sparse_categorical_crossentropy",
            "type_output": "categorical_crossentropy",
        },
        metrics={
            "question_output": "accuracy",
            "answer_output": "accuracy",
            "type_output": "accuracy",
        },
    )

    model.summary()

    # ----------------------------------------------------------------------------
    # 5. TRAINING
    # ----------------------------------------------------------------------------
    model.fit(
        X_train,
        {
            "question_output": np.expand_dims(y_q_train, -1),
            "answer_output": np.expand_dims(y_a_train, -1),
            "type_output": y_type_train,
        },
        batch_size=32,
        epochs=30,
        validation_split=0.1,
        callbacks=[EarlyStopping(patience=3, restore_best_weights=True)],
    )

    import pickle

    model.save("new_model_lstm_qg.keras")
    with open("tokenizers.pkl", "wb") as f:
        pickle.dump({
            "token": token_tok,
            "ner": token_ner,
            "srl": token_srl,
            "question": token_q,
            "answer": token_a,
            "type": token_type
        }, f)

Output: the model summary shows 774,635 trainable parameters (2.95 MB): token/NER/SRL embeddings of width 128/16/16 concatenated to 160, an LSTM(256), TimeDistributed softmax heads over 473 question and 383 answer classes, and a Dense(3) type head on the first timestep. Training ran with EarlyStopping(patience=3) and stopped after epoch 9 of 30; validation metrics plateaued at 0.9261 (answer), 0.7500 (question), and 0.5652 (type) accuracy, with best val_loss around 3.56 at epoch 6.
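A note on the np.expand_dims(..., -1) calls in the fit() above: with sparse_categorical_crossentropy, per-timestep targets are integer class ids shaped (batch, steps, 1), a shape Keras matches against the (batch, steps, vocab) softmax outputs. A minimal shape-contract sketch (illustrative values, not from the notebook):

    import numpy as np
    import tensorflow as tf

    # Integer targets (1, 3, 1) vs. softmax predictions (1, 3, 10):
    # sparse_categorical_crossentropy yields one loss value per timestep.
    y_true = np.expand_dims(np.array([[3, 1, 0]]), -1)      # (1, 3, 1)
    y_pred = tf.nn.softmax(tf.random.uniform((1, 3, 10)))   # (1, 3, 10)
    loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)
    print(loss.shape)  # (1, 3)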
Cell 7: evaluation.

    def token_level_accuracy(y_true, y_pred):
        correct = 0
        total = 0
        for true_seq, pred_seq in zip(y_true, y_pred):
            for t, p in zip(true_seq, pred_seq):
                if t != 0:  # ignore padding
                    total += 1
                    if t == p:
                        correct += 1
        return correct / total if total > 0 else 0

    # Predict on test set
    y_pred_q, y_pred_a, y_pred_type = model.predict(X_test)

    # Decode predictions to class indices
    y_pred_q = np.argmax(y_pred_q, axis=-1)
    y_pred_a = np.argmax(y_pred_a, axis=-1)
    y_pred_type = np.argmax(y_pred_type, axis=-1)
    y_true_type = np.argmax(y_type_test, axis=-1)

    # Calculate token-level accuracy
    acc_q = token_level_accuracy(y_q_test, y_pred_q)
    acc_a = token_level_accuracy(y_a_test, y_pred_a)

    # Type classification report
    report_type = classification_report(y_true_type, y_pred_type, zero_division=0)

    # Print results
    print("\n=== Detailed Accuracy ===")
    print(f"Question Accuracy (Token-level): {acc_q:.4f}")
    print(f"Answer Accuracy (Token-level) : {acc_a:.4f}")
    print(f"Type Accuracy (Class-level) : {np.mean(y_true_type == y_pred_type):.2f}")

Output: Question 0.0000, Answer 0.0000, Type 0.68.

Cells 8 and 9: commented-out classification reports.

    # flat_true_a, flat_pred_a = flatten_valid(y_a_test, y_pred_a_class)
    # print("\n=== Classification Report: ANSWER ===")
    # print(classification_report(flat_true_a, flat_pred_a))

    # print("\n=== Classification Report: TYPE ===")
    # print(classification_report(y_true_type_class, y_pred_type_class))

Notebook metadata: kernel "myenv", Python 3.10.16.
New file (@@ -0,0 +1,174 @@):

    import json

    def validate_and_sort_data(input_data, valid_output_file='valid_data.json', invalid_output_file='invalid_data.json'):
        """
        Validate and sort records by consistency of token, NER, and SRL lengths.

        Args:
            input_data: List of records to validate
            valid_output_file: Output file for valid records
            invalid_output_file: Output file for invalid records
        """

        valid_data = []
        invalid_data = []

        for i, item in enumerate(input_data):
            # Check that all required fields are present
            required_fields = ['context', 'tokens', 'ner', 'srl']
            missing_fields = [field for field in required_fields if field not in item]

            if missing_fields:
                item['validation_error'] = f"Missing fields: {missing_fields}"
                invalid_data.append(item)
                continue

            # Check length consistency
            tokens_len = len(item['tokens'])
            ner_len = len(item['ner'])
            srl_len = len(item['srl'])

            # Validate lengths
            if tokens_len == ner_len == srl_len:
                # Valid record
                item['validation_status'] = 'valid'
                item['token_count'] = tokens_len
                valid_data.append(item)
            else:
                # Invalid record
                item['validation_status'] = 'invalid'
                item['validation_error'] = {
                    'tokens_length': tokens_len,
                    'ner_length': ner_len,
                    'srl_length': srl_len,
                    'issue': 'Length mismatch between tokens, NER, and SRL'
                }
                invalid_data.append(item)

        # Sort valid records by token count (ascending)
        valid_data.sort(key=lambda x: x['token_count'])

        # Write both sets to JSON files
        with open(valid_output_file, 'w', encoding='utf-8') as f:
            json.dump(valid_data, f, ensure_ascii=False, indent=2)

        with open(invalid_output_file, 'w', encoding='utf-8') as f:
            json.dump(invalid_data, f, ensure_ascii=False, indent=2)

        # Print statistics
        print(f"=== DATA VALIDATION RESULTS ===")
        print(f"Total records: {len(input_data)}")
        print(f"Valid records: {len(valid_data)}")
        print(f"Invalid records: {len(invalid_data)}")
        print(f"\nOutput files:")
        print(f"- Valid records: {valid_output_file}")
        print(f"- Invalid records: {invalid_output_file}")

        if invalid_data:
            print(f"\n=== INVALID RECORD DETAILS ===")
            for i, item in enumerate(invalid_data):
                if 'validation_error' in item:
                    if isinstance(item['validation_error'], dict):
                        error = item['validation_error']
                        print(f"Record {i+1}: {error['issue']}")
                        print(f"  - Tokens: {error['tokens_length']}")
                        print(f"  - NER: {error['ner_length']}")
                        print(f"  - SRL: {error['srl_length']}")
                    else:
                        print(f"Record {i+1}: {item['validation_error']}")
                print()

        return valid_data, invalid_data

    def load_data_from_file(file_path):
        """
        Load data from a JSON file.

        Args:
            file_path: Path to the JSON file

        Returns:
            List of records
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            print(f"File {file_path} not found!")
            return []
        except json.JSONDecodeError:
            print(f"Error parsing JSON from file {file_path}")
            return []

    # Example usage
    if __name__ == "__main__":
        # Sample records
        sample_data = [
            {
                "context": "raden ajeng kartini lahir pada 21 april 1879 di jepara",
                "tokens": [
                    "raden", "ajeng", "kartini", "lahir", "pada", "21",
                    "april", "1879", "di", "jepara"
                ],
                "ner": ["PER", "PER", "PER", "O", "O", "DATE", "DATE", "DATE", "O", "LOC"],
                "srl": [
                    "ARG0", "ARG0", "ARG0", "V", "O", "ARGM-TMP",
                    "ARGM-TMP", "ARGM-TMP", "O", "ARGM-LOC"
                ],
                "qas": [
                    {
                        "type": "isian",
                        "question": "Dimana kartini lahir ___",
                        "answer": "jepara",
                        "id": "qa_0_q1"
                    },
                    {
                        "type": "true_false",
                        "question": "Kartini lahir pada tanggal 21 mei 1879 ___",
                        "options": ["true", "false"],
                        "answer": "false",
                        "id": "qa_0_q2"
                    }
                ]
            },
            {
                "context": "kerajaan majapahit berdiri pada tahun 1293 di trowulan",
                "tokens": [
                    "kerajaan", "majapahit", "berdiri", "pada",
                    "tahun", "1293", "di", "trowulan"
                ],
                "ner": ["O", "ORG", "O", "O", "O", "DATE", "O", "LOC"],
                "srl": ["ARG1", "ARG1", "V", "O", "O", "ARGM-TMP", "O", "ARGM-LOC"],
                "qas": [
                    {
                        "type": "opsi",
                        "question": "Dimana kerajaan majapahit berdiri ___",
                        "options": ["trowulan", "singasari", "kuta", "banten"],
                        "answer": "trowulan",
                        "id": "qa_1_q1"
                    },
                    {
                        "type": "true_false",
                        "question": "Kerajaan majapahit berdiri pada tahun 1300 ___",
                        "options": ["true", "false"],
                        "answer": "false",
                        "id": "qa_1_q2"
                    }
                ]
            },
            # Example of an invalid record (mismatched lengths)
            {
                "context": "contoh data tidak valid",
                "tokens": ["contoh", "data", "tidak"],
                "ner": ["O", "O"],  # shorter than tokens
                "srl": ["ARG0", "ARG1", "V", "O"],  # longer than tokens
                "qas": []
            }
        ]

        # Run validation on the sample data
        # valid, invalid = validate_and_sort_data(sample_data)

        # Or load from a file:
        data = load_data_from_file('need_clean_dataset.json')
        valid, invalid = validate_and_sort_data(data)
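For reference, the commented-out validate_and_sort_data(sample_data) call on the three sample records above would deterministically report two valid records and one invalid one. A sketch of the expected console output (with the English strings used in this listing):

    === DATA VALIDATION RESULTS ===
    Total records: 3
    Valid records: 2
    Invalid records: 1

    Output files:
    - Valid records: valid_data.json
    - Invalid records: invalid_data.json

    === INVALID RECORD DETAILS ===
    Record 1: Length mismatch between tokens, NER, and SRL
      - Tokens: 3
      - NER: 2
      - SRL: 4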
New file (@@ -0,0 +1,3 @@):

    [
        "B-PER"
    ]
New notebook (@@ -0,0 +1,329 @@). Its code cells:

Cell 1: imports and configuration.

    # -------------------------------------------------
    # 0. Imports & configuration
    # -------------------------------------------------
    import json, pickle
    import numpy as np
    from pathlib import Path
    from collections import Counter
    from sklearn.model_selection import train_test_split

    import tensorflow as tf
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    from tensorflow.keras.utils import to_categorical
    from tensorflow.keras.layers import (
        Input, Embedding, LSTM, Bidirectional, Dense, Concatenate,
        TimeDistributed
    )
    from tensorflow.keras.models import Model
    from tensorflow.keras.callbacks import EarlyStopping

    PAD_TOKEN = "<PAD>"
    UNK_TOKEN = "UNK"
    START_TOKEN = "<START>"
    END_TOKEN = "<END>"
    MAXLEN_SRC = 100  # Maximum paragraph length
    MAXLEN_TGT = 40   # Maximum question/answer length
    BATCH = 32
    EPOCHS = 30

(stderr: the usual TensorFlow oneDNN/CUDA startup notices; no GPU drivers found, CPU only.)

Cell 2: load and validate the dataset.

    raw = json.loads(Path("normalize_dataset.json").read_text(encoding="utf-8"))

    req = {"tokens", "ner", "srl", "question", "answer", "type"}
    valid, bad = [], []
    for i, item in enumerate(raw):
        if (isinstance(item, dict) and not (req - item.keys())
            and all(isinstance(item[k], list) for k in req - {"type"})
            and isinstance(item["type"], str)):
            valid.append(item)
        else:
            bad.append(i)

    print(f"Valid {len(valid)} / {len(raw)} (invalid index: {bad[:10]})")

Output: Valid 325 / 325 (invalid index: [])

Cell 3: add START/END markers, building shifted decoder input/output pairs.

    for ex in valid:
        ex["question_in"] = [START_TOKEN] + ex["question"]
        ex["question_out"] = ex["question"] + [END_TOKEN]

        ex["answer_in"] = [START_TOKEN] + ex["answer"]
        ex["answer_out"] = ex["answer"] + [END_TOKEN]
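The _in/_out pairs above set up teacher forcing: the decoder consumes the target shifted right and predicts it shifted left. A tiny illustration with a made-up example:

    # Teacher-forcing sketch (hypothetical tokens, not from the dataset):
    question     = ["dimana", "kartini", "lahir"]
    question_in  = ["<START>"] + question    # what the decoder reads at step t
    question_out = question + ["<END>"]      # what it must predict at step t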
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "faa30b82",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tok_token = Tokenizer(oov_token=UNK_TOKEN, filters=\"\")\n",
|
||||
"tok_ner = Tokenizer(lower=False, filters=\"\")\n",
|
||||
"tok_srl = Tokenizer(lower=False, filters=\"\")\n",
|
||||
"tok_q = Tokenizer(oov_token=UNK_TOKEN, filters=\"\")\n",
|
||||
"tok_a = Tokenizer(oov_token=UNK_TOKEN, filters=\"\")\n",
|
||||
"tok_type = Tokenizer(lower=False, filters=\"\")\n",
|
||||
"\n",
|
||||
"tok_token.fit_on_texts([ex[\"tokens\"] for ex in valid])\n",
|
||||
"tok_ner.fit_on_texts([ex[\"ner\"] for ex in valid])\n",
|
||||
"tok_srl.fit_on_texts([ex[\"srl\"] for ex in valid])\n",
|
||||
"tok_q.fit_on_texts([ex[\"question_in\"]+ex[\"question_out\"] for ex in valid])\n",
|
||||
"tok_a.fit_on_texts([ex[\"answer_in\"]+ex[\"answer_out\"] for ex in valid])\n",
|
||||
"tok_type.fit_on_texts([ex[\"type\"] for ex in valid])\n",
|
||||
"\n",
|
||||
"# +1 utk padding\n",
|
||||
"vocab_token = len(tok_token.word_index)+1\n",
|
||||
"vocab_ner = len(tok_ner.word_index)+1\n",
|
||||
"vocab_srl = len(tok_srl.word_index)+1\n",
|
||||
"vocab_q = len(tok_q.word_index)+1\n",
|
||||
"vocab_a = len(tok_a.word_index)+1\n",
|
||||
"vocab_type = len(tok_type.word_index)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "c83ce734",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def seqs(field, tok, maxlen):\n",
|
||||
" return pad_sequences(\n",
|
||||
" tok.texts_to_sequences([ex[field] for ex in valid]),\n",
|
||||
" maxlen=maxlen, padding=\"post\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"X_tok = seqs(\"tokens\", tok_token, MAXLEN_SRC)\n",
|
||||
"X_ner = seqs(\"ner\", tok_ner, MAXLEN_SRC)\n",
|
||||
"X_srl = seqs(\"srl\", tok_srl, MAXLEN_SRC)\n",
|
||||
"\n",
|
||||
"Q_in = seqs(\"question_in\", tok_q, MAXLEN_TGT)\n",
|
||||
"Q_out = seqs(\"question_out\", tok_q, MAXLEN_TGT)\n",
|
||||
"A_in = seqs(\"answer_in\", tok_a, MAXLEN_TGT)\n",
|
||||
"A_out = seqs(\"answer_out\", tok_a, MAXLEN_TGT)\n",
|
||||
"\n",
|
||||
"y_type = to_categorical(\n",
|
||||
" np.array([seq[0]-1 for seq in tok_type.texts_to_sequences([ex[\"type\"] for ex in valid])]),\n",
|
||||
" num_classes=vocab_type\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Expand dims → (batch, seq, 1) agar cocok dgn sparse_cce\n",
|
||||
"Q_out = np.expand_dims(Q_out, -1)\n",
|
||||
"A_out = np.expand_dims(A_out, -1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "ad3fe7f2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"(X_tok_tr, X_tok_te,\n",
|
||||
" X_ner_tr, X_ner_te,\n",
|
||||
" X_srl_tr, X_srl_te,\n",
|
||||
" Q_in_tr, Q_in_te,\n",
|
||||
" Q_out_tr, Q_out_te,\n",
|
||||
" A_in_tr, A_in_te,\n",
|
||||
" A_out_tr, A_out_te,\n",
|
||||
" y_type_tr,y_type_te) = train_test_split(\n",
|
||||
" X_tok, X_ner, X_srl, Q_in, Q_out, A_in, A_out, y_type,\n",
|
||||
" test_size=0.2, random_state=42\n",
|
||||
" )\n",
|
||||
" "
|
||||
]
|
||||
},
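Unpacking sixteen arrays from one train_test_split call is easy to get out of order. An alternative sketch that splits indices once and slices every array the same way; with the same random_state it should yield the same partition:

import numpy as np
from sklearn.model_selection import train_test_split

idx_tr, idx_te = train_test_split(np.arange(len(X_tok)), test_size=0.2, random_state=42)

def split(a):
    return a[idx_tr], a[idx_te]

X_tok_tr, X_tok_te = split(X_tok)
Q_in_tr, Q_in_te = split(Q_in)  # and likewise for the remaining arrays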
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "f20abfb5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2025-05-10 14:49:43.127764: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "ValueError",
|
||||
"evalue": "too many values to unpack (expected 3)",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[7], line 10\u001b[0m\n\u001b[1;32m 7\u001b[0m emb_srl \u001b[38;5;241m=\u001b[39m Embedding(vocab_srl, \u001b[38;5;241m16\u001b[39m, mask_zero\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)(enc_srl)\n\u001b[1;32m 9\u001b[0m enc_cat \u001b[38;5;241m=\u001b[39m Concatenate()([emb_tok, emb_ner, emb_srl])\n\u001b[0;32m---> 10\u001b[0m enc_out, state_h, state_c \u001b[38;5;241m=\u001b[39m Bidirectional(\n\u001b[1;32m 11\u001b[0m LSTM(\u001b[38;5;241m256\u001b[39m, return_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, return_sequences\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 12\u001b[0m )(enc_cat)\n\u001b[1;32m 14\u001b[0m \u001b[38;5;66;03m# ---------- Klasifikasi tipe ----------\u001b[39;00m\n\u001b[1;32m 15\u001b[0m type_out \u001b[38;5;241m=\u001b[39m Dense(vocab_type, activation\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msoftmax\u001b[39m\u001b[38;5;124m\"\u001b[39m, name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtype_output\u001b[39m\u001b[38;5;124m\"\u001b[39m)(enc_out)\n",
|
||||
"\u001b[0;31mValueError\u001b[0m: too many values to unpack (expected 3)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"enc_tok = Input(shape=(None,), name=\"enc_tok\")\n",
|
||||
"enc_ner = Input(shape=(None,), name=\"enc_ner\")\n",
|
||||
"enc_srl = Input(shape=(None,), name=\"enc_srl\")\n",
|
||||
"\n",
|
||||
"emb_tok = Embedding(vocab_token, 128, mask_zero=True)(enc_tok)\n",
|
||||
"emb_ner = Embedding(vocab_ner, 16, mask_zero=True)(enc_ner)\n",
|
||||
"emb_srl = Embedding(vocab_srl, 16, mask_zero=True)(enc_srl)\n",
|
||||
"\n",
|
||||
"enc_cat = Concatenate()([emb_tok, emb_ner, emb_srl])\n",
|
||||
"enc_out, state_h, state_c = Bidirectional(\n",
|
||||
" LSTM(256, return_state=True, return_sequences=False)\n",
|
||||
")(enc_cat)\n",
|
||||
"\n",
|
||||
"# ---------- Klasifikasi tipe ----------\n",
|
||||
"type_out = Dense(vocab_type, activation=\"softmax\", name=\"type_output\")(enc_out)\n",
|
||||
"\n",
|
||||
"# ---------- Decoder QUESTION ----------\n",
|
||||
"dec_q_in = Input(shape=(None,), name=\"dec_q_in\")\n",
|
||||
"dec_q_emb = Embedding(vocab_q, 128, mask_zero=True)(dec_q_in)\n",
|
||||
"dec_q_lstm = LSTM(256, return_sequences=True)\n",
|
||||
"dec_q_out = dec_q_lstm(dec_q_emb, initial_state=[state_h, state_c])\n",
|
||||
"q_out = TimeDistributed(Dense(vocab_q, activation=\"softmax\"), name=\"question_output\")(dec_q_out)\n",
|
||||
"\n",
|
||||
"# ---------- Decoder ANSWER ----------\n",
|
||||
"dec_a_in = Input(shape=(None,), name=\"dec_a_in\")\n",
|
||||
"dec_a_emb = Embedding(vocab_a, 128, mask_zero=True)(dec_a_in)\n",
|
||||
"dec_a_lstm = LSTM(256, return_sequences=True)\n",
|
||||
"dec_a_out = dec_a_lstm(dec_a_emb, initial_state=[state_h, state_c])\n",
|
||||
"a_out = TimeDistributed(Dense(vocab_a, activation=\"softmax\"), name=\"answer_output\")(dec_a_out)\n",
|
||||
"\n",
|
||||
"# ---------- Build & compile ----------\n",
|
||||
"model = Model(\n",
|
||||
" inputs=[enc_tok, enc_ner, enc_srl, dec_q_in, dec_a_in],\n",
|
||||
" outputs=[q_out, a_out, type_out]\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"model.compile(\n",
|
||||
" optimizer=\"adam\",\n",
|
||||
" loss={\n",
|
||||
" \"question_output\": \"sparse_categorical_crossentropy\",\n",
|
||||
" \"answer_output\" : \"sparse_categorical_crossentropy\",\n",
|
||||
" \"type_output\" : \"categorical_crossentropy\"\n",
|
||||
" },\n",
|
||||
" loss_weights={\n",
|
||||
" \"question_output\": 1.0,\n",
|
||||
" \"answer_output\" : 1.0,\n",
|
||||
" \"type_output\" : 0.3\n",
|
||||
" },\n",
|
||||
" metrics={\n",
|
||||
" \"question_output\": \"accuracy\",\n",
|
||||
" \"answer_output\" : \"accuracy\",\n",
|
||||
" \"type_output\" : \"accuracy\"\n",
|
||||
" }\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"model.summary()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c348406e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"early = EarlyStopping(patience=3, restore_best_weights=True)\n",
|
||||
"\n",
|
||||
"model.fit(\n",
|
||||
" [X_tok_tr, X_ner_tr, X_srl_tr, Q_in_tr, A_in_tr],\n",
|
||||
" {\"question_output\": Q_out_tr,\n",
|
||||
" \"answer_output\" : A_out_tr,\n",
|
||||
" \"type_output\" : y_type_tr},\n",
|
||||
" batch_size=BATCH,\n",
|
||||
" epochs=EPOCHS,\n",
|
||||
" validation_split=0.1,\n",
|
||||
" callbacks=[early]\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# -------------------------------------------------\n",
|
||||
"# 8. Simpan model & tokenizer\n",
|
||||
"# -------------------------------------------------\n",
|
||||
"model.save(\"qg_multitask.keras\")\n",
|
||||
"with open(\"tokenizers.pkl\", \"wb\") as f:\n",
|
||||
" pickle.dump({\n",
|
||||
" \"token\": tok_token,\n",
|
||||
" \"ner\" : tok_ner,\n",
|
||||
" \"srl\" : tok_srl,\n",
|
||||
" \"q\" : tok_q,\n",
|
||||
" \"a\" : tok_a,\n",
|
||||
" \"type\" : tok_type\n",
|
||||
" }, f)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "myenv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
|
@ -13716,10 +13716,10 @@
|
|||
"type": "tof"
|
||||
},
|
||||
{
|
||||
"tokens": ["Indonesia", "terletak", "di", "Benua", "Afrika", "."],
|
||||
"tokens": ["Indonesia", "terletak", "di", "Benua", "asia", "."],
|
||||
"ner": ["B-LOC", "O", "O", "O", "B-LOC", "O"],
|
||||
"srl": ["ARG1", "V", "ARGM-LOC", "ARGM-LOC", "ARGM-LOC", "O"],
|
||||
"question": ["Indonesia", "terletak", "di", "Benua", "Afrika", "."],
|
||||
"question": ["Indonesia", "terletak", "di", "Benua", "asia", "."],
|
||||
"answer": ["false"],
|
||||
"type": "tof"
|
||||
}
|
|
@ -0,0 +1,703 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "9bf2159a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"import numpy as np\n",
|
||||
"from pathlib import Path\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from tensorflow.keras.preprocessing.text import Tokenizer\n",
|
||||
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
|
||||
"from tensorflow.keras.utils import to_categorical\n",
|
||||
"\n",
|
||||
"from tensorflow.keras.models import Model\n",
|
||||
"from tensorflow.keras.layers import (\n",
|
||||
" Input,\n",
|
||||
" Embedding,\n",
|
||||
" LSTM,\n",
|
||||
" Concatenate,\n",
|
||||
" Dense,\n",
|
||||
" TimeDistributed,\n",
|
||||
")\n",
|
||||
"from tensorflow.keras.callbacks import EarlyStopping\n",
|
||||
"from sklearn.metrics import classification_report\n",
|
||||
"from collections import Counter"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"id": "50118278",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# # Load raw data\n",
|
||||
"# with open(\"qg_dataset.json\", encoding=\"utf-8\") as f:\n",
|
||||
"# raw_data = json.load(f)\n",
|
||||
"\n",
|
||||
"# # Validasi lengkap\n",
|
||||
"# required_keys = {\"tokens\", \"ner\", \"srl\", \"question\", \"answer\", \"type\"}\n",
|
||||
"# valid_data = []\n",
|
||||
"# invalid_data = []\n",
|
||||
"\n",
|
||||
"# for idx, item in enumerate(raw_data):\n",
|
||||
"# error_messages = []\n",
|
||||
"\n",
|
||||
"# if not isinstance(item, dict):\n",
|
||||
"# error_messages.append(\"bukan dictionary\")\n",
|
||||
"\n",
|
||||
"# missing_keys = required_keys - item.keys()\n",
|
||||
"# if missing_keys:\n",
|
||||
"# error_messages.append(f\"missing keys: {missing_keys}\")\n",
|
||||
"\n",
|
||||
"# if not error_messages:\n",
|
||||
"# # Cek tipe data dan None\n",
|
||||
"# if (not isinstance(item[\"tokens\"], list) or\n",
|
||||
"# not isinstance(item[\"ner\"], list) or\n",
|
||||
"# not isinstance(item[\"srl\"], list) or\n",
|
||||
"# not isinstance(item[\"question\"], list) or\n",
|
||||
"# not isinstance(item[\"answer\"], list) or\n",
|
||||
"# not isinstance(item[\"type\"], str)):\n",
|
||||
"# error_messages.append(\"field type tidak sesuai\")\n",
|
||||
" \n",
|
||||
"# if error_messages:\n",
|
||||
"# print(f\"\\n Index {idx} | Masalah: {', '.join(error_messages)}\")\n",
|
||||
"# print(json.dumps(item, indent=2, ensure_ascii=False))\n",
|
||||
"# invalid_data.append(item)\n",
|
||||
"# continue\n",
|
||||
"\n",
|
||||
"# valid_data.append(item)\n",
|
||||
"\n",
|
||||
"# # Statistik\n",
|
||||
"# print(f\"\\n Jumlah data valid: {len(valid_data)} / {len(raw_data)}\")\n",
|
||||
"# print(f\" Jumlah data tidak valid: {len(invalid_data)}\")\n",
|
||||
"\n",
|
||||
"# # Proses data valid\n",
|
||||
"# tokens = [[t.lower().strip() for t in item[\"tokens\"]] for item in valid_data]\n",
|
||||
"# ner_tags = [item[\"ner\"] for item in valid_data]\n",
|
||||
"# srl_tags = [item[\"srl\"] for item in valid_data]\n",
|
||||
"# questions = [[token.lower().strip() for token in item[\"question\"]] for item in valid_data]\n",
|
||||
"# answers = [[token.lower().strip() for token in item[\"answer\"]] for item in valid_data]\n",
|
||||
"# types = [item[\"type\"] for item in valid_data]\n",
|
||||
"\n",
|
||||
"# type_counts = Counter(types)\n",
|
||||
"\n",
|
||||
"# print(type_counts)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"id": "970867e2",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"Jumlah data valid: 396 / 397\n",
|
||||
"Jumlah data tidak valid: 1\n",
|
||||
"\n",
|
||||
"Distribusi Tipe Soal:\n",
|
||||
"- isian: 390\n",
|
||||
"- opsi: 4\n",
|
||||
"- true_false: 2\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"from collections import Counter\n",
|
||||
"\n",
|
||||
"# Load raw data\n",
|
||||
"with open(\"../../dataset/dev_dataset_qg.json\", encoding=\"utf-8\") as f:\n",
|
||||
" raw_data = json.load(f)\n",
|
||||
"\n",
|
||||
"# Validasi lengkap\n",
|
||||
"required_keys = {\"tokens\", \"ner\", \"srl\", \"quiz_possibility\"}\n",
|
||||
"valid_data = []\n",
|
||||
"invalid_data = []\n",
|
||||
"\n",
|
||||
"for idx, item in enumerate(raw_data):\n",
|
||||
" error_messages = []\n",
|
||||
"\n",
|
||||
" if not isinstance(item, dict):\n",
|
||||
" error_messages.append(\"bukan dictionary\")\n",
|
||||
" invalid_data.append(item)\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" missing_keys = required_keys - item.keys()\n",
|
||||
" if missing_keys:\n",
|
||||
" error_messages.append(f\"missing keys: {missing_keys}\")\n",
|
||||
"\n",
|
||||
" if not error_messages:\n",
|
||||
" # Cek tipe data utama\n",
|
||||
" if (not isinstance(item[\"tokens\"], list) or\n",
|
||||
" not isinstance(item[\"ner\"], list) or\n",
|
||||
" not isinstance(item[\"srl\"], list) or\n",
|
||||
" not isinstance(item[\"quiz_possibility\"], list)):\n",
|
||||
" error_messages.append(\"field type tidak sesuai di level utama\")\n",
|
||||
"\n",
|
||||
" # Validasi quiz_possibility\n",
|
||||
" if not error_messages:\n",
|
||||
" if not item[\"quiz_possibility\"]:\n",
|
||||
" error_messages.append(\"quiz_possibility kosong\")\n",
|
||||
" else:\n",
|
||||
" quiz_item = item[\"quiz_possibility\"][0]\n",
|
||||
"\n",
|
||||
" # Validasi kunci di dalam quiz_possibility[0]\n",
|
||||
" expected_quiz_keys = {\"type\", \"question\", \"answer\"}\n",
|
||||
" missing_quiz_keys = expected_quiz_keys - quiz_item.keys()\n",
|
||||
"\n",
|
||||
" if missing_quiz_keys:\n",
|
||||
" error_messages.append(f\"missing keys di quiz_possibility[0]: {missing_quiz_keys}\")\n",
|
||||
" else:\n",
|
||||
" # Cek tipe data di quiz_possibility[0]\n",
|
||||
" if (not isinstance(quiz_item[\"type\"], str) or\n",
|
||||
" not isinstance(quiz_item[\"question\"], list) or\n",
|
||||
" not isinstance(quiz_item[\"answer\"], list)):\n",
|
||||
" error_messages.append(\"field type tidak sesuai di quiz_possibility[0]\")\n",
|
||||
" else:\n",
|
||||
" # Flatten ke struktur lama untuk konsistensi\n",
|
||||
" item[\"type\"] = quiz_item[\"type\"]\n",
|
||||
" item[\"question\"] = quiz_item[\"question\"]\n",
|
||||
" item[\"answer\"] = quiz_item[\"answer\"]\n",
|
||||
"\n",
|
||||
" if error_messages:\n",
|
||||
" print(f\"\\nIndex {idx} | Masalah: {', '.join(error_messages)}\")\n",
|
||||
" print(json.dumps(item, indent=2, ensure_ascii=False))\n",
|
||||
" invalid_data.append(item)\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" valid_data.append(item)\n",
|
||||
"\n",
|
||||
"# Statistik\n",
|
||||
"print(f\"\\nJumlah data valid: {len(valid_data)} / {len(raw_data)}\")\n",
|
||||
"print(f\"Jumlah data tidak valid: {len(invalid_data)}\")\n",
|
||||
"\n",
|
||||
"# Proses data valid\n",
|
||||
"tokens = [[t.lower().strip() for t in item[\"tokens\"]] for item in valid_data]\n",
|
||||
"ner_tags = [item[\"ner\"] for item in valid_data]\n",
|
||||
"srl_tags = [item[\"srl\"] for item in valid_data]\n",
|
||||
"questions = [[token.lower().strip() for token in item[\"question\"]] for item in valid_data]\n",
|
||||
"answers = [[token.lower().strip() for token in item[\"answer\"]] for item in valid_data]\n",
|
||||
"types = [item[\"type\"].lower().strip() for item in valid_data] # Konsistensi lowercase untuk tipe\n",
|
||||
"\n",
|
||||
"# Statistik tipe soal\n",
|
||||
"type_counts = Counter(types)\n",
|
||||
"print(\"\\nDistribusi Tipe Soal:\")\n",
|
||||
"for t, count in type_counts.items():\n",
|
||||
" print(f\"- {t}: {count}\")\n",
|
||||
"\n",
|
||||
"# (Opsional) Simpan data valid\n",
|
||||
"with open(\"cleaned_qg_dataset.json\", \"w\", encoding=\"utf-8\") as f:\n",
|
||||
" json.dump(valid_data, f, ensure_ascii=False, indent=2)\n",
|
||||
"\n",
|
||||
"# (Opsional) Simpan data tidak valid untuk analisa\n",
|
||||
"with open(\"invalid_qg_dataset.json\", \"w\", encoding=\"utf-8\") as f:\n",
|
||||
" json.dump(invalid_data, f, ensure_ascii=False, indent=2)\n"
|
||||
]
|
||||
},
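The flattening step above copies quiz_possibility[0] back into top-level type/question/answer fields; note that any further entries in quiz_possibility are ignored. A minimal before/after illustration with a made-up record:

record = {
    "tokens": ["Indonesia", "merdeka", "tahun", "1945", "."],
    "ner": ["B-LOC", "O", "O", "B-DATE", "O"],
    "srl": ["ARG1", "V", "O", "ARGM-TMP", "O"],
    "quiz_possibility": [
        {"type": "isian", "question": ["Indonesia", "merdeka", "tahun", "___"], "answer": ["1945"]}
    ],
}
quiz = record["quiz_possibility"][0]
record["type"], record["question"], record["answer"] = (
    quiz["type"], quiz["question"], quiz["answer"]
)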
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"id": "4e3a0088",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# tokenize\n",
|
||||
"token_tok = Tokenizer(lower=False, oov_token=\"UNK\")\n",
|
||||
"token_ner = Tokenizer(lower=False)\n",
|
||||
"token_srl = Tokenizer(lower=False)\n",
|
||||
"token_q = Tokenizer(lower=False)\n",
|
||||
"token_a = Tokenizer(lower=False)\n",
|
||||
"token_type = Tokenizer(lower=False)\n",
|
||||
"\n",
|
||||
"token_tok.fit_on_texts(tokens)\n",
|
||||
"token_ner.fit_on_texts(ner_tags)\n",
|
||||
"token_srl.fit_on_texts(srl_tags)\n",
|
||||
"token_q.fit_on_texts(questions)\n",
|
||||
"token_a.fit_on_texts(answers)\n",
|
||||
"token_type.fit_on_texts(types)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"maxlen = 20"
|
||||
]
|
||||
},
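maxlen = 20 is a fixed guess here; pad_sequences silently truncates anything longer. A hedged alternative that derives maxlen from the data loaded above, using a high percentile so a few outliers do not inflate padding:

import numpy as np

lengths = [len(seq) for seq in tokens]
print(max(lengths), int(np.percentile(lengths, 95)))
maxlen = int(np.percentile(lengths, 95))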
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"id": "555f9e22",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'opsi', 'isian', 'true_false'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"X_tok = pad_sequences(\n",
|
||||
" token_tok.texts_to_sequences(tokens), padding=\"post\", maxlen=maxlen\n",
|
||||
")\n",
|
||||
"X_ner = pad_sequences(\n",
|
||||
" token_ner.texts_to_sequences(ner_tags), padding=\"post\", maxlen=maxlen\n",
|
||||
")\n",
|
||||
"X_srl = pad_sequences(\n",
|
||||
" token_srl.texts_to_sequences(srl_tags), padding=\"post\", maxlen=maxlen\n",
|
||||
")\n",
|
||||
"y_q = pad_sequences(token_q.texts_to_sequences(questions), padding=\"post\", maxlen=maxlen)\n",
|
||||
"y_a = pad_sequences(token_a.texts_to_sequences(answers), padding=\"post\", maxlen=maxlen)\n",
|
||||
"\n",
|
||||
"print(set(types))\n",
|
||||
"\n",
|
||||
"y_type = [seq[0] for seq in token_type.texts_to_sequences(types)] # list of int\n",
|
||||
"y_type = to_categorical(np.array(y_type) - 1, num_classes=len(token_type.word_index))\n",
|
||||
"\n"
|
||||
]
|
||||
},
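The - 1 before to_categorical exists because Tokenizer ids start at 1 while one-hot classes are 0-based. Worked through for the three types printed above (the exact id order is frequency-based, so it is assumed here):

# Suppose token_type.word_index == {'isian': 1, 'opsi': 2, 'true_false': 3}
# texts_to_sequences(['opsi']) -> [[2]], so seq[0] == 2
# 2 - 1 == 1 -> one-hot [0, 1, 0] with num_classes == 3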
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"id": "f530cfe7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X_tok_train, X_tok_test, X_ner_train, X_ner_test, X_srl_train, X_srl_test, \\\n",
|
||||
"y_q_train, y_q_test, y_a_train, y_a_test, y_type_train, y_type_test = train_test_split(\n",
|
||||
" X_tok, X_ner, X_srl, y_q, y_a, y_type, test_size=0.2, random_state=42\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"X_train = [X_tok_train, X_ner_train, X_srl_train]\n",
|
||||
"X_test = [X_tok_test, X_ner_test, X_srl_test]"
|
||||
]
|
||||
},
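With only 4 'opsi' and 2 'true_false' examples, an unstratified 80/20 split can leave a class entirely out of the test (or train) set. A hedged variant that stratifies on the type label; note it raises if any class has fewer than two examples:

import numpy as np
from sklearn.model_selection import train_test_split

type_ids = np.argmax(y_type, axis=-1)  # back from one-hot to class ids
splits = train_test_split(
    X_tok, X_ner, X_srl, y_q, y_a, y_type,
    test_size=0.2, random_state=42, stratify=type_ids,
)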
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"id": "255e2a9a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">Model: \"functional_1\"</span>\n",
|
||||
"</pre>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"\u001b[1mModel: \"functional_1\"\u001b[0m\n"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
|
||||
"┃<span style=\"font-weight: bold\"> Layer (type) </span>┃<span style=\"font-weight: bold\"> Output Shape </span>┃<span style=\"font-weight: bold\"> Param # </span>┃<span style=\"font-weight: bold\"> Connected to </span>┃\n",
|
||||
"┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
|
||||
"│ tok_input │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ - │\n",
|
||||
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">InputLayer</span>) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ ner_input │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ - │\n",
|
||||
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">InputLayer</span>) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ srl_input │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ - │\n",
|
||||
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">InputLayer</span>) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ embedding_3 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">128</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">116,992</span> │ tok_input[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
|
||||
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ embedding_4 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">16</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">704</span> │ ner_input[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
|
||||
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ embedding_5 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">16</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">336</span> │ srl_input[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
|
||||
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ concatenate_1 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">160</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ embedding_3[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>… │\n",
|
||||
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Concatenate</span>) │ │ │ embedding_4[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>… │\n",
|
||||
"│ │ │ │ embedding_5[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ lstm_1 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">LSTM</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">427,008</span> │ concatenate_1[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>]… │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ get_item_1 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ lstm_1[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
|
||||
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">GetItem</span>) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ question_output │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">479</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">123,103</span> │ lstm_1[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
|
||||
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">TimeDistributed</span>) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ answer_output │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">308</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">79,156</span> │ lstm_1[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
|
||||
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">TimeDistributed</span>) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ type_output (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">4</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">1,028</span> │ get_item_1[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
|
||||
"└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n",
|
||||
"</pre>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
|
||||
"┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mConnected to \u001b[0m\u001b[1m \u001b[0m┃\n",
|
||||
"┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
|
||||
"│ tok_input │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
|
||||
"│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ ner_input │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
|
||||
"│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ srl_input │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
|
||||
"│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ embedding_3 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m116,992\u001b[0m │ tok_input[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
|
||||
"│ (\u001b[38;5;33mEmbedding\u001b[0m) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ embedding_4 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m16\u001b[0m) │ \u001b[38;5;34m704\u001b[0m │ ner_input[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
|
||||
"│ (\u001b[38;5;33mEmbedding\u001b[0m) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ embedding_5 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m16\u001b[0m) │ \u001b[38;5;34m336\u001b[0m │ srl_input[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
|
||||
"│ (\u001b[38;5;33mEmbedding\u001b[0m) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ concatenate_1 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m160\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ embedding_3[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m… │\n",
|
||||
"│ (\u001b[38;5;33mConcatenate\u001b[0m) │ │ │ embedding_4[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m… │\n",
|
||||
"│ │ │ │ embedding_5[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ lstm_1 (\u001b[38;5;33mLSTM\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m427,008\u001b[0m │ concatenate_1[\u001b[38;5;34m0\u001b[0m]… │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ get_item_1 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ lstm_1[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
|
||||
"│ (\u001b[38;5;33mGetItem\u001b[0m) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ question_output │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m479\u001b[0m) │ \u001b[38;5;34m123,103\u001b[0m │ lstm_1[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
|
||||
"│ (\u001b[38;5;33mTimeDistributed\u001b[0m) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ answer_output │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m308\u001b[0m) │ \u001b[38;5;34m79,156\u001b[0m │ lstm_1[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
|
||||
"│ (\u001b[38;5;33mTimeDistributed\u001b[0m) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ type_output (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m4\u001b[0m) │ \u001b[38;5;34m1,028\u001b[0m │ get_item_1[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
|
||||
"└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Total params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">748,327</span> (2.85 MB)\n",
|
||||
"</pre>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"\u001b[1m Total params: \u001b[0m\u001b[38;5;34m748,327\u001b[0m (2.85 MB)\n"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">748,327</span> (2.85 MB)\n",
|
||||
"</pre>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m748,327\u001b[0m (2.85 MB)\n"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Non-trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> (0.00 B)\n",
|
||||
"</pre>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Epoch 1/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 176ms/step - answer_output_accuracy: 0.4544 - answer_output_loss: 5.6455 - loss: 13.1436 - question_output_accuracy: 0.3565 - question_output_loss: 6.1017 - type_output_accuracy: 0.6386 - type_output_loss: 1.3766 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 5.0547 - val_loss: 12.0109 - val_question_output_accuracy: 0.6844 - val_question_output_loss: 5.6110 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 1.3453\n",
|
||||
"Epoch 2/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 54ms/step - answer_output_accuracy: 0.9145 - answer_output_loss: 4.3849 - loss: 10.8584 - question_output_accuracy: 0.6760 - question_output_loss: 5.0255 - type_output_accuracy: 0.9758 - type_output_loss: 1.3371 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 2.1055 - val_loss: 6.1782 - val_question_output_accuracy: 0.6844 - val_question_output_loss: 2.7704 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 1.3023\n",
|
||||
"Epoch 3/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 55ms/step - answer_output_accuracy: 0.9095 - answer_output_loss: 1.7129 - loss: 5.4664 - question_output_accuracy: 0.6777 - question_output_loss: 2.4346 - type_output_accuracy: 0.9795 - type_output_loss: 1.2889 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 1.0023 - val_loss: 4.2358 - val_question_output_accuracy: 0.6844 - val_question_output_loss: 2.0019 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 1.2316\n",
|
||||
"Epoch 4/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 54ms/step - answer_output_accuracy: 0.9140 - answer_output_loss: 0.9210 - loss: 4.2240 - question_output_accuracy: 0.6804 - question_output_loss: 2.1028 - type_output_accuracy: 0.9812 - type_output_loss: 1.2037 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.7526 - val_loss: 4.0127 - val_question_output_accuracy: 0.6844 - val_question_output_loss: 2.1652 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 1.0949\n",
|
||||
"Epoch 5/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 52ms/step - answer_output_accuracy: 0.9131 - answer_output_loss: 0.7388 - loss: 4.0409 - question_output_accuracy: 0.6753 - question_output_loss: 2.2497 - type_output_accuracy: 0.9832 - type_output_loss: 1.0455 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.6789 - val_loss: 3.6821 - val_question_output_accuracy: 0.6844 - val_question_output_loss: 2.1028 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.9003\n",
|
||||
"Epoch 6/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 52ms/step - answer_output_accuracy: 0.9190 - answer_output_loss: 0.6585 - loss: 3.5809 - question_output_accuracy: 0.6788 - question_output_loss: 2.0865 - type_output_accuracy: 0.9797 - type_output_loss: 0.8341 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.6491 - val_loss: 3.3418 - val_question_output_accuracy: 0.6844 - val_question_output_loss: 2.0148 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.6779\n",
|
||||
"Epoch 7/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 54ms/step - answer_output_accuracy: 0.9165 - answer_output_loss: 0.6312 - loss: 3.2776 - question_output_accuracy: 0.6763 - question_output_loss: 2.0259 - type_output_accuracy: 0.9695 - type_output_loss: 0.6233 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.6313 - val_loss: 3.1431 - val_question_output_accuracy: 0.6844 - val_question_output_loss: 2.0432 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.4687\n",
|
||||
"Epoch 8/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 53ms/step - answer_output_accuracy: 0.9148 - answer_output_loss: 0.6209 - loss: 3.0631 - question_output_accuracy: 0.6762 - question_output_loss: 2.0136 - type_output_accuracy: 0.9708 - type_output_loss: 0.4301 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.6193 - val_loss: 2.9071 - val_question_output_accuracy: 0.6844 - val_question_output_loss: 1.9849 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.3029\n",
|
||||
"Epoch 9/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 54ms/step - answer_output_accuracy: 0.9155 - answer_output_loss: 0.6067 - loss: 2.7923 - question_output_accuracy: 0.6799 - question_output_loss: 1.9057 - type_output_accuracy: 0.9747 - type_output_loss: 0.2789 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.6109 - val_loss: 2.7805 - val_question_output_accuracy: 0.6844 - val_question_output_loss: 1.9768 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.1928\n",
|
||||
"Epoch 10/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 54ms/step - answer_output_accuracy: 0.9160 - answer_output_loss: 0.5715 - loss: 2.6738 - question_output_accuracy: 0.6770 - question_output_loss: 1.9091 - type_output_accuracy: 0.9784 - type_output_loss: 0.1873 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.6033 - val_loss: 2.6801 - val_question_output_accuracy: 0.6844 - val_question_output_loss: 1.9506 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.1262\n",
|
||||
"Epoch 11/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 55ms/step - answer_output_accuracy: 0.9159 - answer_output_loss: 0.5691 - loss: 2.5854 - question_output_accuracy: 0.6791 - question_output_loss: 1.8621 - type_output_accuracy: 0.9743 - type_output_loss: 0.1495 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.5962 - val_loss: 2.5971 - val_question_output_accuracy: 0.7031 - val_question_output_loss: 1.9119 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0890\n",
|
||||
"Epoch 12/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 54ms/step - answer_output_accuracy: 0.9151 - answer_output_loss: 0.5528 - loss: 2.4857 - question_output_accuracy: 0.6954 - question_output_loss: 1.8064 - type_output_accuracy: 0.9765 - type_output_loss: 0.1240 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.5907 - val_loss: 2.5231 - val_question_output_accuracy: 0.7031 - val_question_output_loss: 1.8654 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0670\n",
|
||||
"Epoch 13/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 55ms/step - answer_output_accuracy: 0.9116 - answer_output_loss: 0.5741 - loss: 2.4910 - question_output_accuracy: 0.6913 - question_output_loss: 1.7912 - type_output_accuracy: 0.9721 - type_output_loss: 0.1279 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.5874 - val_loss: 2.4624 - val_question_output_accuracy: 0.7031 - val_question_output_loss: 1.8207 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0543\n",
|
||||
"Epoch 14/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 59ms/step - answer_output_accuracy: 0.9142 - answer_output_loss: 0.5370 - loss: 2.4278 - question_output_accuracy: 0.6900 - question_output_loss: 1.7686 - type_output_accuracy: 0.9730 - type_output_loss: 0.1186 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.5837 - val_loss: 2.4136 - val_question_output_accuracy: 0.7031 - val_question_output_loss: 1.7833 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0466\n",
|
||||
"Epoch 15/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 55ms/step - answer_output_accuracy: 0.9160 - answer_output_loss: 0.5186 - loss: 2.3183 - question_output_accuracy: 0.6898 - question_output_loss: 1.7028 - type_output_accuracy: 0.9784 - type_output_loss: 0.1001 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.5794 - val_loss: 2.3714 - val_question_output_accuracy: 0.7109 - val_question_output_loss: 1.7506 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0414\n",
|
||||
"Epoch 16/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 54ms/step - answer_output_accuracy: 0.9171 - answer_output_loss: 0.5077 - loss: 2.2275 - question_output_accuracy: 0.7036 - question_output_loss: 1.6393 - type_output_accuracy: 0.9791 - type_output_loss: 0.0876 - val_answer_output_accuracy: 0.9141 - val_answer_output_loss: 0.5748 - val_loss: 2.3340 - val_question_output_accuracy: 0.7172 - val_question_output_loss: 1.7214 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0379\n",
|
||||
"Epoch 17/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 55ms/step - answer_output_accuracy: 0.9137 - answer_output_loss: 0.5248 - loss: 2.2290 - question_output_accuracy: 0.7070 - question_output_loss: 1.6285 - type_output_accuracy: 0.9828 - type_output_loss: 0.0771 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5716 - val_loss: 2.3017 - val_question_output_accuracy: 0.7172 - val_question_output_loss: 1.6946 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0355\n",
|
||||
"Epoch 18/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 54ms/step - answer_output_accuracy: 0.9233 - answer_output_loss: 0.5080 - loss: 2.2392 - question_output_accuracy: 0.7059 - question_output_loss: 1.6139 - type_output_accuracy: 0.9678 - type_output_loss: 0.1205 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5676 - val_loss: 2.2777 - val_question_output_accuracy: 0.7219 - val_question_output_loss: 1.6760 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0341\n",
|
||||
"Epoch 19/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 53ms/step - answer_output_accuracy: 0.9221 - answer_output_loss: 0.5038 - loss: 2.1188 - question_output_accuracy: 0.7131 - question_output_loss: 1.5706 - type_output_accuracy: 0.9854 - type_output_loss: 0.0616 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5639 - val_loss: 2.2545 - val_question_output_accuracy: 0.7203 - val_question_output_loss: 1.6580 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0326\n",
|
||||
"Epoch 20/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 54ms/step - answer_output_accuracy: 0.9175 - answer_output_loss: 0.5233 - loss: 2.1645 - question_output_accuracy: 0.7128 - question_output_loss: 1.5526 - type_output_accuracy: 0.9775 - type_output_loss: 0.0858 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5603 - val_loss: 2.2376 - val_question_output_accuracy: 0.7234 - val_question_output_loss: 1.6450 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0323\n",
|
||||
"Epoch 21/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 54ms/step - answer_output_accuracy: 0.9193 - answer_output_loss: 0.5090 - loss: 2.1288 - question_output_accuracy: 0.7118 - question_output_loss: 1.5447 - type_output_accuracy: 0.9828 - type_output_loss: 0.0644 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5568 - val_loss: 2.2206 - val_question_output_accuracy: 0.7219 - val_question_output_loss: 1.6317 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0321\n",
|
||||
"Epoch 22/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 54ms/step - answer_output_accuracy: 0.9204 - answer_output_loss: 0.4971 - loss: 2.0726 - question_output_accuracy: 0.7128 - question_output_loss: 1.5100 - type_output_accuracy: 0.9817 - type_output_loss: 0.0626 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5535 - val_loss: 2.2055 - val_question_output_accuracy: 0.7359 - val_question_output_loss: 1.6200 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0320\n",
|
||||
"Epoch 23/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 52ms/step - answer_output_accuracy: 0.9191 - answer_output_loss: 0.5003 - loss: 2.1218 - question_output_accuracy: 0.7108 - question_output_loss: 1.5310 - type_output_accuracy: 0.9762 - type_output_loss: 0.0771 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5517 - val_loss: 2.1920 - val_question_output_accuracy: 0.7234 - val_question_output_loss: 1.6081 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0322\n",
|
||||
"Epoch 24/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 53ms/step - answer_output_accuracy: 0.9220 - answer_output_loss: 0.4808 - loss: 2.0044 - question_output_accuracy: 0.7175 - question_output_loss: 1.4722 - type_output_accuracy: 0.9810 - type_output_loss: 0.0608 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5494 - val_loss: 2.1723 - val_question_output_accuracy: 0.7312 - val_question_output_loss: 1.5905 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0323\n",
|
||||
"Epoch 25/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 58ms/step - answer_output_accuracy: 0.9183 - answer_output_loss: 0.4965 - loss: 2.0500 - question_output_accuracy: 0.7174 - question_output_loss: 1.4835 - type_output_accuracy: 0.9775 - type_output_loss: 0.0676 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5473 - val_loss: 2.1609 - val_question_output_accuracy: 0.7328 - val_question_output_loss: 1.5810 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0326\n",
|
||||
"Epoch 26/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 52ms/step - answer_output_accuracy: 0.9236 - answer_output_loss: 0.4672 - loss: 1.9620 - question_output_accuracy: 0.7220 - question_output_loss: 1.4313 - type_output_accuracy: 0.9780 - type_output_loss: 0.0672 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5454 - val_loss: 2.1488 - val_question_output_accuracy: 0.7344 - val_question_output_loss: 1.5705 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0328\n",
|
||||
"Epoch 27/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 70ms/step - answer_output_accuracy: 0.9219 - answer_output_loss: 0.4671 - loss: 1.9415 - question_output_accuracy: 0.7288 - question_output_loss: 1.4130 - type_output_accuracy: 0.9765 - type_output_loss: 0.0605 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5440 - val_loss: 2.1382 - val_question_output_accuracy: 0.7359 - val_question_output_loss: 1.5615 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0327\n",
|
||||
"Epoch 28/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 52ms/step - answer_output_accuracy: 0.9212 - answer_output_loss: 0.4676 - loss: 1.9277 - question_output_accuracy: 0.7271 - question_output_loss: 1.4106 - type_output_accuracy: 0.9823 - type_output_loss: 0.0526 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5435 - val_loss: 2.1317 - val_question_output_accuracy: 0.7422 - val_question_output_loss: 1.5559 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0323\n",
|
||||
"Epoch 29/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 57ms/step - answer_output_accuracy: 0.9228 - answer_output_loss: 0.4658 - loss: 1.8773 - question_output_accuracy: 0.7397 - question_output_loss: 1.3683 - type_output_accuracy: 0.9823 - type_output_loss: 0.0487 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5428 - val_loss: 2.1239 - val_question_output_accuracy: 0.7437 - val_question_output_loss: 1.5493 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0319\n",
|
||||
"Epoch 30/30\n",
|
||||
"\u001b[1m5/5\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 56ms/step - answer_output_accuracy: 0.9207 - answer_output_loss: 0.4658 - loss: 1.9146 - question_output_accuracy: 0.7355 - question_output_loss: 1.3799 - type_output_accuracy: 0.9795 - type_output_loss: 0.0563 - val_answer_output_accuracy: 0.9219 - val_answer_output_loss: 0.5421 - val_loss: 2.1174 - val_question_output_accuracy: 0.7437 - val_question_output_loss: 1.5436 - val_type_output_accuracy: 1.0000 - val_type_output_loss: 0.0317\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"inp_tok = Input(shape=(None,), name=\"tok_input\")\n",
|
||||
"inp_ner = Input(shape=(None,), name=\"ner_input\")\n",
|
||||
"inp_srl = Input(shape=(None,), name=\"srl_input\")\n",
|
||||
"\n",
|
||||
"emb_tok = Embedding(input_dim=len(token_tok.word_index) + 1, output_dim=128)(inp_tok)\n",
|
||||
"emb_ner = Embedding(input_dim=len(token_ner.word_index) + 1, output_dim=16)(inp_ner)\n",
|
||||
"emb_srl = Embedding(input_dim=len(token_srl.word_index) + 1, output_dim=16)(inp_srl)\n",
|
||||
"\n",
|
||||
"# emb_tok = Embedding(input_dim=..., output_dim=..., mask_zero=True)(inp_tok)\n",
|
||||
"# emb_ner = Embedding(input_dim=..., output_dim=..., mask_zero=True)(inp_ner)\n",
|
||||
"# emb_srl = Embedding(input_dim=..., output_dim=..., mask_zero=True)(inp_srl)\n",
|
||||
"\n",
|
||||
"merged = Concatenate()([emb_tok, emb_ner, emb_srl])\n",
|
||||
"\n",
|
||||
"x = LSTM(256, return_sequences=True)(merged)\n",
|
||||
"\n",
|
||||
"out_question = TimeDistributed(Dense(len(token_q.word_index) + 1, activation=\"softmax\"), name=\"question_output\")(x)\n",
|
||||
"out_answer = TimeDistributed(Dense(len(token_a.word_index) + 1, activation=\"softmax\"), name=\"answer_output\")(x)\n",
|
||||
"out_type = Dense(len(token_type.word_index), activation=\"softmax\", name=\"type_output\")(\n",
|
||||
" x[:, 0, :]\n",
|
||||
") # gunakan step pertama\n",
|
||||
"\n",
|
||||
"model = Model(\n",
|
||||
" inputs=[inp_tok, inp_ner, inp_srl], outputs=[out_question, out_answer, out_type]\n",
|
||||
")\n",
|
||||
"model.compile(\n",
|
||||
" optimizer=\"adam\",\n",
|
||||
" loss={\n",
|
||||
" \"question_output\": \"sparse_categorical_crossentropy\",\n",
|
||||
" \"answer_output\": \"sparse_categorical_crossentropy\",\n",
|
||||
" \"type_output\": \"categorical_crossentropy\",\n",
|
||||
" },\n",
|
||||
" metrics={\n",
|
||||
" \"question_output\": \"accuracy\",\n",
|
||||
" \"answer_output\": \"accuracy\",\n",
|
||||
" \"type_output\": \"accuracy\",\n",
|
||||
" },\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"model.summary()\n",
|
||||
"\n",
|
||||
"# ----------------------------------------------------------------------------\n",
|
||||
"# 5. TRAINING\n",
|
||||
"# ----------------------------------------------------------------------------\n",
|
||||
"model.fit(\n",
|
||||
" X_train,\n",
|
||||
" {\n",
|
||||
" \"question_output\": np.expand_dims(y_q_train, -1),\n",
|
||||
" \"answer_output\": np.expand_dims(y_a_train, -1),\n",
|
||||
" \"type_output\": y_type_train,\n",
|
||||
" },\n",
|
||||
" batch_size=64,\n",
|
||||
" epochs=30,\n",
|
||||
" validation_split=0.1,\n",
|
||||
" callbacks=[EarlyStopping(patience=3, restore_best_weights=True)],\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"import pickle\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"model.save(\"new_model_lstm_qg.keras\")\n",
|
||||
"with open(\"tokenizers.pkl\", \"wb\") as f:\n",
|
||||
" pickle.dump({\n",
|
||||
" \"token\": token_tok,\n",
|
||||
" \"ner\": token_ner,\n",
|
||||
" \"srl\": token_srl,\n",
|
||||
" \"question\": token_q,\n",
|
||||
" \"answer\": token_a,\n",
|
||||
" \"type\": token_type\n",
|
||||
" }, f)\n",
|
||||
"\n"
|
||||
]
|
||||
},
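Whatever consumes this model later (e.g. the inference script changed further down in this diff) has to reload both artifacts saved here. A minimal round-trip sketch using the keys written by this cell:

import pickle
from tensorflow.keras.models import load_model

model = load_model("new_model_lstm_qg.keras")
with open("tokenizers.pkl", "rb") as f:
    tokenizers = pickle.load(f)

tok_token = tokenizers["token"]
tok_q = tokenizers["question"]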
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"id": "06fd86c7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\u001b[1m3/3\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 137ms/step\n",
|
||||
"\n",
|
||||
"=== Akurasi Detail ===\n",
|
||||
"Question Accuracy (Token-level): 0.1519\n",
|
||||
"Answer Accuracy (Token-level) : 0.0638\n",
|
||||
"Type Accuracy (Class-level) : 1.00\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"def token_level_accuracy(y_true, y_pred):\n",
|
||||
" correct = 0\n",
|
||||
" total = 0\n",
|
||||
" for true_seq, pred_seq in zip(y_true, y_pred):\n",
|
||||
" for t, p in zip(true_seq, pred_seq):\n",
|
||||
" if t != 0: # ignore padding\n",
|
||||
" total += 1\n",
|
||||
" if t == p:\n",
|
||||
" correct += 1\n",
|
||||
" return correct / total if total > 0 else 0\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Predict on test set\n",
|
||||
"y_pred_q, y_pred_a, y_pred_type = model.predict(X_test)\n",
|
||||
"\n",
|
||||
"# Decode predictions to class indices\n",
|
||||
"y_pred_q = np.argmax(y_pred_q, axis=-1)\n",
|
||||
"y_pred_a = np.argmax(y_pred_a, axis=-1)\n",
|
||||
"y_pred_type = np.argmax(y_pred_type, axis=-1)\n",
|
||||
"y_true_type = np.argmax(y_type_test, axis=-1)\n",
|
||||
"\n",
|
||||
"# Calculate token-level accuracy\n",
|
||||
"acc_q = token_level_accuracy(y_q_test, y_pred_q)\n",
|
||||
"acc_a = token_level_accuracy(y_a_test, y_pred_a)\n",
|
||||
"\n",
|
||||
"# Type classification report\n",
|
||||
"report_type = classification_report(y_true_type, y_pred_type, zero_division=0)\n",
|
||||
"\n",
|
||||
"# Print Results\n",
|
||||
"print(\"\\n=== Akurasi Detail ===\")\n",
|
||||
"print(f\"Question Accuracy (Token-level): {acc_q:.4f}\")\n",
|
||||
"print(f\"Answer Accuracy (Token-level) : {acc_a:.4f}\")\n",
|
||||
"print(f\"Type Accuracy (Class-level) : {np.mean(y_true_type == y_pred_type):.2f}\")"
|
||||
]
|
||||
},
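The gap between these token-level scores and the high accuracies printed during training is largely because the built-in metric also counts padded positions, which this function masks out. An equivalent vectorized version of the loop above, with the same masking rule (ignore id 0):

import numpy as np

def token_level_accuracy_np(y_true, y_pred):
    mask = y_true != 0  # ignore padding positions
    total = mask.sum()
    return float(((y_true == y_pred) & mask).sum() / total) if total else 0.0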
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"id": "b17b6470",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# import sacrebleu\n",
|
||||
"# from sacrebleu.metrics import BLEU # optional kalau mau smoothing/effective_order\n",
|
||||
"\n",
|
||||
"# idx2tok = {v:k for k,v in word2idx.items()}\n",
|
||||
"# PAD_ID = word2idx[\"PAD\"]\n",
|
||||
"# SOS_ID = word2idx.get(\"SOS\", None)\n",
|
||||
"# EOS_ID = word2idx.get(\"EOS\", None)\n",
|
||||
"\n",
|
||||
"# def seq2str(seq):\n",
|
||||
"# \"\"\"Konversi list index -> kalimat string, sambil buang token spesial.\"\"\"\n",
|
||||
"# toks = [idx2tok[i] for i in seq\n",
|
||||
"# if i not in {PAD_ID, SOS_ID, EOS_ID}]\n",
|
||||
"# return \" \".join(toks).strip().lower()\n",
|
||||
"\n",
|
||||
"# bleu_metric = BLEU(effective_order=True) # lebih stabil utk kalimat pendek\n",
|
||||
"\n",
|
||||
"# def bleu_corpus(pred_seqs, true_seqs):\n",
|
||||
"# preds = [seq2str(p) for p in pred_seqs]\n",
|
||||
"# refs = [[seq2str(t)] for t in true_seqs] # list‑of‑list, satu ref/kalimat\n",
|
||||
"# return bleu_metric.corpus_score(preds, refs).score\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"id": "d5ed106c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"# flat_true_a, flat_pred_a = flatten_valid(y_a_test, y_pred_a_class)\n",
|
||||
"# print(\"\\n=== Classification Report: ANSWER ===\")\n",
|
||||
"# print(classification_report(flat_true_a, flat_pred_a))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"id": "aa3860de",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"# print(\"\\n=== Classification Report: TYPE ===\")\n",
|
||||
"# print(classification_report(y_true_type_class, y_pred_type_class))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "myenv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
|
@ -6,10 +6,10 @@ import numpy as np
|||
def infer_from_input(input_data, maxlen=50):

    with open("QC/tokenizers.pkl", "rb") as f:
    with open("tokenizers.pkl", "rb") as f:
        tokenizers = pickle.load(f)

    model = load_model("QC/new_model_lstm_qg.keras")
    model = load_model("new_model_lstm_qg.keras")

    tok_token = tokenizers["token"]
    tok_ner = tokenizers["ner"]
|
@ -63,42 +63,34 @@ if __name__ == "__main__":
|||
    # Example input
    input_data = {
        "tokens": [
            "Ki",
            "Hajar",
            "Dewantara",
            "lahir",
            "pada",
            "2",
            "Mei",
            "1889",
            "di",
            "Yogyakarta",
            "Mars",
            "disebut",
            "juga",
            "sebagai",
            "planet",
            "merah",
            "karena",
            "permukaannya",
            "banyak",
            "mengandung",
            "zat",
            "besi",
            ".",
        ],
        "ner": [
            "B-PER",
            "I-PER",
            "I-PER",
            "O",
            "O",
            "B-DATE",
            "I-DATE",
            "I-DATE",
            "O",
            "B-LOC",
            "O",
        ],
        "ner": ["B-LOC", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"],
        "srl": [
            "ARG0",
            "ARG0",
            "ARG0",
            "V",
            "O",
            "ARGM-TMP",
            "ARGM-TMP",
            "ARGM-TMP",
            "O",
            "ARGM-LOC",
            "ARG1",
            "ARG1",
            "ARGM-CAU",
            "ARG1",
            "ARGM-MNR",
            "ARGM-MNR",
            "ARG1",
            "ARG1",
            "O",
        ],
    }
|
@ -0,0 +1,332 @@
|||
import numpy as np
import pandas as pd
import json
import random
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
    Input,
    LSTM,
    Dense,
    Embedding,
    Bidirectional,
    Concatenate,
    Attention,
    Dropout,
)
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re
import string
from collections import Counter


with open("../dataset/stable_qg_qa_train_dataset.json", "r") as f:
    data = json.load(f)


# Preprocessing function
def preprocess_text(text):
    """Perform basic text preprocessing."""
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    return text


# Prepare data for the model
def prepare_data(data):
    """Flatten the dataset: one row per question-answer pair."""
    contexts = []
    tokens_list = []
    ner_list = []
    srl_list = []
    questions = []
    answers = []
    q_types = []

    for item in data:
        for qa in item["qas"]:
            contexts.append(preprocess_text(item["context"]))
            tokens_list.append(item["tokens"])
            ner_list.append(item["ner"])
            srl_list.append(item["srl"])
            questions.append(preprocess_text(qa["question"]))
            answers.append(qa["answer"])
            q_types.append(qa["type"])

    return contexts, tokens_list, ner_list, srl_list, questions, answers, q_types
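# illustrative note: a context with two QA pairs yields two rows that share the
# same context/tokens/ner/srl, so all seven lists above stay index-aligned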


# Prepare the data
contexts, tokens_list, ner_list, srl_list, questions, answers, q_types = prepare_data(
    data
)

# Tokenizer for text (context and question)
max_vocab_size = 10000
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(contexts + questions + [" ".join(item) for item in tokens_list])
vocab_size = len(tokenizer.word_index) + 1

# Encoding for NER
ner_tokenizer = Tokenizer(oov_token="<OOV>")
ner_tokenizer.fit_on_texts([" ".join(ner) for ner in ner_list])
ner_vocab_size = len(ner_tokenizer.word_index) + 1

# Encoding for SRL
srl_tokenizer = Tokenizer(oov_token="<OOV>")
srl_tokenizer.fit_on_texts([" ".join(srl) for srl in srl_list])
srl_vocab_size = len(srl_tokenizer.word_index) + 1

# Encoding for question types
q_type_tokenizer = Tokenizer()
q_type_tokenizer.fit_on_texts(q_types)
q_type_vocab_size = len(q_type_tokenizer.word_index) + 1


# Convert tokens, NER, and SRL to sequences
def tokens_to_sequences(tokens, ner, srl):
    """Convert token, NER, and SRL tag lists to integer sequences."""
    token_seqs = [tokenizer.texts_to_sequences([" ".join(t)])[0] for t in tokens]
    ner_seqs = [ner_tokenizer.texts_to_sequences([" ".join(n)])[0] for n in ner]
    srl_seqs = [srl_tokenizer.texts_to_sequences([" ".join(s)])[0] for s in srl]
    return token_seqs, ner_seqs, srl_seqs
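# caution (library default, worth checking here): Keras Tokenizer filters
# punctuation and hyphens unless filters="" is passed, so a "." token can vanish
# and a tag like "ARGM-TMP" can split in two, leaving these sequences shorter or
# longer than their token/NER/SRL rows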


# Determine maximum lengths for padding
context_seqs = tokenizer.texts_to_sequences(contexts)
question_seqs = tokenizer.texts_to_sequences(questions)
token_seqs, ner_seqs, srl_seqs = tokens_to_sequences(tokens_list, ner_list, srl_list)

max_context_len = max([len(seq) for seq in context_seqs])
max_question_len = max([len(seq) for seq in question_seqs])
max_token_len = max([len(seq) for seq in token_seqs])


# Pad sequences so every input has the same length
def pad_all_sequences(context_seqs, question_seqs, token_seqs, ner_seqs, srl_seqs):
    """Pad all sequences."""
    context_padded = pad_sequences(context_seqs, maxlen=max_context_len, padding="post")
    question_padded = pad_sequences(
        question_seqs, maxlen=max_question_len, padding="post"
    )
    token_padded = pad_sequences(token_seqs, maxlen=max_token_len, padding="post")
    ner_padded = pad_sequences(ner_seqs, maxlen=max_token_len, padding="post")
    srl_padded = pad_sequences(srl_seqs, maxlen=max_token_len, padding="post")
    return context_padded, question_padded, token_padded, ner_padded, srl_padded


# Prepare the encoder for answers
answer_tokenizer = Tokenizer(oov_token="<OOV>")
answer_tokenizer.fit_on_texts(answers)
answer_vocab_size = len(answer_tokenizer.word_index) + 1

# Encode question types - FIX - use the index directly instead of a sequence
q_type_indices = []
for q_type in q_types:
    # Look up the question-type index (0 is reserved for unseen types)
    q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
    q_type_indices.append(q_type_idx)

# Convert to a numpy array
q_type_indices = np.array(q_type_indices)

# One-hot encode the question types
q_type_categorical = tf.keras.utils.to_categorical(
    q_type_indices, num_classes=q_type_vocab_size
)

# Pad sequences
context_padded, question_padded, token_padded, ner_padded, srl_padded = (
    pad_all_sequences(context_seqs, question_seqs, token_seqs, ner_seqs, srl_seqs)
)

# Encode answers
answer_seqs = answer_tokenizer.texts_to_sequences(answers)
max_answer_len = max([len(seq) for seq in answer_seqs])
answer_padded = pad_sequences(answer_seqs, maxlen=max_answer_len, padding="post")

# Split data into train and test sets
indices = list(range(len(context_padded)))
train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)


# Helper to take a subset of the data by indices
def get_subset(data, indices):
    return np.array([data[i] for i in indices])
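# note: the padded arrays are already numpy arrays, so fancy indexing
# (e.g. context_padded[train_indices]) would be an equivalent one-liner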


# Train data
train_context = get_subset(context_padded, train_indices)
train_question = get_subset(question_padded, train_indices)
train_token = get_subset(token_padded, train_indices)
train_ner = get_subset(ner_padded, train_indices)
train_srl = get_subset(srl_padded, train_indices)
train_q_type = get_subset(q_type_categorical, train_indices)
train_answer = get_subset(answer_padded, train_indices)

# Test data
test_context = get_subset(context_padded, test_indices)
test_question = get_subset(question_padded, test_indices)
test_token = get_subset(token_padded, test_indices)
test_ner = get_subset(ner_padded, test_indices)
test_srl = get_subset(srl_padded, test_indices)
test_q_type = get_subset(q_type_categorical, test_indices)
test_answer = get_subset(answer_padded, test_indices)

# Hyperparameters
embedding_dim = 100
lstm_units = 128
ner_embedding_dim = 50
srl_embedding_dim = 50
dropout_rate = 0.3


# Build the model
def create_qa_model():
    # Input layers
    context_input = Input(shape=(max_context_len,), name="context_input")
    question_input = Input(shape=(max_question_len,), name="question_input")
    token_input = Input(shape=(max_token_len,), name="token_input")
    ner_input = Input(shape=(max_token_len,), name="ner_input")
    srl_input = Input(shape=(max_token_len,), name="srl_input")
    q_type_input = Input(shape=(q_type_vocab_size,), name="q_type_input")

    # Shared embedding layer for text
    text_embedding = Embedding(vocab_size, embedding_dim, name="text_embedding")

    # Embeddings for NER and SRL
    ner_embedding = Embedding(ner_vocab_size, ner_embedding_dim, name="ner_embedding")(
        ner_input
    )
    srl_embedding = Embedding(srl_vocab_size, srl_embedding_dim, name="srl_embedding")(
        srl_input
    )

    # Apply embeddings
    context_embed = text_embedding(context_input)
    question_embed = text_embedding(question_input)
    token_embed = text_embedding(token_input)

    # Bidirectional LSTMs for context and token-level features
    context_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, name="context_lstm")
    )(context_embed)
    question_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, name="question_lstm")
    )(question_embed)

    # Concat token features (tokens, NER, SRL)
    token_features = Concatenate(name="token_features")(
        [token_embed, ner_embedding, srl_embedding]
    )
    token_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, name="token_lstm")
    )(token_features)

    # Attention over the context, conditioned on the question
    context_attention = tf.keras.layers.Attention(name="context_attention")(
        [context_lstm, question_lstm]
    )

    # Pool attention outputs
    context_att_pool = tf.keras.layers.GlobalMaxPooling1D(name="context_att_pool")(
        context_attention
    )
    question_pool = tf.keras.layers.GlobalMaxPooling1D(name="question_pool")(
        question_lstm
    )
    token_pool = tf.keras.layers.GlobalMaxPooling1D(name="token_pool")(token_lstm)

    # Concat all features
    all_features = Concatenate(name="all_features")(
        [context_att_pool, question_pool, token_pool, q_type_input]
    )

    # Dense layers
    x = Dense(256, activation="relu", name="dense_1")(all_features)
    x = Dropout(dropout_rate)(x)
    x = Dense(128, activation="relu", name="dense_2")(x)
    x = Dropout(dropout_rate)(x)

    # Output layer for the answer
    answer_output = Dense(
        answer_vocab_size, activation="softmax", name="answer_output"
    )(x)

    # Create model
    model = Model(
        inputs=[
            context_input,
            question_input,
            token_input,
            ner_input,
            srl_input,
            q_type_input,
        ],
        outputs=answer_output,
    )

    # Compile model
    model.compile(
        optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
    )

    return model


# Create the model
model = create_qa_model()
model.summary()

# Callback to save the best model
checkpoint = ModelCheckpoint(
    "qa_lstm_model.h5", monitor="val_accuracy", save_best_only=True, verbose=1
)

early_stop = EarlyStopping(monitor="val_accuracy", patience=5, verbose=1)

# Training
batch_size = 8
epochs = 50

# Reshape answers for sparse categorical crossentropy
train_answer_labels = train_answer[:, 0]  # take the first token index of each answer
test_answer_labels = test_answer[:, 0]

# Train model
history = model.fit(
    [train_context, train_question, train_token, train_ner, train_srl, train_q_type],
    train_answer_labels,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(
        [test_context, test_question, test_token, test_ner, test_srl, test_q_type],
        test_answer_labels,
    ),
    callbacks=[checkpoint, early_stop],
)


# Save the model and tokenizers
model.save("qa_lstm_model_final.keras")  # .keras format, matching the inference script

# Save the tokenizers
tokenizer_data = {
    "word_tokenizer": tokenizer.to_json(),
    "ner_tokenizer": ner_tokenizer.to_json(),
    "srl_tokenizer": srl_tokenizer.to_json(),
    "answer_tokenizer": answer_tokenizer.to_json(),
    "q_type_tokenizer": q_type_tokenizer.to_json(),
    "max_context_len": max_context_len,
    "max_question_len": max_question_len,
    "max_token_len": max_token_len,
}

with open("qa_tokenizers.json", "w") as f:
    json.dump(tokenizer_data, f)

print("Model and tokenizers saved successfully!")
|
@ -0,0 +1,161 @@
|||
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import tokenizer_from_json
import re
import random

# Load tokenizers and model configurations
with open("qa_tokenizers.json", "r") as f:
    tokenizer_data = json.load(f)

tokenizer = tokenizer_from_json(tokenizer_data["word_tokenizer"])
ner_tokenizer = tokenizer_from_json(tokenizer_data["ner_tokenizer"])
srl_tokenizer = tokenizer_from_json(tokenizer_data["srl_tokenizer"])
answer_tokenizer = tokenizer_from_json(tokenizer_data["answer_tokenizer"])
q_type_tokenizer = tokenizer_from_json(tokenizer_data["q_type_tokenizer"])

max_context_len = tokenizer_data["max_context_len"]
max_question_len = tokenizer_data["max_question_len"]
max_token_len = tokenizer_data["max_token_len"]
q_type_vocab_size = len(q_type_tokenizer.word_index) + 1

# Load trained model
model = load_model("qa_lstm_model_final.keras")


def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    return text


def predict_answer(context, question, tokens, ner, srl, q_type):
    context_seq = tokenizer.texts_to_sequences([preprocess_text(context)])
    question_seq = tokenizer.texts_to_sequences([preprocess_text(question)])
    token_seq = [tokenizer.texts_to_sequences([" ".join(tokens)])[0]]
    ner_seq = [ner_tokenizer.texts_to_sequences([" ".join(ner)])[0]]
    srl_seq = [srl_tokenizer.texts_to_sequences([" ".join(srl)])[0]]

    q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
    q_type_cat = tf.keras.utils.to_categorical(
        [q_type_idx], num_classes=q_type_vocab_size
    )

    # Pad sequences
    context_pad = pad_sequences(context_seq, maxlen=max_context_len, padding="post")
    question_pad = pad_sequences(question_seq, maxlen=max_question_len, padding="post")
    token_pad = pad_sequences(token_seq, maxlen=max_token_len, padding="post")
    ner_pad = pad_sequences(ner_seq, maxlen=max_token_len, padding="post")
    srl_pad = pad_sequences(srl_seq, maxlen=max_token_len, padding="post")

    # Predict
    prediction = model.predict(
        [context_pad, question_pad, token_pad, ner_pad, srl_pad, q_type_cat], verbose=0
    )
    answer_idx = np.argmax(prediction[0])

    # Retrieve predicted answer word
    for word, idx in answer_tokenizer.word_index.items():
        if idx == answer_idx:
            return word

    return "Unknown"


def generate_question_answer(context, tokens, ner, srl, question_type="isian"):
    entities = {}
    predicate = ""

    for i, token in enumerate(tokens):
        if ner[i] != "O":
            entities.setdefault(ner[i], []).append(token)
        if srl[i] == "V":
            predicate = token
        elif srl[i].startswith("ARG"):
            entities.setdefault(srl[i], []).append(token)

    subject = " ".join(entities.get("ARG0", [""]))

    if question_type == "isian":
        if "LOC" in entities:
            location = " ".join(entities["LOC"])
            return f"Dimana {subject} {predicate} ___", location
        elif "DATE" in entities:
            date = " ".join(entities["DATE"])
            return f"Kapan {subject} {predicate} ___", date

    elif question_type == "true_false":
        if "DATE" in entities:
            original_date = " ".join(entities["DATE"])
            try:
                modified_year = str(int(entities["DATE"][-1]) + random.randint(1, 5))
                modified_date = (
                    f"{entities['DATE'][0]} {entities['DATE'][1]} {modified_year}"
                )
            except (ValueError, IndexError):  # non-numeric year or too few date tokens
                modified_date = original_date  # fallback if parsing fails
            return f"{subject} {predicate} pada {modified_date} ___", "false"

    elif question_type == "opsi":
        if "LOC" in entities:
            correct_location = " ".join(entities["LOC"])
            distractors = ["singasari", "kuta", "banten", "kediri", "makassar"]
            distractors = [d for d in distractors if d != correct_location]
            options = random.sample(distractors, 3) + [correct_location]
            random.shuffle(options)
            return f"Dimana {subject} {predicate} ___", options, correct_location

    return "Apa yang terjadi dalam teks ini ___", context


# ✅ Example Usage with Random Sampling
if __name__ == "__main__":
    with open("../dataset/stable_qg_qa_train_dataset.json", "r") as f:
        data = json.load(f)

    # Randomly select an example for testing
    test_item = random.choice(data)
    test_qa = random.choice(test_item["qas"])

    predicted_answer = predict_answer(
        test_item["context"],
        test_qa["question"],
        test_item["tokens"],
        test_item["ner"],
        test_item["srl"],
        test_qa["type"],
    )

    print(f"Context: {test_item['context']}")
    print(f"Question: {test_qa['question']}")
    print(f"True Answer: {test_qa['answer']}")
    print(f"Predicted Answer: {predicted_answer}")

    # Generate Random Question Example
    example_context = test_item["context"]
    example_tokens = test_item["tokens"]
    example_ner = test_item["ner"]
    example_srl = test_item["srl"]

    random_question_type = random.choice(["isian", "true_false", "opsi"])

    result = generate_question_answer(
        example_context, example_tokens, example_ner, example_srl, random_question_type
    )

    print("\nGenerated Question Example:")
    print(f"Context: {example_context}")
    print(f"Question Type: {random_question_type}")

    # the "opsi" branch may fall back to a 2-tuple when no LOC entity exists
    if random_question_type == "opsi" and len(result) == 3:
        question, options, correct_answer = result
        print(f"Generated Question: {question}")
        print(f"Options: {options}")
        print(f"Correct Answer: {correct_answer}")
    else:
        question, answer = result
        print(f"Generated Question: {question}")
        print(f"Answer: {answer}")
|
@ -0,0 +1,54 @@
|||
import json
import re
from collections import OrderedDict


def normalize_question(text):
    text = re.sub(r'\s+([?.!,])', r'\1', text)
    return text.capitalize()
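# e.g. normalize_question("dimana kartini lahir ?") -> "Dimana kartini lahir?"
# (the space before the "?" is removed, then the sentence is capitalized)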

# Load data
with open('../dataset/stable_qg_qa_train_dataset.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

processed_data = []

for idx_entry, entry in enumerate(data):
    if not isinstance(entry, dict):
        continue

    if "context" not in entry:
        entry["context"] = " ".join(entry.get("tokens", []))

    # Update NER tags: map 'V' to 'O'
    ner_tags = entry.get("ner", [])
    entry["ner"] = ["O" if tag == "V" else tag for tag in ner_tags]

    for idx_qa, qa in enumerate(entry.get("qas", [])):
        if "id" not in qa:
            qa["id"] = f"qa_{idx_entry}_q{idx_qa + 1}"

        answer = qa.get("answer")
        if isinstance(answer, list):
            qa["answer"] = " ".join(answer)

        question = qa.get("question")
        if isinstance(question, list):
            question_str = " ".join(question)
            qa["question"] = normalize_question(question_str)

    # Reorder fields: context first, then the rest
    ordered_entry = OrderedDict()
    if "context" in entry:
        ordered_entry["context"] = entry.pop("context")
    # Add remaining fields in their original order
    for key, value in entry.items():
        ordered_entry[key] = value

    processed_data.append(ordered_entry)

# Save result
with open('data_converted.json', 'w', encoding='utf-8') as file:
    json.dump(processed_data, file, indent=2, ensure_ascii=False)

# Optional: Print first 2 entries for quick verification
print(json.dumps(processed_data[:2], indent=2, ensure_ascii=False))
|
@ -0,0 +1,3 @@
|||
BLEU Score: 0.0585
Validation Accuracy: 0.6740
Validation Loss: 1.8080
|
@ -0,0 +1,178 @@
|||
[
  {
    "context": "raden ajeng kartini lahir pada 21 april 1879 di jepara",
    "tokens": [
      "raden",
      "ajeng",
      "kartini",
      "lahir",
      "pada",
      "21",
      "april",
      "1879",
      "di",
      "jepara"
    ],
    "ner": [
      "PER",
      "PER",
      "PER",
      "O",
      "O",
      "DATE",
      "DATE",
      "DATE",
      "O",
      "LOC"
    ],
    "srl": [
      "ARG0",
      "ARG0",
      "ARG0",
      "V",
      "O",
      "ARGM-TMP",
      "ARGM-TMP",
      "ARGM-TMP",
      "O",
      "ARGM-LOC"
    ],
    "qas": [
      {
        "type": "isian",
        "question": "Dimana kartini lahir ___",
        "answer": "jepara",
        "id": "qa_0_q1"
      },
      {
        "type": "true_false",
        "question": "Kartini lahir pada tanggal 21 mei 1879 ___",
        "options": [
          "true",
          "false"
        ],
        "answer": "false",
        "id": "qa_0_q2"
      }
    ]
  },
  {
    "context": "kerajaan majapahit berdiri pada tahun 1293 di trowulan",
    "tokens": [
      "kerajaan",
      "majapahit",
      "berdiri",
      "pada",
      "tahun",
      "1293",
      "di",
      "trowulan"
    ],
    "ner": [
      "O",
      "ORG",
      "O",
      "O",
      "O",
      "DATE",
      "O",
      "LOC"
    ],
    "srl": [
      "ARG1",
      "ARG1",
      "V",
      "O",
      "O",
      "ARGM-TMP",
      "O",
      "ARGM-LOC"
    ],
    "qas": [
      {
        "type": "opsi",
        "question": "Dimana kerajaan majapahit berdiri ___",
        "options": [
          "trowulan",
          "singasari",
          "kuta",
          "banten"
        ],
        "answer": "trowulan",
        "id": "qa_1_q1"
      },
      {
        "type": "true_false",
        "question": "Kerajaan majapahit berdiri pada tahun 1300 ___",
        "options": [
          "true",
          "false"
        ],
        "answer": "false",
        "id": "qa_1_q2"
      }
    ]
  },
  {
    "context": "soekarno dan mohammad hatta memproklamasikan kemerdekaan indonesia pada 17 agustus 1945",
    "tokens": [
      "soekarno",
      "dan",
      "mohammad",
      "hatta",
      "memproklamasikan",
      "kemerdekaan",
      "indonesia",
      "pada",
      "17",
      "agustus",
      "1945"
    ],
    "ner": [
      "PER",
      "O",
      "PER",
      "PER",
      "O",
      "O",
      "LOC",
      "O",
      "DATE",
      "DATE",
      "DATE"
    ],
    "srl": [
      "ARG0",
      "O",
      "ARG0",
      "ARG0",
      "V",
      "ARG1",
      "ARGM-LOC",
      "O",
      "ARGM-TMP",
      "ARGM-TMP",
      "ARGM-TMP"
    ],
    "qas": [
      {
        "type": "isian",
        "question": "Pada tanggal berapa kemerdekaan indonesia diproklamasikan ___",
        "answer": "17 agustus 1945",
        "id": "qa_2_q1"
      },
      {
        "type": "opsi",
        "question": "Siapa yang memproklamasikan kemerdekaan indonesia ___",
        "options": [
          "soekarno",
          "mohammad hatta",
          "sudirman",
          "ahmad yani"
        ],
        "answer": "soekarno mohammad hatta",
        "id": "qa_2_q2"
      }
    ]
  }
]
|
@ -0,0 +1,490 @@
|||
import numpy as np
import pandas as pd
import json
import random
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
    Input,
    LSTM,
    Dense,
    Embedding,
    Bidirectional,
    Concatenate,
    Attention,
    Dropout,
)
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re


with open("data_converted.json", "r") as f:
    data = json.load(f)


# Preprocessing function
def preprocess_text(text):
    """Perform basic text preprocessing."""
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    return text


# Prepare data for the model
def prepare_data(data):
    """Flatten the dataset: one row per question-answer pair."""
    contexts = []
    tokens_list = []
    ner_list = []
    srl_list = []
    questions = []
    answers = []
    q_types = []

    for item in data:
        for qa in item["qas"]:
            contexts.append(preprocess_text(item["context"]))
            tokens_list.append(item["tokens"])
            ner_list.append(item["ner"])
            srl_list.append(item["srl"])
            questions.append(preprocess_text(qa["question"]))
            answers.append(qa["answer"])
            q_types.append(qa["type"])

    return contexts, tokens_list, ner_list, srl_list, questions, answers, q_types


contexts, tokens_list, ner_list, srl_list, questions, answers, q_types = prepare_data(
    data
)

max_vocab_size = 10000
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(contexts + questions + [" ".join(item) for item in tokens_list])
vocab_size = len(tokenizer.word_index) + 1

# Encoding for NER
ner_tokenizer = Tokenizer(oov_token="<OOV>")
ner_tokenizer.fit_on_texts([" ".join(ner) for ner in ner_list])
ner_vocab_size = len(ner_tokenizer.word_index) + 1

# Encoding for SRL
srl_tokenizer = Tokenizer(oov_token="<OOV>")
srl_tokenizer.fit_on_texts([" ".join(srl) for srl in srl_list])
srl_vocab_size = len(srl_tokenizer.word_index) + 1

# Encoding for question types
q_type_tokenizer = Tokenizer()
q_type_tokenizer.fit_on_texts(q_types)
q_type_vocab_size = len(q_type_tokenizer.word_index) + 1


# Convert tokens, NER, and SRL to sequences
def tokens_to_sequences(tokens, ner, srl):
    """Convert token, NER, and SRL tag lists to integer sequences."""
    token_seqs = [tokenizer.texts_to_sequences([" ".join(t)])[0] for t in tokens]
    ner_seqs = [ner_tokenizer.texts_to_sequences([" ".join(n)])[0] for n in ner]
    srl_seqs = [srl_tokenizer.texts_to_sequences([" ".join(s)])[0] for s in srl]
    return token_seqs, ner_seqs, srl_seqs


# Determine maximum lengths for padding
context_seqs = tokenizer.texts_to_sequences(contexts)
question_seqs = tokenizer.texts_to_sequences(questions)
token_seqs, ner_seqs, srl_seqs = tokens_to_sequences(tokens_list, ner_list, srl_list)

max_context_len = max([len(seq) for seq in context_seqs])
max_question_len = max([len(seq) for seq in question_seqs])
max_token_len = max([len(seq) for seq in token_seqs])


# Pad sequences so every input has the same length
def pad_all_sequences(context_seqs, question_seqs, token_seqs, ner_seqs, srl_seqs):
    """Pad all sequences."""
    context_padded = pad_sequences(context_seqs, maxlen=max_context_len, padding="post")
    question_padded = pad_sequences(
        question_seqs, maxlen=max_question_len, padding="post"
    )
    token_padded = pad_sequences(token_seqs, maxlen=max_token_len, padding="post")
    ner_padded = pad_sequences(ner_seqs, maxlen=max_token_len, padding="post")
    srl_padded = pad_sequences(srl_seqs, maxlen=max_token_len, padding="post")
    return context_padded, question_padded, token_padded, ner_padded, srl_padded


# Prepare the encoder for answers
answer_tokenizer = Tokenizer(oov_token="<OOV>")
answer_tokenizer.fit_on_texts(answers)
answer_vocab_size = len(answer_tokenizer.word_index) + 1

# Encode question types - FIX - use the index directly instead of a sequence
q_type_indices = []
for q_type in q_types:
    # Look up the question-type index (0 is reserved for unseen types)
    q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
    q_type_indices.append(q_type_idx)

# Convert to a numpy array
q_type_indices = np.array(q_type_indices)

# One-hot encode the question types
q_type_categorical = tf.keras.utils.to_categorical(
    q_type_indices, num_classes=q_type_vocab_size
)

# Pad sequences
context_padded, question_padded, token_padded, ner_padded, srl_padded = (
    pad_all_sequences(context_seqs, question_seqs, token_seqs, ner_seqs, srl_seqs)
)

# Encode answers
answer_seqs = answer_tokenizer.texts_to_sequences(answers)
max_answer_len = max([len(seq) for seq in answer_seqs])
answer_padded = pad_sequences(answer_seqs, maxlen=max_answer_len, padding="post")

# Split data into train and test sets
indices = list(range(len(context_padded)))
train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)


# Helper to take a subset of the data by indices
def get_subset(data, indices):
    return np.array([data[i] for i in indices])


# Train data
train_context = get_subset(context_padded, train_indices)
train_question = get_subset(question_padded, train_indices)
train_token = get_subset(token_padded, train_indices)
train_ner = get_subset(ner_padded, train_indices)
train_srl = get_subset(srl_padded, train_indices)
train_q_type = get_subset(q_type_categorical, train_indices)
train_answer = get_subset(answer_padded, train_indices)

# Test data
test_context = get_subset(context_padded, test_indices)
test_question = get_subset(question_padded, test_indices)
test_token = get_subset(token_padded, test_indices)
test_ner = get_subset(ner_padded, test_indices)
test_srl = get_subset(srl_padded, test_indices)
test_q_type = get_subset(q_type_categorical, test_indices)
test_answer = get_subset(answer_padded, test_indices)

# Hyperparameters
embedding_dim = 100
lstm_units = 128
ner_embedding_dim = 50
srl_embedding_dim = 50
dropout_rate = 0.3


# Build the model with two outputs: question and answer
def create_qa_generator_model():
    # Input layers
    context_input = Input(shape=(max_context_len,), name="context_input")
    token_input = Input(shape=(max_token_len,), name="token_input")
    ner_input = Input(shape=(max_token_len,), name="ner_input")
    srl_input = Input(shape=(max_token_len,), name="srl_input")

    # No question_input or q_type_input is needed for generation,
    # because those are what the model will produce

    # Shared embedding layer for text
    text_embedding = Embedding(vocab_size, embedding_dim, name="text_embedding")

    # Embeddings for NER and SRL
    ner_embedding = Embedding(ner_vocab_size, ner_embedding_dim, name="ner_embedding")(
        ner_input
    )
    srl_embedding = Embedding(srl_vocab_size, srl_embedding_dim, name="srl_embedding")(
        srl_input
    )

    # Apply embeddings
    context_embed = text_embedding(context_input)
    token_embed = text_embedding(token_input)

    # Bidirectional LSTMs for context and token-level features
    context_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, name="context_lstm")
    )(context_embed)

    # Concat token features (tokens, NER, SRL)
    token_features = Concatenate(name="token_features")(
        [token_embed, ner_embedding, srl_embedding]
    )
    token_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, name="token_lstm")
    )(token_features)

    # Pool outputs
    context_pool = tf.keras.layers.GlobalMaxPooling1D(name="context_pool")(context_lstm)
    token_pool = tf.keras.layers.GlobalMaxPooling1D(name="token_pool")(token_lstm)

    # Concat all features
    all_features = Concatenate(name="all_features")([context_pool, token_pool])

    # Shared layers
    shared = Dense(256, activation="relu", name="shared_dense_1")(all_features)
    shared = Dropout(dropout_rate)(shared)
    shared = Dense(128, activation="relu", name="shared_dense_2")(shared)
    shared = Dropout(dropout_rate)(shared)

    # Branch for the question
    question_branch = Dense(256, activation="relu", name="question_dense")(shared)
    question_branch = Dropout(dropout_rate)(question_branch)

    # Branch for the answer
    answer_branch = Dense(256, activation="relu", name="answer_dense")(shared)
    answer_branch = Dropout(dropout_rate)(answer_branch)

    # Output layers
    # For the question, an LSTM-based decoder generates the word sequence
    # that forms the question
    question_decoder = LSTM(lstm_units, return_sequences=True, name="question_decoder")(
        tf.keras.layers.RepeatVector(max_question_len)(question_branch)
    )
    question_output = Dense(vocab_size, activation="softmax", name="question_output")(
        question_decoder
    )

    # Output layer for the answer
    answer_output = Dense(
        answer_vocab_size, activation="softmax", name="answer_output"
    )(answer_branch)

    # Create model
    model = Model(
        inputs=[
            context_input,
            token_input,
            ner_input,
            srl_input,
        ],
        outputs=[question_output, answer_output],
    )

    # Compile the model with a loss function and metrics for both outputs
    model.compile(
        optimizer="adam",
        loss={
            "question_output": "categorical_crossentropy",
            "answer_output": "sparse_categorical_crossentropy",
        },
        metrics={"question_output": "accuracy", "answer_output": "accuracy"},
        loss_weights={"question_output": 1.0, "answer_output": 1.0},
    )

    return model
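# design note (reading of the architecture, not a stated requirement):
# RepeatVector feeds the same pooled vector to every decoder step, so this
# question decoder is not teacher-forced; an encoder-decoder that consumes the
# gold question as shifted decoder input (as in the seq2seq script elsewhere in
# this repo) usually trains more reliably for sequence generation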


# Prepare targets for the question (one-hot encoded)
# The question needs a categorical format because every word in the
# sequence is predicted at once
def prepare_question_target(question_padded):
    question_target = []
    for question in question_padded:
        # One-hot encode every token in the sequence
        sequence_target = []
        for token in question:
            # Build the one-hot vector for this token
            token_target = tf.keras.utils.to_categorical(token, num_classes=vocab_size)
            sequence_target.append(token_target)
        question_target.append(sequence_target)
    return np.array(question_target)
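# memory note (rough estimate): this materializes an array of shape
# (n_samples, max_question_len, vocab_size) of one-hot rows, which grows
# quickly with vocabulary size; integer targets plus
# sparse_categorical_crossentropy on question_output would avoid the blow-up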


# Prepare targets for the question output
train_question_target = prepare_question_target(train_question)
test_question_target = prepare_question_target(test_question)

# Reshape answers for sparse categorical crossentropy
train_answer_labels = train_answer[:, 0]  # take the first token index of each answer
test_answer_labels = test_answer[:, 0]

# Create the model
model = create_qa_generator_model()
model.summary()

# Callback to save the best model
checkpoint = ModelCheckpoint(
    "qa_generator_model.h5",
    monitor="val_question_output_accuracy",
    save_best_only=True,
    verbose=1,
    mode="max",
)

early_stop = EarlyStopping(
    monitor="val_question_output_accuracy", patience=5, verbose=1, mode="max"
)

# Training
batch_size = 8
epochs = 50

# Train model
history = model.fit(
    [train_context, train_token, train_ner, train_srl],
    {"question_output": train_question_target, "answer_output": train_answer_labels},
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(
        [test_context, test_token, test_ner, test_srl],
        {"question_output": test_question_target, "answer_output": test_answer_labels},
    ),
    callbacks=[checkpoint, early_stop],
)

model.save("qa_generator_model_final.keras")

# Save the tokenizers
tokenizer_data = {
    "word_tokenizer": tokenizer.to_json(),
    "ner_tokenizer": ner_tokenizer.to_json(),
    "srl_tokenizer": srl_tokenizer.to_json(),
    "answer_tokenizer": answer_tokenizer.to_json(),
    "q_type_tokenizer": q_type_tokenizer.to_json(),
    "max_context_len": max_context_len,
    "max_question_len": max_question_len,
    "max_token_len": max_token_len,
}

with open("qa_generator_tokenizers.json", "w") as f:
    json.dump(tokenizer_data, f)


# Prediction helper
def predict_question_and_answer(model, context, tokens, ner, srl):
    """
    Predict a question and an answer from the context, tokens, NER, and SRL.
    """
    # Preprocess input
    context_seq = tokenizer.texts_to_sequences([preprocess_text(context)])
    context_padded = pad_sequences(context_seq, maxlen=max_context_len, padding="post")

    token_seq = tokenizer.texts_to_sequences([" ".join(tokens)])
    token_padded = pad_sequences(token_seq, maxlen=max_token_len, padding="post")

    ner_seq = ner_tokenizer.texts_to_sequences([" ".join(ner)])
    ner_padded = pad_sequences(ner_seq, maxlen=max_token_len, padding="post")

    srl_seq = srl_tokenizer.texts_to_sequences([" ".join(srl)])
    srl_padded = pad_sequences(srl_seq, maxlen=max_token_len, padding="post")

    # Predict
    question_pred, answer_pred = model.predict(
        [context_padded, token_padded, ner_padded, srl_padded]
    )

    # Decode the question (take the highest-probability index at each position)
    question_indices = np.argmax(question_pred[0], axis=1)
    question_words = []

    # Reverse the word index to map indices back to words
    word_index = tokenizer.word_index
    index_word = {v: k for k, v in word_index.items()}

    # Decode the question
    for idx in question_indices:
        if idx != 0:  # skip padding (index 0)
            word = index_word.get(idx, "<UNK>")
            question_words.append(word)
        else:
            break  # stop at padding

    # Decode the answer
    answer_idx = np.argmax(answer_pred[0])

    # Reverse word index for answers
    answer_word_index = answer_tokenizer.word_index
    answer_index_word = {v: k for k, v in answer_word_index.items()}

    answer = answer_index_word.get(answer_idx, "<UNK>")

    # Assemble the question
    question = " ".join(question_words)

    return question, answer


# Example usage
# Note: this is only an illustration; real data is needed in practice
"""
sample_context = "Selamat pagi, sekarang adalah hari Senin."
sample_tokens = ["selamat", "pagi", "sekarang", "adalah", "hari", "senin"]
sample_ner = ["O", "O", "O", "O", "O", "B-TIME"]
sample_srl = ["B-V", "B-ARG1", "B-ARGM-TMP", "B-ARGM-PRD", "I-ARGM-PRD", "I-ARGM-PRD"]

# Load the trained model
loaded_model = load_model("qa_generator_model_final.keras")

# Predict
question, answer = predict_question_and_answer(
    loaded_model, sample_context, sample_tokens, sample_ner, sample_srl
)

print("Context:", sample_context)
print("Generated question:", question)
print("Generated answer:", answer)
"""

sample = {
    "context": "kerajaan majapahit berdiri pada tahun 1293 di trowulan",
    "tokens": [
        "kerajaan",
        "majapahit",
        "berdiri",
        "pada",
        "tahun",
        "1293",
        "di",
        "trowulan",
    ],
    "ner": ["O", "ORG", "O", "O", "O", "DATE", "O", "LOC"],
    "srl": ["ARG1", "ARG1", "V", "O", "O", "ARGM-TMP", "O", "ARGM-LOC"],
}
question, answer = predict_question_and_answer(
    model, sample["context"], sample["tokens"], sample["ner"], sample["srl"]
)

print("Context:", sample["context"])
print("Generated question:", question)
print("Generated answer:", answer)

# Plot training history
# plt.figure(figsize=(12, 8))

# # Plot loss
# plt.subplot(2, 2, 1)
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('Model Loss')
# plt.ylabel('Loss')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='upper right')

# # Plot question output accuracy
# plt.subplot(2, 2, 2)
# plt.plot(history.history['question_output_accuracy'])
# plt.plot(history.history['val_question_output_accuracy'])
# plt.title('Question Output Accuracy')
# plt.ylabel('Accuracy')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='lower right')

# # Plot answer output accuracy
# plt.subplot(2, 2, 3)
# plt.plot(history.history['answer_output_accuracy'])
# plt.plot(history.history['val_answer_output_accuracy'])
# plt.title('Answer Output Accuracy')
# plt.ylabel('Accuracy')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='lower right')

# plt.tight_layout()
# plt.savefig("training_history.png")
# plt.show()
|
@ -0,0 +1,308 @@
|||
#!/usr/bin/env python3
# ===============================================================
# Question-Generation seq-to-seq (tokens + NER + SRL → Q/A/type)
# - revised version 2025-05-11
# ===============================================================

import json, pickle, random
from pathlib import Path
from itertools import chain

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Embedding, LSTM, Concatenate,
    Dense, TimeDistributed
)
from tensorflow.keras.models import Model
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from rouge_score import rouge_scorer, scoring


# -----------------------------------------------------------------
# 0. LOAD & FLATTEN DATA
# -----------------------------------------------------------------
RAW = json.loads(Path("../dataset/dev_dataset_qg.json").read_text())

samples = []
for item in RAW:
    for qp in item["quiz_posibility"]:
        samples.append({
            "tokens": item["tokens"],
            "ner": item["ner"],
            "srl": item["srl"],
            "q_type": qp["type"],  # isian / opsi / benar_salah
            "q_toks": qp["question"] + ["<eos>"],
            "a_toks": (qp["answer"] if isinstance(qp["answer"], list)
                       else [qp["answer"]]) + ["<eos>"]
        })

print("flattened samples :", len(samples))


# -----------------------------------------------------------------
# 1. VOCABULARIES
# -----------------------------------------------------------------
def build_vocab(seq_iter, reserved=("<pad>", "<unk>", "<sos>", "<eos>")):
    vocab = {tok: idx for idx, tok in enumerate(reserved)}
    for tok in chain.from_iterable(seq_iter):
        vocab.setdefault(tok, len(vocab))
    return vocab
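# illustrative example: build_vocab([["a", "b"], ["b", "c"]]) ->
# {"<pad>": 0, "<unk>": 1, "<sos>": 2, "<eos>": 3, "a": 4, "b": 5, "c": 6}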

vocab_tok = build_vocab((s["tokens"] for s in samples))
vocab_ner = build_vocab((s["ner"] for s in samples), reserved=("<pad>", "<unk>"))
vocab_srl = build_vocab((s["srl"] for s in samples), reserved=("<pad>", "<unk>"))
vocab_q = build_vocab((s["q_toks"] for s in samples))
vocab_a = build_vocab((s["a_toks"] for s in samples))
vocab_typ = {"isian": 0, "opsi": 1, "benar_salah": 2}


# -----------------------------------------------------------------
# 2. ENCODING & PADDING
# -----------------------------------------------------------------
def enc(seq, v): return [v.get(t, v["<unk>"]) for t in seq]

MAX_SENT = max(len(s["tokens"]) for s in samples)
MAX_Q = max(len(s["q_toks"]) for s in samples)
MAX_A = max(len(s["a_toks"]) for s in samples)

def pad_batch(seqs, vmap, maxlen):
    return tf.keras.preprocessing.sequence.pad_sequences(
        [enc(s, vmap) for s in seqs], maxlen=maxlen, padding="post"
    )

X_tok = pad_batch((s["tokens"] for s in samples), vocab_tok, MAX_SENT)
X_ner = pad_batch((s["ner"] for s in samples), vocab_ner, MAX_SENT)
X_srl = pad_batch((s["srl"] for s in samples), vocab_srl, MAX_SENT)

dec_q_in = pad_batch(
    ([["<sos>"] + s["q_toks"][:-1] for s in samples]), vocab_q, MAX_Q)
dec_q_out = pad_batch((s["q_toks"] for s in samples), vocab_q, MAX_Q)

dec_a_in = pad_batch(
    ([["<sos>"] + s["a_toks"][:-1] for s in samples]), vocab_a, MAX_A)
dec_a_out = pad_batch((s["a_toks"] for s in samples), vocab_a, MAX_A)

y_type = np.array([vocab_typ[s["q_type"]] for s in samples])


# -----------------------------------------------------------------
# 3. MODEL
# -----------------------------------------------------------------
d_tok, d_tag, units = 128, 32, 256
pad_tok, pad_q, pad_a = vocab_tok["<pad>"], vocab_q["<pad>"], vocab_a["<pad>"]

# ---- Encoder ----------------------------------------------------
inp_tok = Input((MAX_SENT,), name="tok_in")
inp_ner = Input((MAX_SENT,), name="ner_in")
inp_srl = Input((MAX_SENT,), name="srl_in")

emb_tok = Embedding(len(vocab_tok), d_tok, mask_zero=True, name="emb_tok")(inp_tok)
emb_ner = Embedding(len(vocab_ner), d_tag, mask_zero=True, name="emb_ner")(inp_ner)
emb_srl = Embedding(len(vocab_srl), d_tag, mask_zero=True, name="emb_srl")(inp_srl)

enc_concat = Concatenate()([emb_tok, emb_ner, emb_srl])
enc_out, state_h, state_c = LSTM(units, return_state=True, name="enc_lstm")(enc_concat)

# ---- Decoder : Question ----------------------------------------
dec_q_inp = Input((MAX_Q,), name="dec_q_in")
dec_emb_q = Embedding(len(vocab_q), d_tok, mask_zero=True, name="emb_q")(dec_q_inp)
dec_q_seq, _, _ = LSTM(units, return_sequences=True, return_state=True,
                       name="lstm_q")(dec_emb_q, initial_state=[state_h, state_c])
q_out = TimeDistributed(Dense(len(vocab_q), activation="softmax"), name="q_out")(dec_q_seq)

# ---- Decoder : Answer ------------------------------------------
dec_a_inp = Input((MAX_A,), name="dec_a_in")
dec_emb_a = Embedding(len(vocab_a), d_tok, mask_zero=True, name="emb_a")(dec_a_inp)
dec_a_seq, _, _ = LSTM(units, return_sequences=True, return_state=True,
                       name="lstm_a")(dec_emb_a, initial_state=[state_h, state_c])
a_out = TimeDistributed(Dense(len(vocab_a), activation="softmax"), name="a_out")(dec_a_seq)

# ---- Classifier -------------------------------------------------
type_out = Dense(len(vocab_typ), activation="softmax", name="type_out")(enc_out)

model = Model(
    [inp_tok, inp_ner, inp_srl, dec_q_inp, dec_a_inp],
    [q_out, a_out, type_out]
)

# ---- Masked loss helpers ---------------------------------------
scce = tf.keras.losses.SparseCategoricalCrossentropy(reduction="none")
def masked_loss_factory(pad_id):
    def loss(y_true, y_pred):
        l = scce(y_true, y_pred)
        mask = tf.cast(tf.not_equal(y_true, pad_id), tf.float32)
        return tf.reduce_sum(l * mask) / tf.reduce_sum(mask)
    return loss
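# worked example: for a row y_true = [5, 7, <pad>] with per-token losses
# [1.2, 0.8, 3.0], the mask is [1, 1, 0], so the loss is (1.2 + 0.8) / 2 = 1.0;
# padding positions contribute nothing to the gradient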

model.compile(
    optimizer="adam",
    loss={"q_out": masked_loss_factory(pad_q),
          "a_out": masked_loss_factory(pad_a),
          "type_out": "sparse_categorical_crossentropy"},
    loss_weights={"q_out": 1.0, "a_out": 1.0, "type_out": 0.3},
    metrics={"q_out": "sparse_categorical_accuracy",
             "a_out": "sparse_categorical_accuracy",
             "type_out": tf.keras.metrics.SparseCategoricalAccuracy(name="type_acc")}
)
model.summary()

# -----------------------------------------------------------------
# 4. TRAIN
# -----------------------------------------------------------------
history = model.fit(
    [X_tok, X_ner, X_srl, dec_q_in, dec_a_in],
    [dec_q_out, dec_a_out, y_type],
    validation_split=0.1,
    epochs=30,
    batch_size=64,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=4, restore_best_weights=True)],
    verbose=2
)
model.save("full_seq2seq.keras")


# -----------------------------------------------------------------
# 5. SAVE VOCABS (.pkl keeps python dict intact)
# -----------------------------------------------------------------
def save_vocab(v, name):
    with open(name, "wb") as f:  # close the file handle explicitly
        pickle.dump(v, f)

save_vocab(vocab_tok, "vocab_tok.pkl"); save_vocab(vocab_ner, "vocab_ner.pkl")
save_vocab(vocab_srl, "vocab_srl.pkl"); save_vocab(vocab_q, "vocab_q.pkl")
save_vocab(vocab_a, "vocab_a.pkl"); save_vocab(vocab_typ, "vocab_typ.pkl")


# -----------------------------------------------------------------
# 6. INFERENCE MODELS (encoder & decoders)
# -----------------------------------------------------------------
def build_inference_models(trained):
    # encoder
    t_in = Input((MAX_SENT,), name="t_in")
    n_in = Input((MAX_SENT,), name="n_in")
    s_in = Input((MAX_SENT,), name="s_in")
    e_t = trained.get_layer("emb_tok")(t_in)
    e_n = trained.get_layer("emb_ner")(n_in)
    e_s = trained.get_layer("emb_srl")(s_in)
    concat = Concatenate()([e_t, e_n, e_s])
    enc_seq_out, h, c = trained.get_layer("enc_lstm")(concat)
    enc_model = Model([t_in, n_in, s_in], [h, c])

    # question-decoder
    dq_in = Input((1,), name="dq_tok")
    dh = Input((units,), name="dh"); dc = Input((units,), name="dc")
    dq_emb = trained.get_layer("emb_q")(dq_in)
    dq_lstm, nh, nc = trained.get_layer("lstm_q")(dq_emb, initial_state=[dh, dc])
    dq_out = trained.get_layer("q_out").layer(dq_lstm)
    dec_q_model = Model([dq_in, dh, dc], [dq_out, nh, nc])

    # answer-decoder
    da_in = Input((1,), name="da_tok")
    ah = Input((units,), name="ah"); ac = Input((units,), name="ac")
    da_emb = trained.get_layer("emb_a")(da_in)
    da_lstm, nh2, nc2 = trained.get_layer("lstm_a")(da_emb, initial_state=[ah, ac])
    da_out = trained.get_layer("a_out").layer(da_lstm)
    dec_a_model = Model([da_in, ah, ac], [da_out, nh2, nc2])

    # type classifier (reuses the encoder LSTM output)
    type_dense = trained.get_layer("type_out")
    type_model = Model([t_in, n_in, s_in], type_dense(enc_seq_out))

    return enc_model, dec_q_model, dec_a_model, type_model

encoder_model, decoder_q, decoder_a, classifier_model = build_inference_models(model)

inv_q = {v: k for k, v in vocab_q.items()}
inv_a = {v: k for k, v in vocab_a.items()}

def enc_pad(seq, vmap, maxlen):
    x = [vmap.get(t, vmap["<unk>"]) for t in seq]
    return x + [vmap["<pad>"]] * (maxlen - len(x))

def greedy_decode(tokens, ner, srl, max_q=20, max_a=10):
    et = np.array([enc_pad(tokens, vocab_tok, MAX_SENT)])
    en = np.array([enc_pad(ner, vocab_ner, MAX_SENT)])
    es = np.array([enc_pad(srl, vocab_srl, MAX_SENT)])

    h, c = encoder_model.predict([et, en, es], verbose=0)

    # --- question
    q_ids = []
    tgt = np.array([[vocab_q["<sos>"]]])
    for _ in range(max_q):
        logits, h, c = decoder_q.predict([tgt, h, c], verbose=0)
        nxt = int(logits[0, -1].argmax())
        if nxt == vocab_q["<eos>"]: break
        q_ids.append(nxt)
        tgt = np.array([[nxt]])

    # --- answer (re-use fresh h, c)
    h, c = encoder_model.predict([et, en, es], verbose=0)
    a_ids = []
    tgt = np.array([[vocab_a["<sos>"]]])
    for _ in range(max_a):
        logits, h, c = decoder_a.predict([tgt, h, c], verbose=0)
        nxt = int(logits[0, -1].argmax())
        if nxt == vocab_a["<eos>"]: break
        a_ids.append(nxt)
        tgt = np.array([[nxt]])

    # --- type
    t_id = int(classifier_model.predict([et, en, es], verbose=0).argmax())

    return [inv_q[i] for i in q_ids], [inv_a[i] for i in a_ids], \
           [k for k, v in vocab_typ.items() if v == t_id][0]
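# note: this is plain greedy (argmax) decoding; sampling or beam search over the
# same step models would trade speed for potentially better questions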


# -----------------------------------------------------------------
# 7. QUICK DEMO
# -----------------------------------------------------------------
test_tokens = ["soekarno", "membacakan", "teks", "proklamasi", "pada",
               "17", "agustus", "1945"]
test_ner = ["B-PER", "O", "O", "O", "O", "B-DATE", "I-DATE", "I-DATE"]
test_srl = ["ARG0", "V", "ARG1", "ARG1", "O", "ARGM-TMP", "ARGM-TMP", "ARGM-TMP"]

q, a, t = greedy_decode(test_tokens, test_ner, test_srl, max_q=MAX_Q, max_a=MAX_A)
print("\nDEMO\n----")
print("Q :", " ".join(q))
print("A :", " ".join(a))
print("T :", t)


# -----------------------------------------------------------------
# 8. EVALUATION (corpus-level BLEU + ROUGE-1/-L)
# -----------------------------------------------------------------
smooth = SmoothingFunction().method4
r_scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)

def strip_special(seq, pad_id, eos_id):
    return [x for x in seq if x not in (pad_id, eos_id)]

def ids_to_text(ids, inv):
    return " ".join(inv[i] for i in ids)

def evaluate(n=200):
    idxs = random.sample(range(len(samples)), n)
    refs, hyps = [], []
    agg = scoring.BootstrapAggregator()

    for i in idxs:
        gt_ids = strip_special(dec_q_out[i], pad_q, vocab_q["<eos>"])
        ref = ids_to_text(gt_ids, inv_q)
        pred = " ".join(greedy_decode(
            samples[i]["tokens"],
            samples[i]["ner"],
            samples[i]["srl"]
        )[0])
        refs.append([ref.split()])
        hyps.append(pred.split())
        agg.add_scores(r_scorer.score(ref, pred))

    bleu = corpus_bleu(refs, hyps, smoothing_function=smooth)
    r1 = agg.aggregate()["rouge1"].mid
    rL = agg.aggregate()["rougeL"].mid

    print(f"\nEVAL (n={n})")
    print(f"BLEU-4 : {bleu:.4f}")
    print(f"ROUGE-1 : P={r1.precision:.3f} R={r1.recall:.3f} F1={r1.fmeasure:.3f}")
    print(f"ROUGE-L : P={rL.precision:.3f} R={rL.recall:.3f} F1={rL.fmeasure:.3f}")

evaluate(2)  # run on 2 random samples; increase n for a more stable estimate
|
@ -0,0 +1,357 @@
|
|||
import numpy as np
import pandas as pd
import json
import random
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
    Input,
    LSTM,
    Dense,
    Embedding,
    Bidirectional,
    Concatenate,
    Attention,
    Dropout,
)
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re
import string
from collections import Counter

# Load data
with open("data_converted.json", "r") as f:
    data = json.load(f)


# Preprocessing function
def preprocess_text(text):
    """Basic text preprocessing"""
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    return text


# Prepare data for the question prediction model
def prepare_question_prediction_data(data):
    """Collect contexts, token-level features, questions, and question types"""
    contexts = []
    tokens_list = []
    ner_list = []
    srl_list = []
    questions = []
    q_types = []

    for item in data:
        for qa in item["qas"]:
            contexts.append(preprocess_text(item["context"]))
            tokens_list.append(item["tokens"])
            ner_list.append(item["ner"])
            srl_list.append(item["srl"])
            questions.append(preprocess_text(qa["question"]))
            q_types.append(qa["type"])

    return contexts, tokens_list, ner_list, srl_list, questions, q_types


# Prepare the data
contexts, tokens_list, ner_list, srl_list, questions, q_types = (
    prepare_question_prediction_data(data)
)

# Tokenizer for text (context, question, answer)
max_vocab_size = 10000
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
all_texts = contexts + questions + [" ".join(item) for item in tokens_list]
tokenizer.fit_on_texts(all_texts)
vocab_size = len(tokenizer.word_index) + 1

# Encoding for NER tags
ner_tokenizer = Tokenizer(oov_token="<OOV>")
ner_tokenizer.fit_on_texts([" ".join(ner) for ner in ner_list])
ner_vocab_size = len(ner_tokenizer.word_index) + 1

# Encoding for SRL tags
srl_tokenizer = Tokenizer(oov_token="<OOV>")
srl_tokenizer.fit_on_texts([" ".join(srl) for srl in srl_list])
srl_vocab_size = len(srl_tokenizer.word_index) + 1

# Encoding for question types
q_type_tokenizer = Tokenizer()
q_type_tokenizer.fit_on_texts(q_types)
q_type_vocab_size = len(q_type_tokenizer.word_index) + 1


# Convert tokens, NER, and SRL to sequences
def tokens_to_sequences(tokens, ner, srl):
    """Convert token, NER, and SRL tag lists to integer sequences"""
    token_seqs = [tokenizer.texts_to_sequences([" ".join(t)])[0] for t in tokens]
    ner_seqs = [ner_tokenizer.texts_to_sequences([" ".join(n)])[0] for n in ner]
    srl_seqs = [srl_tokenizer.texts_to_sequences([" ".join(s)])[0] for s in srl]
    return token_seqs, ner_seqs, srl_seqs


# Sequences
context_seqs = tokenizer.texts_to_sequences(contexts)
question_seqs = tokenizer.texts_to_sequences(questions)
token_seqs, ner_seqs, srl_seqs = tokens_to_sequences(tokens_list, ner_list, srl_list)

# Determine maximum lengths for padding
max_context_len = max(len(seq) for seq in context_seqs)
max_question_len = max(len(seq) for seq in question_seqs)
max_token_len = max(len(seq) for seq in token_seqs)


# Pad sequences so every input has the same length
def pad_all_sequences(context_seqs, token_seqs, ner_seqs, srl_seqs, question_seqs):
    """Pad all sequences"""
    context_padded = pad_sequences(context_seqs, maxlen=max_context_len, padding="post")
    token_padded = pad_sequences(token_seqs, maxlen=max_token_len, padding="post")
    ner_padded = pad_sequences(ner_seqs, maxlen=max_token_len, padding="post")
    srl_padded = pad_sequences(srl_seqs, maxlen=max_token_len, padding="post")
    question_padded = pad_sequences(
        question_seqs, maxlen=max_question_len, padding="post"
    )
    return (
        context_padded,
        token_padded,
        ner_padded,
        srl_padded,
        question_padded,
    )


# Encode question types
q_type_indices = []
for q_type in q_types:
    q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
    q_type_indices.append(q_type_idx)

# Convert to a NumPy array
q_type_indices = np.array(q_type_indices)

# One-hot encode question types
q_type_categorical = tf.keras.utils.to_categorical(
    q_type_indices, num_classes=q_type_vocab_size
)

# Pad sequences
context_padded, token_padded, ner_padded, srl_padded, question_padded = (
    pad_all_sequences(context_seqs, token_seqs, ner_seqs, srl_seqs, question_seqs)
)

# Split data into train and test sets
indices = list(range(len(context_padded)))
train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)


# Helper to take a subset of the data by indices
def get_subset(data, indices):
    return np.array([data[i] for i in indices])
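

# Since the padded arrays are already NumPy arrays, plain fancy indexing
# gives the same result, e.g.:
#   train_context = context_padded[train_indices]
# The helper above is kept for readability.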


# Train data
train_context = get_subset(context_padded, train_indices)
train_token = get_subset(token_padded, train_indices)
train_ner = get_subset(ner_padded, train_indices)
train_srl = get_subset(srl_padded, train_indices)
train_q_type = get_subset(q_type_categorical, train_indices)
train_question = get_subset(question_padded, train_indices)

# Test data
test_context = get_subset(context_padded, test_indices)
test_token = get_subset(token_padded, test_indices)
test_ner = get_subset(ner_padded, test_indices)
test_srl = get_subset(srl_padded, test_indices)
test_q_type = get_subset(q_type_categorical, test_indices)
test_question = get_subset(question_padded, test_indices)

# Hyperparameters
embedding_dim = 100
lstm_units = 128
ner_embedding_dim = 50
srl_embedding_dim = 50
dropout_rate = 0.3


# Build the question prediction model
def create_question_prediction_model():
    # Input layers
    context_input = Input(shape=(max_context_len,), name="context_input")
    token_input = Input(shape=(max_token_len,), name="token_input")
    ner_input = Input(shape=(max_token_len,), name="ner_input")
    srl_input = Input(shape=(max_token_len,), name="srl_input")
    q_type_input = Input(shape=(q_type_vocab_size,), name="q_type_input")

    # Shared embedding layer for text
    text_embedding = Embedding(vocab_size, embedding_dim, name="text_embedding")

    # Embeddings for NER and SRL
    ner_embedding = Embedding(ner_vocab_size, ner_embedding_dim, name="ner_embedding")(
        ner_input
    )
    srl_embedding = Embedding(srl_vocab_size, srl_embedding_dim, name="srl_embedding")(
        srl_input
    )

    # Apply embeddings
    context_embed = text_embedding(context_input)
    token_embed = text_embedding(token_input)

    # Bidirectional LSTMs for context and token-level features
    context_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, name="context_lstm")
    )(context_embed)

    # Concat token features (tokens, NER, SRL)
    token_features = Concatenate(name="token_features")(
        [token_embed, ner_embedding, srl_embedding]
    )
    token_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, name="token_lstm")
    )(token_features)

    # Self-attention over the context LSTM output. Attention expects a
    # [query, value] list, so the sequence is passed twice (this also fixes
    # the single-tensor call that the sibling script had already corrected).
    context_attention = tf.keras.layers.Attention(name="context_attention")(
        [context_lstm, context_lstm]
    )

    # Pool attention outputs
    context_att_pool = tf.keras.layers.GlobalMaxPooling1D(name="context_att_pool")(
        context_attention
    )
    token_pool = tf.keras.layers.GlobalMaxPooling1D(name="token_pool")(token_lstm)

    # Concat all features
    all_features = Concatenate(name="all_features")(
        [context_att_pool, token_pool, q_type_input]
    )

    # Dense layers with expanded capacity for sequence generation
    x = Dense(512, activation="relu", name="dense_1")(all_features)
    x = Dropout(dropout_rate)(x)
    x = Dense(256, activation="relu", name="dense_2")(x)
    x = Dropout(dropout_rate)(x)

    # Per-timestep output projection over the vocabulary
    decoder_dense = Dense(vocab_size, activation="softmax", name="decoder_dense")

    # Many-to-many architecture for sequence generation
    # Decoder LSTM
    decoder_lstm = LSTM(lstm_units * 2, return_sequences=True, name="decoder_lstm")

    # Project features into the decoder input space
    decoder_input = Dense(lstm_units * 2, activation="relu", name="decoder_input")(x)

    # Expand to the expected sequence length (a single feature vector is
    # repeated for every timestep; no teacher forcing is involved)
    repeated_vector = tf.keras.layers.RepeatVector(max_question_len)(decoder_input)

    # Process through decoder LSTM
    decoder_outputs = decoder_lstm(repeated_vector)

    # Apply the dense layer to each timestep
    question_output_seq = tf.keras.layers.TimeDistributed(decoder_dense)(
        decoder_outputs
    )

    # Create model
    model = Model(
        inputs=[
            context_input,
            token_input,
            ner_input,
            srl_input,
            q_type_input,
        ],
        outputs=question_output_seq,
    )

    # Compile with sparse categorical crossentropy for sequence prediction
    model.compile(
        optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
    )

    return model
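

# Shape walk-through of the decoder path (with the hyperparameters above):
#   all_features        -> (batch, 512 + q_type_vocab_size)
#   decoder_input       -> (batch, 256)
#   repeated_vector     -> (batch, max_question_len, 256)  via RepeatVector
#   decoder_outputs     -> (batch, max_question_len, 256)  LSTM, return_sequences
#   question_output_seq -> (batch, max_question_len, vocab_size)  softmax per step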


# Build the model
model = create_question_prediction_model()
model.summary()

# Callback to save the best model
checkpoint = ModelCheckpoint(
    "question_prediction_model.h5",
    monitor="val_accuracy",
    save_best_only=True,
    verbose=1,
)

early_stop = EarlyStopping(monitor="val_accuracy", patience=10, verbose=1)

# Reshape question data for sequence-to-sequence training:
# targets become (samples, max_question_len, 1) for sparse categorical crossentropy
train_question_target = np.expand_dims(train_question, -1)
test_question_target = np.expand_dims(test_question, -1)

# Training parameters
batch_size = 8
epochs = 50

# Train model
history = model.fit(
    [train_context, train_token, train_ner, train_srl, train_q_type],
    train_question_target,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(
        [test_context, test_token, test_ner, test_srl, test_q_type],
        test_question_target,
    ),
    callbacks=[checkpoint, early_stop],
)

# # Plot training history
# plt.figure(figsize=(12, 4))
# plt.subplot(1, 2, 1)
# plt.plot(history.history['accuracy'])
# plt.plot(history.history['val_accuracy'])
# plt.title('Model Accuracy')
# plt.ylabel('Accuracy')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='upper left')

# plt.subplot(1, 2, 2)
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('Model Loss')
# plt.ylabel('Loss')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='upper left')
# plt.tight_layout()
# plt.savefig('question_prediction_training_history.png')
# plt.show()

# Save the model and tokenizers
model.save("question_prediction_model_final.h5")

# Save the tokenizers
tokenizer_data = {
    "word_tokenizer": tokenizer.to_json(),
    "ner_tokenizer": ner_tokenizer.to_json(),
    "srl_tokenizer": srl_tokenizer.to_json(),
    "q_type_tokenizer": q_type_tokenizer.to_json(),
    "max_context_len": max_context_len,
    "max_question_len": max_question_len,
    "max_token_len": max_token_len,
}

with open("question_prediction_tokenizers.json", "w") as f:
    json.dump(tokenizer_data, f)
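
# The saved JSON can be reloaded with the Keras counterpart of to_json()
# (a sketch; the inference scripts later in this PR do exactly this):
#   from tensorflow.keras.preprocessing.text import tokenizer_from_json
#   with open("question_prediction_tokenizers.json") as f:
#       td = json.load(f)
#   word_tokenizer = tokenizer_from_json(td["word_tokenizer"])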

print("Model and tokenizers for question prediction saved successfully!")

@ -0,0 +1,473 @@
import numpy as np
import json
import random
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
    Input,
    LSTM,
    Dense,
    Embedding,
    Bidirectional,
    Concatenate,
    Dropout,
)
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu

# Load data
with open("data_converted.json", "r") as f:
    data = json.load(f)


# Preprocessing function
def preprocess_text(text):
    """Basic text preprocessing"""
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    return text


# Prepare data for the question prediction model
def prepare_question_prediction_data(data):
    """Collect contexts, token-level features, questions, and question types"""
    contexts = []
    tokens_list = []
    ner_list = []
    srl_list = []
    questions = []
    q_types = []

    for item in data:
        for qa in item["qas"]:
            contexts.append(preprocess_text(item["context"]))
            tokens_list.append(item["tokens"])
            ner_list.append(item["ner"])
            srl_list.append(item["srl"])
            questions.append(preprocess_text(qa["question"]))
            q_types.append(qa["type"])
            # The answer is deliberately not taken as an input

    return contexts, tokens_list, ner_list, srl_list, questions, q_types


# Prepare the data
contexts, tokens_list, ner_list, srl_list, questions, q_types = (
    prepare_question_prediction_data(data)
)

# Tokenizer for text (context, question)
max_vocab_size = 10000
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
all_texts = contexts + questions + [" ".join(item) for item in tokens_list]
tokenizer.fit_on_texts(all_texts)
vocab_size = len(tokenizer.word_index) + 1

# Encoding for NER tags
ner_tokenizer = Tokenizer(oov_token="<OOV>")
ner_tokenizer.fit_on_texts([" ".join(ner) for ner in ner_list])
ner_vocab_size = len(ner_tokenizer.word_index) + 1

# Encoding for SRL tags
srl_tokenizer = Tokenizer(oov_token="<OOV>")
srl_tokenizer.fit_on_texts([" ".join(srl) for srl in srl_list])
srl_vocab_size = len(srl_tokenizer.word_index) + 1

# Encoding for question types
q_type_tokenizer = Tokenizer()
q_type_tokenizer.fit_on_texts(q_types)
q_type_vocab_size = len(q_type_tokenizer.word_index) + 1


# Convert tokens, NER, and SRL to sequences
def tokens_to_sequences(tokens, ner, srl):
    """Convert token, NER, and SRL tag lists to integer sequences"""
    token_seqs = [tokenizer.texts_to_sequences([" ".join(t)])[0] for t in tokens]
    ner_seqs = [ner_tokenizer.texts_to_sequences([" ".join(n)])[0] for n in ner]
    srl_seqs = [srl_tokenizer.texts_to_sequences([" ".join(s)])[0] for s in srl]
    return token_seqs, ner_seqs, srl_seqs


# Sequences
context_seqs = tokenizer.texts_to_sequences(contexts)
question_seqs = tokenizer.texts_to_sequences(questions)
token_seqs, ner_seqs, srl_seqs = tokens_to_sequences(tokens_list, ner_list, srl_list)

# Determine maximum lengths for padding
max_context_len = max(len(seq) for seq in context_seqs)
max_question_len = max(len(seq) for seq in question_seqs)
max_token_len = max(len(seq) for seq in token_seqs)


# Pad sequences so every input has the same length
def pad_all_sequences(context_seqs, token_seqs, ner_seqs, srl_seqs, question_seqs):
    """Pad all sequences"""
    context_padded = pad_sequences(context_seqs, maxlen=max_context_len, padding="post")
    token_padded = pad_sequences(token_seqs, maxlen=max_token_len, padding="post")
    ner_padded = pad_sequences(ner_seqs, maxlen=max_token_len, padding="post")
    srl_padded = pad_sequences(srl_seqs, maxlen=max_token_len, padding="post")
    question_padded = pad_sequences(
        question_seqs, maxlen=max_question_len, padding="post"
    )
    return (
        context_padded,
        token_padded,
        ner_padded,
        srl_padded,
        question_padded,
    )


# Encode question types
q_type_indices = []
for q_type in q_types:
    q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
    q_type_indices.append(q_type_idx)

# Convert to a NumPy array
q_type_indices = np.array(q_type_indices)

# One-hot encode question types
q_type_categorical = tf.keras.utils.to_categorical(
    q_type_indices, num_classes=q_type_vocab_size
)

# Pad sequences
context_padded, token_padded, ner_padded, srl_padded, question_padded = (
    pad_all_sequences(context_seqs, token_seqs, ner_seqs, srl_seqs, question_seqs)
)

# Split data into train and test sets
indices = list(range(len(context_padded)))
train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)


# Helper to take a subset of the data by indices
def get_subset(data, indices):
    return np.array([data[i] for i in indices])


# Train data
train_context = get_subset(context_padded, train_indices)
train_token = get_subset(token_padded, train_indices)
train_ner = get_subset(ner_padded, train_indices)
train_srl = get_subset(srl_padded, train_indices)
train_q_type = get_subset(q_type_categorical, train_indices)
train_question = get_subset(question_padded, train_indices)

# Test data
test_context = get_subset(context_padded, test_indices)
test_token = get_subset(token_padded, test_indices)
test_ner = get_subset(ner_padded, test_indices)
test_srl = get_subset(srl_padded, test_indices)
test_q_type = get_subset(q_type_categorical, test_indices)
test_question = get_subset(question_padded, test_indices)

# Hyperparameters
embedding_dim = 100
lstm_units = 128
ner_embedding_dim = 50
srl_embedding_dim = 50
dropout_rate = 0.3


# Build the question prediction model
def create_question_prediction_model():
    # Input layers
    context_input = Input(shape=(max_context_len,), name="context_input")
    token_input = Input(shape=(max_token_len,), name="token_input")
    ner_input = Input(shape=(max_token_len,), name="ner_input")
    srl_input = Input(shape=(max_token_len,), name="srl_input")
    q_type_input = Input(shape=(q_type_vocab_size,), name="q_type_input")

    # Shared embedding layer for text
    text_embedding = Embedding(vocab_size, embedding_dim, name="text_embedding")

    # Embeddings for NER and SRL
    ner_embedding = Embedding(ner_vocab_size, ner_embedding_dim, name="ner_embedding")(
        ner_input
    )
    srl_embedding = Embedding(srl_vocab_size, srl_embedding_dim, name="srl_embedding")(
        srl_input
    )

    # Apply embeddings
    context_embed = text_embedding(context_input)
    token_embed = text_embedding(token_input)

    # Bidirectional LSTMs for context and token-level features
    context_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, name="context_lstm")
    )(context_embed)

    # Concat token features (tokens, NER, SRL)
    token_features = Concatenate(name="token_features")(
        [token_embed, ner_embedding, srl_embedding]
    )
    token_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True, name="token_lstm")
    )(token_features)

    # Self-attention over the context LSTM output (query = value)
    context_attention = tf.keras.layers.Attention(name="context_attention")(
        [context_lstm, context_lstm]
    )

    # Pool attention outputs
    context_att_pool = tf.keras.layers.GlobalMaxPooling1D(name="context_att_pool")(
        context_attention
    )
    token_pool = tf.keras.layers.GlobalMaxPooling1D(name="token_pool")(token_lstm)

    # Concat all features (no answer feature)
    all_features = Concatenate(name="all_features")(
        [context_att_pool, token_pool, q_type_input]
    )

    # Dense layers with expanded capacity for sequence generation
    x = Dense(512, activation="relu", name="dense_1")(all_features)
    x = Dropout(dropout_rate)(x)
    x = Dense(256, activation="relu", name="dense_2")(x)
    x = Dropout(dropout_rate)(x)

    # Per-timestep output projection over the vocabulary
    decoder_dense = Dense(vocab_size, activation="softmax", name="decoder_dense")

    # Many-to-many architecture for sequence generation
    # Decoder LSTM
    decoder_lstm = LSTM(lstm_units * 2, return_sequences=True, name="decoder_lstm")

    # Project features into the decoder input space
    decoder_input = Dense(lstm_units * 2, activation="relu", name="decoder_input")(x)

    # Expand to the expected sequence length (a single feature vector is
    # repeated for every timestep; no teacher forcing is involved)
    repeated_vector = tf.keras.layers.RepeatVector(max_question_len)(decoder_input)

    # Process through decoder LSTM
    decoder_outputs = decoder_lstm(repeated_vector)

    # Apply the dense layer to each timestep
    question_output_seq = tf.keras.layers.TimeDistributed(decoder_dense)(
        decoder_outputs
    )

    # Create model
    model = Model(
        inputs=[
            context_input,
            token_input,
            ner_input,
            srl_input,
            q_type_input,
        ],
        outputs=question_output_seq,
    )

    # Compile with sparse categorical crossentropy for sequence prediction
    model.compile(
        optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
    )

    return model


# Build the model
model = create_question_prediction_model()
model.summary()

# Callback to save the best model
checkpoint = ModelCheckpoint(
    "question_prediction_model.h5",
    monitor="val_accuracy",
    save_best_only=True,
    verbose=1,
)

early_stop = EarlyStopping(monitor="val_accuracy", patience=10, verbose=1)

# Reshape question data for sequence-to-sequence training:
# targets become (samples, max_question_len, 1) for sparse categorical crossentropy
train_question_target = np.expand_dims(train_question, -1)
test_question_target = np.expand_dims(test_question, -1)

# Training parameters
batch_size = 8
epochs = 50

# Train model
history = model.fit(
    [train_context, train_token, train_ner, train_srl, train_q_type],
    train_question_target,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(
        [test_context, test_token, test_ner, test_srl, test_q_type],
        test_question_target,
    ),
    callbacks=[checkpoint, early_stop],
)

# Plot training history
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history["accuracy"])
plt.plot(history.history["val_accuracy"])
plt.title("Model Accuracy")
plt.ylabel("Accuracy")
plt.xlabel("Epoch")
plt.legend(["Train", "Validation"], loc="upper left")

plt.subplot(1, 2, 2)
plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.title("Model Loss")
plt.ylabel("Loss")
plt.xlabel("Epoch")
plt.legend(["Train", "Validation"], loc="upper left")
plt.tight_layout()
plt.savefig("question_prediction_training_history.png")
plt.show()

# Save the model and tokenizers
model.save("question_prediction_model_final.h5")

# Save the tokenizers
tokenizer_data = {
    "word_tokenizer": tokenizer.to_json(),
    "ner_tokenizer": ner_tokenizer.to_json(),
    "srl_tokenizer": srl_tokenizer.to_json(),
    "q_type_tokenizer": q_type_tokenizer.to_json(),
    "max_context_len": max_context_len,
    "max_question_len": max_question_len,
    "max_token_len": max_token_len,
}

with open("question_prediction_tokenizers.json", "w") as f:
    json.dump(tokenizer_data, f)

print("Model and tokenizers for question prediction saved successfully!")


# Predict a question for a single example
def predict_question(context, tokens, ner, srl, q_type):
    context = preprocess_text(context)

    context_seq = tokenizer.texts_to_sequences([context])[0]
    token_seq = tokenizer.texts_to_sequences([" ".join(tokens)])[0]
    ner_seq = ner_tokenizer.texts_to_sequences([" ".join(ner)])[0]
    srl_seq = srl_tokenizer.texts_to_sequences([" ".join(srl)])[0]

    context_padded = pad_sequences(
        [context_seq], maxlen=max_context_len, padding="post"
    )
    token_padded = pad_sequences([token_seq], maxlen=max_token_len, padding="post")
    ner_padded = pad_sequences([ner_seq], maxlen=max_token_len, padding="post")
    srl_padded = pad_sequences([srl_seq], maxlen=max_token_len, padding="post")

    # Q-type one-hot encoding
    q_type_idx = q_type_tokenizer.word_index.get(q_type, 0)
    q_type_one_hot = tf.keras.utils.to_categorical(
        [q_type_idx], num_classes=q_type_vocab_size
    )

    # Predict
    pred = model.predict(
        [context_padded, token_padded, ner_padded, srl_padded, q_type_one_hot],
        verbose=1,
    )

    # Take the highest-probability token at each position
    pred_seq = np.argmax(pred[0], axis=1)

    # Convert indices to words
    reverse_word_map = {v: k for k, v in tokenizer.word_index.items()}
    pred_words = [reverse_word_map.get(i, "") for i in pred_seq if i != 0]

    return " ".join(pred_words)


def evaluate_model_performance(test_indices):
    # Initialize ROUGE scorer
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

    # Lists to store scores
    bleu_scores = []
    rouge1_scores = []
    rouge2_scores = []
    rougel_scores = []

    # Iterate through the test indices
    for i in range(len(test_indices)):
        # Get test sample
        sample_context = contexts[test_indices[i]]
        sample_tokens = tokens_list[test_indices[i]]
        sample_ner = ner_list[test_indices[i]]
        sample_srl = srl_list[test_indices[i]]
        sample_q_type = q_types[test_indices[i]]
        actual_question = questions[test_indices[i]]

        # Predict question
        pred_question = predict_question(
            sample_context, sample_tokens, sample_ner, sample_srl, sample_q_type
        )

        # Tokenize for BLEU score
        actual_tokens = actual_question.split()
        pred_tokens = pred_question.split()

        # Calculate BLEU score (unigram through 4-gram)
        print("actual sentence   :", actual_tokens)
        print("predicted sentence:", pred_tokens)
        bleu_score = sentence_bleu([actual_tokens], pred_tokens)
        bleu_scores.append(bleu_score)

        try:
            rouge_scores = scorer.score(actual_question, pred_question)

            # Extract F1 scores
            rouge1_scores.append(rouge_scores["rouge1"].fmeasure)
            rouge2_scores.append(rouge_scores["rouge2"].fmeasure)
            rougel_scores.append(rouge_scores["rougeL"].fmeasure)
        except Exception as e:
            print(f"Error calculating ROUGE score: {e}")

    # Calculate average scores
    results = {
        "avg_bleu_score": np.mean(bleu_scores),
        "avg_rouge1": np.mean(rouge1_scores),
        "avg_rouge2": np.mean(rouge2_scores),
        "avg_rougel": np.mean(rougel_scores),
    }

    return results
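

# Note: sentence_bleu without a smoothing function scores 0 whenever some
# higher-order n-gram has no overlap, which is common for short questions.
# A smoothed variant (the seq2seq script in this PR uses method4) would be:
#   from nltk.translate.bleu_score import SmoothingFunction
#   smooth = SmoothingFunction().method4
#   bleu_score = sentence_bleu([actual_tokens], pred_tokens, smoothing_function=smooth)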


# Loaded for verification; the in-memory model and tokenizers are used below
loaded_model = load_model("question_prediction_model_final.h5")

with open("question_prediction_tokenizers.json", "r") as f:
    tokenizer_data = json.load(f)

# Take a sample from the test data
sample_idx = random.randint(0, len(test_indices) - 1)
sample_context = contexts[test_indices[sample_idx]]
sample_tokens = tokens_list[test_indices[sample_idx]]
sample_ner = ner_list[test_indices[sample_idx]]
sample_srl = srl_list[test_indices[sample_idx]]
sample_q_type = q_types[test_indices[sample_idx]]

performance_metrics = evaluate_model_performance(test_indices)

print("\nModel Performance Metrics:")
print(f"Average BLEU Score: {performance_metrics['avg_bleu_score']:.4f}")
print(f"Average ROUGE-1 Score: {performance_metrics['avg_rouge1']:.4f}")
print(f"Average ROUGE-2 Score: {performance_metrics['avg_rouge2']:.4f}")
print(f"Average ROUGE-L Score: {performance_metrics['avg_rougel']:.4f}")

@ -0,0 +1,210 @@
import numpy as np
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import re


class QuestionPredictionModel:
    def __init__(self, model_path, tokenizer_path):
        """
        Initialize question prediction model with pre-trained model and tokenizers
        """
        # Load model
        self.model = load_model(model_path)

        # Load tokenizers
        with open(tokenizer_path, 'r') as f:
            tokenizer_data = json.load(f)

        # Reconstruct tokenizers
        self.word_tokenizer = tokenizer_from_json(tokenizer_data['word_tokenizer'])
        self.ner_tokenizer = tokenizer_from_json(tokenizer_data['ner_tokenizer'])
        self.srl_tokenizer = tokenizer_from_json(tokenizer_data['srl_tokenizer'])
        self.q_type_tokenizer = tokenizer_from_json(tokenizer_data['q_type_tokenizer'])

        # Get max lengths (this answer-aware variant expects a trainer that
        # also stores 'max_answer_len' in the tokenizer JSON)
        self.max_context_len = tokenizer_data['max_context_len']
        self.max_answer_len = tokenizer_data['max_answer_len']
        self.max_question_len = tokenizer_data['max_question_len']
        self.max_token_len = tokenizer_data['max_token_len']

        # Get vocabulary sizes
        self.vocab_size = len(self.word_tokenizer.word_index) + 1
        self.q_type_vocab_size = len(self.q_type_tokenizer.word_index) + 1

    def preprocess_text(self, text):
        """Basic text preprocessing"""
        text = text.lower()
        text = re.sub(r"\s+", " ", text).strip()
        return text

    def predict_question(self, context, answer, tokens, ner, srl, q_type):
        """
        Predict a question based on given context, answer, tokens, NER, SRL, and question type

        Args:
            context (str): The context text
            answer (str): The answer to generate a question for
            tokens (list): List of tokens
            ner (list): List of NER tags corresponding to tokens
            srl (list): List of SRL tags corresponding to tokens
            q_type (str): Question type ('isian', 'opsi', or 'true_false')

        Returns:
            str: The predicted question
        """
        # Preprocess inputs
        context = self.preprocess_text(context)
        answer = self.preprocess_text(answer)

        # Convert to sequences
        context_seq = self.word_tokenizer.texts_to_sequences([context])[0]
        answer_seq = self.word_tokenizer.texts_to_sequences([answer])[0]
        tokens_seq = self.word_tokenizer.texts_to_sequences([" ".join(tokens)])[0]
        ner_seq = self.ner_tokenizer.texts_to_sequences([" ".join(ner)])[0]
        srl_seq = self.srl_tokenizer.texts_to_sequences([" ".join(srl)])[0]

        # Pad sequences
        context_padded = pad_sequences([context_seq], maxlen=self.max_context_len, padding="post")
        answer_padded = pad_sequences([answer_seq], maxlen=self.max_answer_len, padding="post")
        tokens_padded = pad_sequences([tokens_seq], maxlen=self.max_token_len, padding="post")
        ner_padded = pad_sequences([ner_seq], maxlen=self.max_token_len, padding="post")
        srl_padded = pad_sequences([srl_seq], maxlen=self.max_token_len, padding="post")

        # One-hot encode question type
        q_type_idx = self.q_type_tokenizer.word_index.get(q_type, 0)
        q_type_categorical = tf.keras.utils.to_categorical(
            [q_type_idx], num_classes=self.q_type_vocab_size
        )

        # Make prediction
        predicted_seq = self.model.predict(
            [context_padded, answer_padded, tokens_padded, ner_padded, srl_padded, q_type_categorical]
        )

        # Convert predictions to tokens (taking the highest-probability token at each position)
        predicted_indices = np.argmax(predicted_seq[0], axis=1)

        # Create reversed word index for converting indices back to words
        reverse_word_index = {v: k for k, v in self.word_tokenizer.word_index.items()}

        # Convert indices to words
        predicted_words = []
        for idx in predicted_indices:
            if idx != 0:  # Skip padding tokens
                predicted_words.append(reverse_word_index.get(idx, ''))

        # Form the question
        predicted_question = ' '.join(predicted_words)

        # Ensure the blank marker "___" is present (dataset convention)
        if "___" not in predicted_question:
            predicted_question += " ___"

        return predicted_question

    def batch_predict_questions(self, data):
        """
        Predict questions for a batch of data

        Args:
            data (list): List of dictionaries with context, tokens, ner, srl, and answers

        Returns:
            list: List of predicted questions
        """
        results = []

        for item in data:
            context = item["context"]
            tokens = item["tokens"]
            ner = item["ner"]
            srl = item["srl"]

            # If there are Q&A pairs, use them for evaluation
            if "qas" in item:
                for qa in item["qas"]:
                    answer = qa["answer"]
                    q_type = qa["type"]
                    ground_truth = qa["question"]

                    predicted_question = self.predict_question(
                        context, answer, tokens, ner, srl, q_type
                    )

                    results.append({
                        "context": context,
                        "answer": answer,
                        "predicted_question": predicted_question,
                        "ground_truth": ground_truth,
                        "question_type": q_type
                    })
            else:
                # If no Q&A pairs, generate questions for all question types
                for q_type in ["isian", "true_false", "opsi"]:
                    # For demo purposes, use a placeholder answer (would need actual answers in real use)
                    # In practice, you might extract potential answers from the context
                    placeholders = {
                        "isian": "placeholder",
                        "true_false": "true",
                        "opsi": "placeholder"
                    }

                    predicted_question = self.predict_question(
                        context, placeholders[q_type], tokens, ner, srl, q_type
                    )

                    results.append({
                        "context": context,
                        "predicted_question": predicted_question,
                        "question_type": q_type
                    })

        return results


# Example usage
if __name__ == "__main__":
    # Load test data
    with open("data_converted.json", "r") as f:
        test_data = json.load(f)

    # Initialize model
    question_predictor = QuestionPredictionModel(
        model_path="question_prediction_model_final.h5",
        tokenizer_path="question_prediction_tokenizers.json"
    )

    # Example single prediction
    sample = test_data[0]
    context = sample["context"]
    tokens = sample["tokens"]
    ner = sample["ner"]
    srl = sample["srl"]
    answer = sample["qas"][0]["answer"]
    q_type = sample["qas"][0]["type"]

    predicted_question = question_predictor.predict_question(
        context, answer, tokens, ner, srl, q_type
    )

    print(f"Context: {context}")
    print(f"Answer: {answer}")
    print(f"Question Type: {q_type}")
    print(f"Predicted Question: {predicted_question}")
    print(f"Ground Truth: {sample['qas'][0]['question']}")

    # Batch prediction
    results = question_predictor.batch_predict_questions(test_data[:3])

    print("\nBatch Results:")
    for i, result in enumerate(results):
        print(f"\nResult {i+1}:")
        print(f"Context: {result['context']}")
        print(f"Answer: {result.get('answer', 'N/A')}")
        print(f"Question Type: {result['question_type']}")
        print(f"Predicted Question: {result['predicted_question']}")
        if 'ground_truth' in result:
            print(f"Ground Truth: {result['ground_truth']}")
@ -0,0 +1,188 @@
import numpy as np
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import re


class QuestionPredictionModel:
    def __init__(self, model_path, tokenizer_path):
        """
        Initialize question prediction model with pre-trained model and tokenizers
        """
        # Load model
        self.model = load_model(model_path)

        # Load tokenizers
        with open(tokenizer_path, "r") as f:
            tokenizer_data = json.load(f)

        # Reconstruct tokenizers
        self.word_tokenizer = tokenizer_from_json(tokenizer_data["word_tokenizer"])
        self.ner_tokenizer = tokenizer_from_json(tokenizer_data["ner_tokenizer"])
        self.srl_tokenizer = tokenizer_from_json(tokenizer_data["srl_tokenizer"])
        self.q_type_tokenizer = tokenizer_from_json(tokenizer_data["q_type_tokenizer"])

        # Get max lengths
        self.max_context_len = tokenizer_data["max_context_len"]
        self.max_question_len = tokenizer_data["max_question_len"]
        self.max_token_len = tokenizer_data["max_token_len"]

        # Get vocabulary sizes
        self.vocab_size = len(self.word_tokenizer.word_index) + 1
        self.q_type_vocab_size = len(self.q_type_tokenizer.word_index) + 1

    def preprocess_text(self, text):
        """Basic text preprocessing"""
        text = text.lower()
        text = re.sub(r"\s+", " ", text).strip()
        return text

    def predict_question(self, context, tokens, ner, srl, q_type):
        """Predict a question from the context and token-level features"""
        # Preprocess
        context = self.preprocess_text(context)

        # Convert to sequences
        context_seq = self.word_tokenizer.texts_to_sequences([context])[0]
        token_seq = self.word_tokenizer.texts_to_sequences([" ".join(tokens)])[0]
        ner_seq = self.ner_tokenizer.texts_to_sequences([" ".join(ner)])[0]
        srl_seq = self.srl_tokenizer.texts_to_sequences([" ".join(srl)])[0]

        # Pad sequences
        context_padded = pad_sequences(
            [context_seq], maxlen=self.max_context_len, padding="post"
        )
        token_padded = pad_sequences(
            [token_seq], maxlen=self.max_token_len, padding="post"
        )
        ner_padded = pad_sequences([ner_seq], maxlen=self.max_token_len, padding="post")
        srl_padded = pad_sequences([srl_seq], maxlen=self.max_token_len, padding="post")

        # Q-type one-hot encoding
        q_type_idx = self.q_type_tokenizer.word_index.get(q_type, 0)
        q_type_one_hot = tf.keras.utils.to_categorical(
            [q_type_idx], num_classes=self.q_type_vocab_size
        )

        # Predict
        pred = self.model.predict(
            [context_padded, token_padded, ner_padded, srl_padded, q_type_one_hot]
        )

        # Take the highest-probability token at each position
        pred_seq = np.argmax(pred[0], axis=1)

        # Convert indices to words
        reverse_word_map = {v: k for k, v in self.word_tokenizer.word_index.items()}
        pred_words = [reverse_word_map.get(i, "") for i in pred_seq if i != 0]

        return " ".join(pred_words)

    def batch_predict_questions(self, data):
        """
        Predict questions for a batch of data

        Args:
            data (list): List of dictionaries with context, tokens, ner, and srl

        Returns:
            list: List of predicted questions
        """
        results = []

        for item in data:
            context = item["context"]
            tokens = item["tokens"]
            ner = item["ner"]
            srl = item["srl"]

            # If there are Q&A pairs, use them for evaluation
            if "qas" in item:
                for qa in item["qas"]:
                    q_type = qa["type"]
                    ground_truth = qa["question"]

                    predicted_question = self.predict_question(
                        context, tokens, ner, srl, q_type
                    )

                    results.append(
                        {
                            "context": context,
                            "predicted_question": predicted_question,
                            "ground_truth": ground_truth,
                            "question_type": q_type,
                        }
                    )
            else:
                # If no Q&A pairs, generate a question for every question type.
                # Unlike the answer-aware variant, this model takes no answer
                # input, so no placeholder answer is passed (the extra
                # placeholder argument in the earlier draft was a bug).
                for q_type in ["isian", "true_false", "opsi"]:
                    predicted_question = self.predict_question(
                        context, tokens, ner, srl, q_type
                    )

                    results.append(
                        {
                            "context": context,
                            "predicted_question": predicted_question,
                            "question_type": q_type,
                        }
                    )

        return results


# Example usage
if __name__ == "__main__":
    # Load test data
    with open("../dataset/conteks_question.json", "r") as f:
        test_data = json.load(f)

    # Initialize model
    question_predictor = QuestionPredictionModel(
        model_path="question_prediction_model_final.h5",
        tokenizer_path="question_prediction_tokenizers.json",
    )

    # Example single prediction
    sample = test_data[1]
    context = sample["context"]
    tokens = sample["tokens"]
    ner = sample["ner"]
    srl = sample["srl"]
    answer = sample["qas"][0]["answer"]
    q_type = sample["qas"][0]["type"]

    predicted_question = question_predictor.predict_question(
        context, tokens, ner, srl, q_type
    )

    print(f"Context: {context}")
    print(f"Answer: {answer}")
    print(f"Question Type: {q_type}")
    print(f"Predicted Question: {predicted_question}")
    print(f"Ground Truth: {sample['qas'][0]['question']}")

    # Batch prediction
    # results = question_predictor.batch_predict_questions(test_data[:3])

    # print("\nBatch Results:")
    # for i, result in enumerate(results):
    #     print(f"\nResult {i+1}:")
    #     print(f"Context: {result['context']}")
    #     print(f"Answer: {result.get('answer', 'N/A')}")
    #     print(f"Question Type: {result['question_type']}")
    #     print(f"Predicted Question: {result['predicted_question']}")
    #     if "ground_truth" in result:
    #         print(f"Ground Truth: {result['ground_truth']}")
After Width: | Height: | Size: 53 KiB |
After Width: | Height: | Size: 51 KiB |
After Width: | Height: | Size: 88 KiB |

@ -0,0 +1,389 @@

# ===============================================================
# Seq2Seq-LSTM + Luong Attention for a Question-Answer Generator
# + Greedy & Beam Search decoding + BLEU-4 evaluation
# ===============================================================
# • All embeddings use mask_zero=True (padding is masked)
# • Encoder = Bidirectional LSTM (return_sequences=True)
# • Decoder = LSTM + Luong Attention (keras.layers.Attention).
# • Greedy & beam-search inference sub-models are built separately
#   (encoder, decoder-Q-step, decoder-A-step).
# • BLEU score (nltk corpus_bleu) for evaluating questions & answers.
# ---------------------------------------------------------------
# USAGE
# 1. pip install nltk
# 2. python seq2seq_qa_attention.py   # train + save the model
# 3. run evaluate_bleu()              # compute BLEU on validation/test
# ===============================================================

import json
from pathlib import Path
from itertools import chain
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (
    Input, Embedding, LSTM, Bidirectional, Concatenate,
    Dense, TimeDistributed, Attention
)
from tensorflow.keras.models import Model
from nltk.translate.bleu_score import corpus_bleu  # pip install nltk

# ----------------------- 1. Load & flatten data ----------------------------
RAW = json.loads(Path("../dataset/dev_dataset_test.json").read_text())

samples = []
for item in RAW:
    for qp in item["quiz_posibility"]:
        samp = {
            "tokens": [t.lower() for t in item["tokens"]],
            "ner": item["ner"],
            "srl": item["srl"],
            "q_type": qp["type"],
            "q_toks": [t.lower() for t in qp["question"]] + ["<eos>"],
        }
        if isinstance(qp["answer"], list):
            samp["a_toks"] = [t.lower() for t in qp["answer"]] + ["<eos>"]
        else:
            samp["a_toks"] = [qp["answer"].lower(), "<eos>"]
        samples.append(samp)

print("Total flattened samples:", len(samples))

# ----------------------- 2. Build vocabularies -----------------------------

def build_vocab(seq_iter, reserved=("<pad>", "<unk>", "<sos>", "<eos>")):
    vocab = {tok: idx for idx, tok in enumerate(reserved)}
    for tok in chain.from_iterable(seq_iter):
        if tok not in vocab:
            vocab[tok] = len(vocab)
    return vocab
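

# Example: build_vocab([["soekarno", "membacakan"]]) yields
# {"<pad>": 0, "<unk>": 1, "<sos>": 2, "<eos>": 3, "soekarno": 4, "membacakan": 5}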


v_tok = build_vocab((s["tokens"] for s in samples))
v_ner = build_vocab((s["ner"] for s in samples), reserved=("<pad>", "<unk>"))
v_srl = build_vocab((s["srl"] for s in samples), reserved=("<pad>", "<unk>"))
v_q = build_vocab((s["q_toks"] for s in samples))
v_a = build_vocab((s["a_toks"] for s in samples))
v_typ = {"isian": 0, "opsi": 1, "true_false": 2}

iv_q = {i: t for t, i in v_q.items()}
iv_a = {i: t for t, i in v_a.items()}

# ----------------------- 3. Vectorise + pad -------------------------------

def encode(seq, vmap):
    return [vmap.get(tok, vmap["<unk>"]) for tok in seq]

MAX_SENT = max(len(s["tokens"]) for s in samples)
MAX_Q = max(len(s["q_toks"]) for s in samples)
MAX_A = max(len(s["a_toks"]) for s in samples)

X_tok_ids = pad_sequences([encode(s["tokens"], v_tok) for s in samples],
                          maxlen=MAX_SENT, padding="post")
X_ner_ids = pad_sequences([encode(s["ner"], v_ner) for s in samples],
                          maxlen=MAX_SENT, padding="post")
X_srl_ids = pad_sequences([encode(s["srl"], v_srl) for s in samples],
                          maxlen=MAX_SENT, padding="post")

q_in_ids = pad_sequences([[v_q["<sos>"], *encode(s["q_toks"][:-1], v_q)]
                          for s in samples], maxlen=MAX_Q, padding="post")
q_out_ids = pad_sequences([encode(s["q_toks"], v_q) for s in samples],
                          maxlen=MAX_Q, padding="post")

a_in_ids = pad_sequences([[v_a["<sos>"], *encode(s["a_toks"][:-1], v_a)]
                          for s in samples], maxlen=MAX_A, padding="post")
a_out_ids = pad_sequences([encode(s["a_toks"], v_a) for s in samples],
                          maxlen=MAX_A, padding="post")
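
# Teacher-forcing layout, e.g. for q_toks = ["siapa", "proklamator", "<eos>"]:
#   decoder input  (q_in_ids) : [<sos>, siapa, proklamator]   (shifted right)
#   decoder target (q_out_ids): [siapa, proklamator, <eos>]   (one step ahead)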

y_type_ids = np.array([v_typ[s["q_type"]] for s in samples])

# ----------------------- 4. Hyper-params ----------------------------------
d_tok = 32    # token embedding dim
d_tag = 16    # NER / SRL embedding dim
units = 64    # per direction of the BiLSTM
lat_dim = units * 2

# ----------------------- 5. Build model -----------------------------------
# Encoder ----------------------------------------------------------

tok_in = Input((MAX_SENT,), dtype="int32", name="tok_in")
ner_in = Input((MAX_SENT,), dtype="int32", name="ner_in")
srl_in = Input((MAX_SENT,), dtype="int32", name="srl_in")

emb_tok = Embedding(len(v_tok), d_tok, mask_zero=True, name="emb_tok")(tok_in)
emb_ner = Embedding(len(v_ner), d_tag, mask_zero=True, name="emb_ner")(ner_in)
emb_srl = Embedding(len(v_srl), d_tag, mask_zero=True, name="emb_srl")(srl_in)

enc_concat = Concatenate(name="enc_concat")([emb_tok, emb_ner, emb_srl])
bi_lstm = Bidirectional(LSTM(units, return_sequences=True, return_state=True),
                        name="encoder_bi_lstm")
enc_seq, f_h, f_c, b_h, b_c = bi_lstm(enc_concat)
enc_h = Concatenate()([f_h, b_h])  # (B, lat_dim): forward + backward states
enc_c = Concatenate()([f_c, b_c])

# Decoder – QUESTION ----------------------------------------------
q_in = Input((MAX_Q,), dtype="int32", name="q_in")
# 💡 mask_zero=False so the Attention layer does not clash with the encoder mask
q_emb = Embedding(len(v_q), d_tok, mask_zero=False, name="q_emb")(q_in)

dec_q_lstm = LSTM(lat_dim, return_sequences=True, return_state=True,
                  name="decoder_q_lstm")
q_seq, q_h, q_c = dec_q_lstm(q_emb, initial_state=[enc_h, enc_c])

enc_proj_q = TimeDistributed(Dense(lat_dim), name="enc_proj_q")(enc_seq)
attn_q = Attention(name="attn_q")([q_seq, enc_proj_q])
q_concat = Concatenate(name="q_concat")([q_seq, attn_q])
q_out = TimeDistributed(Dense(len(v_q), activation="softmax"), name="q_out")(q_concat)

# Decoder – ANSWER -------------------------------------------------
a_in = Input((MAX_A,), dtype="int32", name="a_in")
# also mask_zero=False
a_emb = Embedding(len(v_a), d_tok, mask_zero=False, name="a_emb")(a_in)

dec_a_lstm = LSTM(lat_dim, return_sequences=True, return_state=True,
                  name="decoder_a_lstm")
a_seq, _, _ = dec_a_lstm(a_emb, initial_state=[q_h, q_c])

enc_proj_a = TimeDistributed(Dense(lat_dim), name="enc_proj_a")(enc_seq)
attn_a = Attention(name="attn_a")([a_seq, enc_proj_a])
a_concat = Concatenate(name="a_concat")([a_seq, attn_a])
a_out = TimeDistributed(Dense(len(v_a), activation="softmax"), name="a_out")(a_concat)

# Classifier -------------------------------------------------------
type_dense = Dense(len(v_typ), activation="softmax", name="type_out")(enc_h)

model = Model(inputs=[tok_in, ner_in, srl_in, q_in, a_in],
              outputs=[q_out, a_out, type_dense])
model.summary()

# ----------------------- 6. Compile & train ------------------------------
losses = {
    "q_out": "sparse_categorical_crossentropy",
    "a_out": "sparse_categorical_crossentropy",
    "type_out": "sparse_categorical_crossentropy",
}
loss_weights = {"q_out": 1.0, "a_out": 1.0, "type_out": 0.3}

model.compile(optimizer="adam", loss=losses, loss_weights=loss_weights,
              metrics={"q_out": "sparse_categorical_accuracy",
                       "a_out": "sparse_categorical_accuracy",
                       "type_out": "accuracy"})

history = model.fit(
    [X_tok_ids, X_ner_ids, X_srl_ids, q_in_ids, a_in_ids],
    [q_out_ids, a_out_ids, y_type_ids],
    validation_split=0.1,
    epochs=30,
    batch_size=64,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=4, restore_best_weights=True)],
    verbose=1,
)

model.save("seq2seq_attn.keras")
print("Model saved to seq2seq_attn.keras")
|
||||
|
||||
# ----------------------- 7. Inference sub‑models --------------------------
|
||||
# Encoder model
|
||||
encoder_model = Model([tok_in, ner_in, srl_in], [enc_seq, enc_h, enc_c])
|
||||
|
||||
# Question decoder step model ------------------------------------------------
# Inputs
q_token_in = Input((1,), dtype="int32", name="q_token_in")
enc_seq_in = Input((MAX_SENT, lat_dim), name="enc_seq_in")  # not consumed by the graph; kept so predict() mirrors the training inputs
enc_proj_q_in = Input((MAX_SENT, lat_dim), name="enc_proj_q_in")
state_h_in = Input((lat_dim,), name="state_h_in")
state_c_in = Input((lat_dim,), name="state_c_in")

# Embedding (reuse trained weights)
q_emb_step = model.get_layer("q_emb")(q_token_in)

# LSTM (reuse weights)
q_lstm_step, h_out, c_out = model.get_layer("decoder_q_lstm")(
    q_emb_step, initial_state=[state_h_in, state_c_in])
# Attention
attn_step = model.get_layer("attn_q")([q_lstm_step, enc_proj_q_in])
q_concat_step = Concatenate()([q_lstm_step, attn_step])
q_logits_step = model.get_layer("q_out")(q_concat_step)

decoder_q_step = Model([q_token_in, enc_seq_in, enc_proj_q_in, state_h_in, state_c_in],
                       [q_logits_step, h_out, c_out])

# Answer decoder step model --------------------------------------------------
a_token_in = Input((1,), dtype="int32", name="a_token_in")
enc_proj_a_in = Input((MAX_SENT, lat_dim), name="enc_proj_a_in")
state_h_a_in = Input((lat_dim,), name="state_h_a_in")
state_c_a_in = Input((lat_dim,), name="state_c_a_in")

# Embedding reuse
a_emb_step = model.get_layer("a_emb")(a_token_in)

# LSTM reuse
a_lstm_step, h_a_out, c_a_out = model.get_layer("decoder_a_lstm")(
    a_emb_step, initial_state=[state_h_a_in, state_c_a_in])
# Attention reuse
attn_a_step = model.get_layer("attn_a")([a_lstm_step, enc_proj_a_in])
a_concat_step = Concatenate()([a_lstm_step, attn_a_step])
a_logits_step = model.get_layer("a_out")(a_concat_step)

decoder_a_step = Model([a_token_in, enc_proj_a_in, state_h_a_in, state_c_a_in],
                       [a_logits_step, h_a_out, c_a_out])

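# Shape sanity check for the step models (illustrative sketch; the zero ids
# below are dummies, not real vocabulary entries):
#
#   z = np.zeros((1, MAX_SENT), dtype="int32")
#   seq0, h0, c0 = encoder_model.predict([z, z, z], verbose=0)
#   proj0 = model.get_layer("enc_proj_q")(seq0)
#   logits0, h1, c1 = decoder_q_step.predict(
#       [np.array([[v_q["<sos>"]]]), seq0, proj0, h0, c0], verbose=0)
#   assert logits0.shape == (1, 1, len(v_q))
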
# ----------------------- 8. Decoding helpers ------------------------------

def encode_and_pad(seq, vmap, max_len):
    ids = encode(seq, vmap)[:max_len]  # truncate defensively, then pad to max_len
    return ids + [vmap["<pad>"]] * (max_len - len(ids))

def greedy_decode(tokens, ner, srl, max_q=20, max_a=10):
    """Return generated (question_tokens, answer_tokens, q_type_str)."""
    # --- encoder ---------------------------------------------------------
    enc_tok = np.array([encode_and_pad(tokens, v_tok, MAX_SENT)])
    enc_ner = np.array([encode_and_pad(ner, v_ner, MAX_SENT)])
    enc_srl = np.array([encode_and_pad(srl, v_srl, MAX_SENT)])

    enc_seq_val, h, c = encoder_model.predict([enc_tok, enc_ner, enc_srl], verbose=0)
    enc_proj_q_val = model.get_layer("enc_proj_q")(enc_seq_val)
    enc_proj_a_val = model.get_layer("enc_proj_a")(enc_seq_val)

    # --- greedy Question --------------------------------------------------
    q_ids = []
    tgt = np.array([[v_q["<sos>"]]])
    for _ in range(max_q):
        logits, h, c = decoder_q_step.predict([tgt, enc_seq_val, enc_proj_q_val, h, c], verbose=0)
        next_id = int(logits[0, 0].argmax())
        if next_id == v_q["<eos>"]:
            break
        q_ids.append(next_id)
        tgt = np.array([[next_id]])

    # --- greedy Answer: continue from the question decoder's final state ---
    # (the last q_h, q_c are already held in h, c)
    a_ids = []
    tgt_a = np.array([[v_a["<sos>"]]])
    for _ in range(max_a):
        logits_a, h, c = decoder_a_step.predict([tgt_a, enc_proj_a_val, h, c], verbose=0)
        next_a = int(logits_a[0, 0].argmax())
        if next_a == v_a["<eos>"]:
            break
        a_ids.append(next_a)
        tgt_a = np.array([[next_a]])

    # Question type (decoder inputs are dummies; only the encoder branch matters)
    typ_logits = model.predict([enc_tok, enc_ner, enc_srl, np.zeros((1, MAX_Q)), np.zeros((1, MAX_A))], verbose=0)[2]
    typ_id = int(typ_logits.argmax())
    q_type = [k for k, v in v_typ.items() if v == typ_id][0]

    question = [iv_q.get(i, "<unk>") for i in q_ids]
    answer = [iv_a.get(i, "<unk>") for i in a_ids]
    return question, answer, q_type

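# Example call to greedy_decode (the sentence and its NER/SRL tags below are
# made up for illustration; real inputs come from the tagged dataset):
#
#   toks = ["Budi", "pergi", "ke", "Bandung", "kemarin"]
#   ners = ["PER", "O", "O", "LOC", "O"]
#   srls = ["ARG0", "V", "O", "ARGM-LOC", "ARGM-TMP"]
#   q, a, t = greedy_decode(toks, ners, srls)
#   print(" ".join(q), "|", " ".join(a), "|", t)
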
def beam_decode(tokens, ner, srl, beam_width=5, max_q=20, max_a=10):
    """Beam-search decoding. Returns the best (question_tokens, answer_tokens, q_type)."""
    enc_tok = np.array([encode_and_pad(tokens, v_tok, MAX_SENT)])
    enc_ner = np.array([encode_and_pad(ner, v_ner, MAX_SENT)])
    enc_srl = np.array([encode_and_pad(srl, v_srl, MAX_SENT)])
    enc_seq_val, h0, c0 = encoder_model.predict([enc_tok, enc_ner, enc_srl], verbose=0)
    enc_proj_q_val = model.get_layer("enc_proj_q")(enc_seq_val)
    enc_proj_a_val = model.get_layer("enc_proj_a")(enc_seq_val)

    # ----- Beam for Question ----------------------------------------------
    beam = [([v_q["<sos>"]], 0.0, h0, c0)]  # (sequence, logP, h, c)
    completed_q = []
    for _ in range(max_q):
        new_beam = []
        for seq, logp, h, c in beam:
            tgt = np.array([[seq[-1]]])
            logits, next_h, next_c = decoder_q_step.predict(
                [tgt, enc_seq_val, enc_proj_q_val, h, c], verbose=0)
            log_probs = np.log(logits[0, 0] + 1e-8)
            top_ids = np.argsort(log_probs)[-beam_width:]
            for nid in top_ids:
                new_beam.append((seq + [int(nid)], logp + log_probs[nid], next_h, next_c))
        # keep the best beam_width candidates, then move finished ones out
        candidates = sorted(new_beam, key=lambda x: x[1], reverse=True)[:beam_width]
        beam = []
        for seq, logp, h, c in candidates:
            if seq[-1] == v_q["<eos>"] or len(seq) >= max_q:
                completed_q.append((seq, logp, h, c))
            else:
                beam.append((seq, logp, h, c))
        if not beam:
            break
    best_q = max(completed_q, key=lambda x: x[1]) if completed_q else max(beam, key=lambda x: x[1])

    q_seq_ids, _, h_q, c_q = best_q
    q_ids = [i for i in q_seq_ids[1:] if i != v_q["<eos>"]]

    # ----- Beam for Answer --------------------------------------------------
    beam = [([v_a["<sos>"]], 0.0, h_q, c_q)]
    completed_a = []
    for _ in range(max_a):
        new_beam = []
        for seq, logp, h, c in beam:
            tgt = np.array([[seq[-1]]])
            logits, next_h, next_c = decoder_a_step.predict(
                [tgt, enc_proj_a_val, h, c], verbose=0)
            log_probs = np.log(logits[0, 0] + 1e-8)
            top_ids = np.argsort(log_probs)[-beam_width:]
            for nid in top_ids:
                new_beam.append((seq + [int(nid)], logp + log_probs[nid], next_h, next_c))
        candidates = sorted(new_beam, key=lambda x: x[1], reverse=True)[:beam_width]
        beam = []
        for seq, logp, h, c in candidates:
            if seq[-1] == v_a["<eos>"] or len(seq) >= max_a:
                completed_a.append((seq, logp))
            else:
                beam.append((seq, logp, h, c))
        if not beam:
            break
    if completed_a:
        best_a_seq, _ = max(completed_a, key=lambda x: x[1])
    else:
        # beam entries are 4-tuples, so take only the sequence
        best_a_seq = max(beam, key=lambda x: x[1])[0]
    a_ids = [i for i in best_a_seq[1:] if i != v_a["<eos>"]]

    # Question type classification (same dummy-decoder trick as greedy_decode)
    typ_logits = model.predict([enc_tok, enc_ner, enc_srl, np.zeros((1, MAX_Q)), np.zeros((1, MAX_A))], verbose=0)[2]
    typ_id = int(typ_logits.argmax())
    q_type = [k for k, v in v_typ.items() if v == typ_id][0]

    question = [iv_q.get(i, "<unk>") for i in q_ids]
    answer = [iv_a.get(i, "<unk>") for i in a_ids]

    return question, answer, q_type

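# Note: ranking hypotheses by raw summed log-probability biases beam search
# toward short outputs. A common tweak (a sketch, not used above) is length
# normalisation when selecting the winner, e.g. for the question beam:
#
#   best_q = max(completed_q, key=lambda x: x[1] / max(len(x[0]) - 1, 1))
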
# ----------------------- 9. BLEU evaluation -------------------------------

def evaluate_bleu(split_ratio=0.1, beam=False):
    """Compute corpus BLEU-4 on a random hold-out split."""
    # corpus_bleu is assumed to be imported earlier
    # (e.g. from nltk.translate.bleu_score)
    n_total = len(samples)
    n_val = int(n_total * split_ratio)
    idxs = np.random.choice(n_total, n_val, replace=False)

    refs_q, hyps_q = [], []
    refs_a, hyps_a = [], []

    for i in idxs:
        s = samples[i]
        question_pred, answer_pred, _ = (beam_decode if beam else greedy_decode)(
            s["tokens"], s["ner"], s["srl"],
        )
        refs_q.append([s["q_toks"][:-1]])  # exclude <eos>
        hyps_q.append(question_pred)
        refs_a.append([s["a_toks"][:-1]])
        hyps_a.append(answer_pred)

    bleu_q = corpus_bleu(refs_q, hyps_q)
    bleu_a = corpus_bleu(refs_a, hyps_a)
    print(f"BLEU-4 Question: {bleu_q:.3f}\nBLEU-4 Answer : {bleu_a:.3f}")


# Example usage:
evaluate_bleu(beam=False)
evaluate_bleu(beam=True)
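
# With short hypotheses corpus_bleu can emit zero scores and warnings; if that
# happens, NLTK's smoothing is a common fix (a sketch, assuming nltk provides
# corpus_bleu here):
#
#   from nltk.translate.bleu_score import SmoothingFunction
#   chencherry = SmoothingFunction()
#   bleu_q = corpus_bleu(refs_q, hyps_q, smoothing_function=chencherry.method1)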