feat: adding qc model and adding dataset
This commit is contained in:
parent
3a04f94fb3
commit
1c270d4e75
Binary file not shown.
|
@ -2,7 +2,7 @@
|
|||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 118,
|
||||
"execution_count": 13,
|
||||
"id": "fb106e20",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -19,7 +19,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 119,
|
||||
"execution_count": 14,
|
||||
"id": "00347a5f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -54,7 +54,6 @@
|
|||
" for line in f:\n",
|
||||
" line = line.strip()\n",
|
||||
" if not line:\n",
|
||||
" # Jika baris kosong → akhir kalimat\n",
|
||||
" if tokens:\n",
|
||||
" data.append({\n",
|
||||
" \"tokens\": tokens,\n",
|
||||
|
@ -82,7 +81,15 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 120,
|
||||
"execution_count": null,
|
||||
"id": "3793950a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "ac8eb374",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -103,7 +110,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 121,
|
||||
"execution_count": 16,
|
||||
"id": "80356f1f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -130,7 +137,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 122,
|
||||
"execution_count": 17,
|
||||
"id": "fe219c96",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -138,25 +145,25 @@
|
|||
"X_train, X_test, y_ner_train, y_ner_test, y_srl_train, y_srl_test = train_test_split(\n",
|
||||
" X, y_ner, y_srl, \n",
|
||||
" test_size=0.20, \n",
|
||||
" random_state=42, # supaya reproducible\n",
|
||||
" shuffle=True # acak baris\n",
|
||||
" random_state=42,\n",
|
||||
" shuffle=True \n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 123,
|
||||
"execution_count": 18,
|
||||
"id": "7a9636b6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">Model: \"functional_13\"</span>\n",
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">Model: \"functional_1\"</span>\n",
|
||||
"</pre>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"\u001b[1mModel: \"functional_13\"\u001b[0m\n"
|
||||
"\u001b[1mModel: \"functional_1\"\u001b[0m\n"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
|
@ -168,19 +175,19 @@
|
|||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
|
||||
"┃<span style=\"font-weight: bold\"> Layer (type) </span>┃<span style=\"font-weight: bold\"> Output Shape </span>┃<span style=\"font-weight: bold\"> Param # </span>┃<span style=\"font-weight: bold\"> Connected to </span>┃\n",
|
||||
"┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
|
||||
"│ input_layer_13 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ - │\n",
|
||||
"│ input_layer_1 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ - │\n",
|
||||
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">InputLayer</span>) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ embedding_13 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">64</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">44,544</span> │ input_layer_13[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>… │\n",
|
||||
"│ embedding_1 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">64</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">44,544</span> │ input_layer_1[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>]… │\n",
|
||||
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ bidirectional_13 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">128</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">66,048</span> │ embedding_13[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">…</span> │\n",
|
||||
"│ bidirectional_1 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">128</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">66,048</span> │ embedding_1[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
|
||||
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Bidirectional</span>) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ ner_output │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">25</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">3,225</span> │ bidirectional_13… │\n",
|
||||
"│ ner_output │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">25</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">3,225</span> │ bidirectional_1[<span style=\"color: #00af00; text-decoration-color: #00af00\">…</span> │\n",
|
||||
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">TimeDistributed</span>) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ srl_output │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">20</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">2,580</span> │ bidirectional_13… │\n",
|
||||
"│ srl_output │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">18</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">2,322</span> │ bidirectional_1[<span style=\"color: #00af00; text-decoration-color: #00af00\">…</span> │\n",
|
||||
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">TimeDistributed</span>) │ │ │ │\n",
|
||||
"└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n",
|
||||
"</pre>\n"
|
||||
|
@ -189,19 +196,19 @@
|
|||
"┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
|
||||
"┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mConnected to \u001b[0m\u001b[1m \u001b[0m┃\n",
|
||||
"┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
|
||||
"│ input_layer_13 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
|
||||
"│ input_layer_1 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
|
||||
"│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ embedding_13 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m44,544\u001b[0m │ input_layer_13[\u001b[38;5;34m0\u001b[0m… │\n",
|
||||
"│ embedding_1 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m44,544\u001b[0m │ input_layer_1[\u001b[38;5;34m0\u001b[0m]… │\n",
|
||||
"│ (\u001b[38;5;33mEmbedding\u001b[0m) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ bidirectional_13 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m66,048\u001b[0m │ embedding_13[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n",
|
||||
"│ bidirectional_1 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m66,048\u001b[0m │ embedding_1[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
|
||||
"│ (\u001b[38;5;33mBidirectional\u001b[0m) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ ner_output │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m25\u001b[0m) │ \u001b[38;5;34m3,225\u001b[0m │ bidirectional_13… │\n",
|
||||
"│ ner_output │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m25\u001b[0m) │ \u001b[38;5;34m3,225\u001b[0m │ bidirectional_1[\u001b[38;5;34m…\u001b[0m │\n",
|
||||
"│ (\u001b[38;5;33mTimeDistributed\u001b[0m) │ │ │ │\n",
|
||||
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
||||
"│ srl_output │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m20\u001b[0m) │ \u001b[38;5;34m2,580\u001b[0m │ bidirectional_13… │\n",
|
||||
"│ srl_output │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m18\u001b[0m) │ \u001b[38;5;34m2,322\u001b[0m │ bidirectional_1[\u001b[38;5;34m…\u001b[0m │\n",
|
||||
"│ (\u001b[38;5;33mTimeDistributed\u001b[0m) │ │ │ │\n",
|
||||
"└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n"
|
||||
]
|
||||
|
@ -212,11 +219,11 @@
|
|||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Total params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">116,397</span> (454.68 KB)\n",
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Total params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">116,139</span> (453.67 KB)\n",
|
||||
"</pre>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"\u001b[1m Total params: \u001b[0m\u001b[38;5;34m116,397\u001b[0m (454.68 KB)\n"
|
||||
"\u001b[1m Total params: \u001b[0m\u001b[38;5;34m116,139\u001b[0m (453.67 KB)\n"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
|
@ -225,11 +232,11 @@
|
|||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">116,397</span> (454.68 KB)\n",
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">116,139</span> (453.67 KB)\n",
|
||||
"</pre>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m116,397\u001b[0m (454.68 KB)\n"
|
||||
"\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m116,139\u001b[0m (453.67 KB)\n"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
|
@ -253,25 +260,25 @@
|
|||
"output_type": "stream",
|
||||
"text": [
|
||||
"Epoch 1/10\n",
|
||||
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 19ms/step - loss: 3.3010 - ner_output_accuracy: 0.8807 - ner_output_loss: 1.5617 - srl_output_accuracy: 0.7456 - srl_output_loss: 1.7393 - val_loss: 0.7284 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2466 - val_srl_output_accuracy: 0.8300 - val_srl_output_loss: 0.4818\n",
|
||||
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 19ms/step - loss: 3.2850 - ner_output_accuracy: 0.8700 - ner_output_loss: 1.6767 - srl_output_accuracy: 0.7518 - srl_output_loss: 1.6083 - val_loss: 0.7275 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2555 - val_srl_output_accuracy: 0.8450 - val_srl_output_loss: 0.4720\n",
|
||||
"Epoch 2/10\n",
|
||||
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.7355 - ner_output_accuracy: 0.9569 - ner_output_loss: 0.2279 - srl_output_accuracy: 0.8297 - srl_output_loss: 0.5076 - val_loss: 0.6655 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2323 - val_srl_output_accuracy: 0.8506 - val_srl_output_loss: 0.4332\n",
|
||||
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.7622 - ner_output_accuracy: 0.9528 - ner_output_loss: 0.2458 - srl_output_accuracy: 0.8296 - srl_output_loss: 0.5163 - val_loss: 0.6534 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2296 - val_srl_output_accuracy: 0.8531 - val_srl_output_loss: 0.4238\n",
|
||||
"Epoch 3/10\n",
|
||||
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.7041 - ner_output_accuracy: 0.9522 - ner_output_loss: 0.2219 - srl_output_accuracy: 0.8488 - srl_output_loss: 0.4822 - val_loss: 0.6368 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2232 - val_srl_output_accuracy: 0.8744 - val_srl_output_loss: 0.4135\n",
|
||||
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.6875 - ner_output_accuracy: 0.9572 - ner_output_loss: 0.2126 - srl_output_accuracy: 0.8496 - srl_output_loss: 0.4750 - val_loss: 0.6327 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2273 - val_srl_output_accuracy: 0.8688 - val_srl_output_loss: 0.4054\n",
|
||||
"Epoch 4/10\n",
|
||||
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.6864 - ner_output_accuracy: 0.9520 - ner_output_loss: 0.2184 - srl_output_accuracy: 0.8548 - srl_output_loss: 0.4680 - val_loss: 0.6078 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2193 - val_srl_output_accuracy: 0.8769 - val_srl_output_loss: 0.3885\n",
|
||||
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 11ms/step - loss: 0.6103 - ner_output_accuracy: 0.9533 - ner_output_loss: 0.2114 - srl_output_accuracy: 0.8772 - srl_output_loss: 0.3988 - val_loss: 0.6009 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2137 - val_srl_output_accuracy: 0.8662 - val_srl_output_loss: 0.3872\n",
|
||||
"Epoch 5/10\n",
|
||||
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.6304 - ner_output_accuracy: 0.9545 - ner_output_loss: 0.2009 - srl_output_accuracy: 0.8675 - srl_output_loss: 0.4295 - val_loss: 0.5727 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2015 - val_srl_output_accuracy: 0.8812 - val_srl_output_loss: 0.3711\n",
|
||||
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.6757 - ner_output_accuracy: 0.9486 - ner_output_loss: 0.2281 - srl_output_accuracy: 0.8582 - srl_output_loss: 0.4476 - val_loss: 0.5690 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2040 - val_srl_output_accuracy: 0.8781 - val_srl_output_loss: 0.3650\n",
|
||||
"Epoch 6/10\n",
|
||||
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 11ms/step - loss: 0.5679 - ner_output_accuracy: 0.9557 - ner_output_loss: 0.1749 - srl_output_accuracy: 0.8783 - srl_output_loss: 0.3930 - val_loss: 0.5471 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.1956 - val_srl_output_accuracy: 0.8831 - val_srl_output_loss: 0.3515\n",
|
||||
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 11ms/step - loss: 0.5864 - ner_output_accuracy: 0.9477 - ner_output_loss: 0.2198 - srl_output_accuracy: 0.8898 - srl_output_loss: 0.3666 - val_loss: 0.5458 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.1961 - val_srl_output_accuracy: 0.8875 - val_srl_output_loss: 0.3497\n",
|
||||
"Epoch 7/10\n",
|
||||
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.5000 - ner_output_accuracy: 0.9587 - ner_output_loss: 0.1634 - srl_output_accuracy: 0.8917 - srl_output_loss: 0.3366 - val_loss: 0.5364 - val_ner_output_accuracy: 0.9513 - val_ner_output_loss: 0.1899 - val_srl_output_accuracy: 0.8850 - val_srl_output_loss: 0.3465\n",
|
||||
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.5877 - ner_output_accuracy: 0.9506 - ner_output_loss: 0.1914 - srl_output_accuracy: 0.8773 - srl_output_loss: 0.3963 - val_loss: 0.5260 - val_ner_output_accuracy: 0.9525 - val_ner_output_loss: 0.1898 - val_srl_output_accuracy: 0.8875 - val_srl_output_loss: 0.3362\n",
|
||||
"Epoch 8/10\n",
|
||||
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.5526 - ner_output_accuracy: 0.9541 - ner_output_loss: 0.1791 - srl_output_accuracy: 0.8840 - srl_output_loss: 0.3735 - val_loss: 0.5054 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.1799 - val_srl_output_accuracy: 0.8963 - val_srl_output_loss: 0.3256\n",
|
||||
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 11ms/step - loss: 0.5046 - ner_output_accuracy: 0.9536 - ner_output_loss: 0.1756 - srl_output_accuracy: 0.8912 - srl_output_loss: 0.3290 - val_loss: 0.5094 - val_ner_output_accuracy: 0.9531 - val_ner_output_loss: 0.1829 - val_srl_output_accuracy: 0.8881 - val_srl_output_loss: 0.3265\n",
|
||||
"Epoch 9/10\n",
|
||||
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.5094 - ner_output_accuracy: 0.9561 - ner_output_loss: 0.1701 - srl_output_accuracy: 0.8915 - srl_output_loss: 0.3393 - val_loss: 0.4881 - val_ner_output_accuracy: 0.9512 - val_ner_output_loss: 0.1707 - val_srl_output_accuracy: 0.9013 - val_srl_output_loss: 0.3174\n",
|
||||
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.4807 - ner_output_accuracy: 0.9539 - ner_output_loss: 0.1704 - srl_output_accuracy: 0.9021 - srl_output_loss: 0.3103 - val_loss: 0.4876 - val_ner_output_accuracy: 0.9531 - val_ner_output_loss: 0.1719 - val_srl_output_accuracy: 0.9025 - val_srl_output_loss: 0.3156\n",
|
||||
"Epoch 10/10\n",
|
||||
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.4633 - ner_output_accuracy: 0.9524 - ner_output_loss: 0.1675 - srl_output_accuracy: 0.9092 - srl_output_loss: 0.2959 - val_loss: 0.4804 - val_ner_output_accuracy: 0.9531 - val_ner_output_loss: 0.1597 - val_srl_output_accuracy: 0.9050 - val_srl_output_loss: 0.3206\n"
|
||||
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.4134 - ner_output_accuracy: 0.9634 - ner_output_loss: 0.1350 - srl_output_accuracy: 0.9245 - srl_output_loss: 0.2784 - val_loss: 0.4587 - val_ner_output_accuracy: 0.9550 - val_ner_output_loss: 0.1598 - val_srl_output_accuracy: 0.9087 - val_srl_output_loss: 0.2989\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -317,7 +324,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 124,
|
||||
"execution_count": 19,
|
||||
"id": "3a55990b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -325,23 +332,32 @@
|
|||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'loss': 0.48035523295402527, 'compile_metrics': 0.15973526239395142, 'ner_output_loss': 0.32061997056007385, 'srl_output_loss': 0.953125}\n",
|
||||
"{'loss': 0.45865434408187866, 'compile_metrics': 0.159775510430336, 'ner_output_loss': 0.29887881875038147, 'srl_output_loss': 0.9550000429153442}\n",
|
||||
"{0: 'B-DATE', 1: 'B-ETH', 2: 'B-EVENT', 3: 'B-LOC', 4: 'B-MIN', 5: 'B-MISC', 6: 'B-ORG', 7: 'B-PER', 8: 'B-QUANT', 9: 'B-REL', 10: 'B-RES', 11: 'B-TERM', 12: 'B-TIME', 13: 'I-DATE', 14: 'I-ETH', 15: 'I-EVENT', 16: 'I-LOC', 17: 'I-MISC', 18: 'I-ORG', 19: 'I-PER', 20: 'I-QUANT', 21: 'I-RES', 22: 'I-TERM', 23: 'I-TIME', 24: 'O'}\n",
|
||||
"\n",
|
||||
"📊 [NER] Classification Report (test set):\n",
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
" DATE 0.25 0.12 0.17 8\n",
|
||||
" DATE 0.33 0.12 0.18 8\n",
|
||||
" EVENT 0.00 0.00 0.00 1\n",
|
||||
" LOC 0.50 0.04 0.07 28\n",
|
||||
" LOC 1.00 0.04 0.07 28\n",
|
||||
" ORG 0.00 0.00 0.00 4\n",
|
||||
" PER 0.00 0.00 0.00 2\n",
|
||||
" TIME 0.20 0.10 0.13 10\n",
|
||||
" TIME 0.50 0.30 0.37 10\n",
|
||||
"\n",
|
||||
" micro avg 0.27 0.06 0.09 53\n",
|
||||
" macro avg 0.16 0.04 0.06 53\n",
|
||||
"weighted avg 0.34 0.06 0.09 53\n",
|
||||
" micro avg 0.50 0.09 0.16 53\n",
|
||||
" macro avg 0.31 0.08 0.10 53\n",
|
||||
"weighted avg 0.67 0.09 0.13 53\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
|
||||
" _warn_prf(average, modifier, msg_start, len(result))\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
|
@ -375,7 +391,7 @@
|
|||
"y_pred_ner, y_pred_srl = model.predict(X_test, verbose=0)\n",
|
||||
"\n",
|
||||
"true_ner, pred_ner = decode(y_pred_ner, y_ner_test, idx2tag_ner)\n",
|
||||
"\n",
|
||||
"print(idx2tag_ner)\n",
|
||||
"print(\"\\n📊 [NER] Classification Report (test set):\")\n",
|
||||
"print(classification_report(true_ner, pred_ner, digits=2))\n",
|
||||
"\n",
|
||||
|
@ -400,7 +416,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 125,
|
||||
"execution_count": 20,
|
||||
"id": "547d1533",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -408,28 +424,53 @@
|
|||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{0: 'ARG0', 1: 'ARG1', 2: 'ARG2', 3: 'ARG3', 4: 'ARGM-BNF', 5: 'ARGM-CAU', 6: 'ARGM-COM', 7: 'ARGM-FRQ', 8: 'ARGM-LOC', 9: 'ARGM-MNR', 10: 'ARGM-MOD', 11: 'ARGM-NEG', 12: 'ARGM-PNC', 13: 'ARGM-PRD', 14: 'ARGM-PRP', 15: 'ARGM-SRC', 16: 'ARGM-TMP', 17: 'O', 18: 'R-ARG1', 19: 'V'}\n",
|
||||
"{0: 'ARG0', 1: 'ARG1', 2: 'ARG2', 3: 'ARG3', 4: 'ARGM-BNF', 5: 'ARGM-CAU', 6: 'ARGM-COM', 7: 'ARGM-FRQ', 8: 'ARGM-LOC', 9: 'ARGM-MNR', 10: 'ARGM-MOD', 11: 'ARGM-NEG', 12: 'ARGM-PRP', 13: 'ARGM-SRC', 14: 'ARGM-TMP', 15: 'O', 16: 'R-ARG1', 17: 'V'}\n",
|
||||
"\n",
|
||||
"📊 [SRL] Classification Report (test set):\n",
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
" CAU 0.00 0.00 0.00 1\n",
|
||||
" FRQ 0.00 0.00 0.00 1\n",
|
||||
" LOC 0.36 0.40 0.38 10\n",
|
||||
" LOC 0.31 0.50 0.38 10\n",
|
||||
" MNR 0.00 0.00 0.00 4\n",
|
||||
" PNC 0.00 0.00 0.00 1\n",
|
||||
" PRP 0.00 0.00 0.00 1\n",
|
||||
" RG0 0.31 0.21 0.25 19\n",
|
||||
" RG1 0.21 0.15 0.17 46\n",
|
||||
" RG2 0.19 0.40 0.26 10\n",
|
||||
" TMP 0.41 0.53 0.46 17\n",
|
||||
" _ 0.10 0.06 0.07 33\n",
|
||||
" RG0 0.50 0.11 0.17 19\n",
|
||||
" RG1 0.18 0.20 0.19 46\n",
|
||||
" RG2 0.27 0.40 0.32 10\n",
|
||||
" TMP 0.50 0.59 0.54 17\n",
|
||||
" _ 0.12 0.03 0.05 33\n",
|
||||
"\n",
|
||||
" micro avg 0.25 0.21 0.23 143\n",
|
||||
" macro avg 0.14 0.16 0.15 143\n",
|
||||
"weighted avg 0.22 0.21 0.21 143\n",
|
||||
" micro avg 0.28 0.22 0.24 142\n",
|
||||
" macro avg 0.19 0.18 0.17 142\n",
|
||||
"weighted avg 0.26 0.22 0.21 142\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARG1 seems not to be NE tag.\n",
|
||||
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
||||
"/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: V seems not to be NE tag.\n",
|
||||
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
||||
"/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARGM-TMP seems not to be NE tag.\n",
|
||||
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
||||
"/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARG0 seems not to be NE tag.\n",
|
||||
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
||||
"/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARGM-LOC seems not to be NE tag.\n",
|
||||
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
||||
"/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARGM-MNR seems not to be NE tag.\n",
|
||||
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
||||
"/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARGM-FRQ seems not to be NE tag.\n",
|
||||
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
||||
"/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARG2 seems not to be NE tag.\n",
|
||||
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
||||
"/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARGM-PRP seems not to be NE tag.\n",
|
||||
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
||||
"/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARGM-CAU seems not to be NE tag.\n",
|
||||
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
|
|
Binary file not shown.
|
@ -0,0 +1,270 @@
|
|||
"""
|
||||
qg_pipeline_static.py
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
Question Generation Encoder‑Decoder LSTM
|
||||
dengan fitur simbolik NER & SRL (pipeline statis).
|
||||
|
||||
Datasets:
|
||||
– train.jsonl / valid.jsonl (lihat format di fungsi `load_jsonl`)
|
||||
"""
|
||||
|
||||
import json, random, numpy as np, tensorflow as tf
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# 1. UTILITAS DASAR
|
||||
# ------------------------------------------------------------------------------
|
||||
|
||||
# Single seed applied to Python's `random`, NumPy and TensorFlow below.
SEED = 42
|
||||
random.seed(SEED)
|
||||
np.random.seed(SEED)
|
||||
tf.random.set_seed(SEED)
|
||||
|
||||
# Dataset path, relative to the working directory.
# NOTE(review): the extension is .json while the loader in this file reads one JSON value per line — confirm the file really is JSON Lines.
TRAIN_FILE = "../dataset/dataset_qc.json"
|
||||
# presumably the fraction of records held out for validation; its use is outside this view — TODO confirm
VALID_RATIO = 0.10
|
||||
# Fixed length that context tokens, NER tags and SRL tags are padded/truncated to.
MAX_CTX_LEN = 50
|
||||
# Fixed length that decoder input/target question sequences are padded/truncated to.
MAX_Q_LEN = 30
|
||||
# presumably the word-embedding width; the model definition is outside this view — TODO confirm
WORD_EMB_DIM = 128
|
||||
# presumably the training batch size and epoch count; the training call is outside this view — TODO confirm
BATCH = 32
|
||||
EPOCHS = 15
|
||||
|
||||
# Reserved word tokens: the context vocabulary uses only the first two, the question vocabulary uses all four.
SPECIALS_WORD = ("<pad>", "<unk>", "<bos>", "<eos>")
|
||||
# Reserved tag tokens for the NER and SRL vocabularies (note: no "<unk>" entry).
SPECIALS_TAG = ("<pad>",)
|
||||
|
||||
|
||||
def load_jsonl(path):
    """Load a JSON Lines file: one JSON value per non-blank line.

    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped instead of being handed to the JSON decoder.

    Args:
        path: Path of the file to read; it is opened as UTF-8 text.

    Returns:
        list: The decoded value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message is prefixed with the file path and 1-based line number.
    """
    records = []
    with open(path, encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Empty separator / trailing line: nothing to decode.
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Same exception type as before, plus where the bad line is.
                raise json.JSONDecodeError(
                    f"{path}:{lineno}: {exc.msg}", exc.doc, exc.pos
                ) from exc
    return records
|
||||
|
||||
|
||||
def build_vocab(list_of_seq, specials):
    """Build token->id and id->token lookups from a collection of sequences.

    The special tokens occupy the first ids, in the order given. All other
    tokens follow in descending frequency order. A token that is already
    reserved as a special (e.g. "<bos>" / "<eos>" inserted into question
    sequences) is NOT appended a second time, so every token has exactly one
    id and the specials keep their reserved positions.

    Args:
        list_of_seq: Iterable of token sequences (tokens must be hashable).
        specials: Iterable of reserved tokens to place at the front.

    Returns:
        tuple: ``(stoi, itos)`` where ``stoi`` maps token -> id and ``itos``
        is the list of tokens indexed by id; ``len(stoi) == len(itos)``.
    """
    counter = Counter(tok for seq in list_of_seq for tok in seq)
    # dict.fromkeys keeps first-seen order while dropping repeated specials.
    itos = list(dict.fromkeys(specials))
    reserved = set(itos)
    itos.extend(tok for tok, _ in counter.most_common() if tok not in reserved)
    stoi = {tok: i for i, tok in enumerate(itos)}
    return stoi, itos
|
||||
|
||||
|
||||
def encode(seq, tbl, max_len):
    """Map a token sequence to a fixed-length list of ids.

    Tokens missing from ``tbl`` fall back to the "<unk>" id when the table
    has one. Tables without "<unk>" (the tag vocabularies are built with only
    "<pad>") are supported: the "<unk>" entry is looked up lazily, so encoding
    only fails if an actually unknown token is encountered.

    Args:
        seq: Iterable of tokens.
        tbl: Mapping token -> id; must contain "<pad>".
        max_len: Length of the returned list; shorter inputs are right-padded
            with the "<pad>" id, longer inputs are truncated.

    Returns:
        list: ``max_len`` ids.

    Raises:
        KeyError: If ``tbl`` has no "<pad>" entry, or a token is unknown and
            ``tbl`` has no "<unk>" entry.
    """
    pad_id = tbl["<pad>"]
    unk_id = tbl.get("<unk>")  # None when the table defines no unknown token
    ids = []
    for tok in seq:
        idx = tbl.get(tok, unk_id)
        if idx is None:
            raise KeyError(
                f"token {tok!r} is not in the vocabulary and the table has no '<unk>' entry"
            )
        ids.append(idx)
    return (ids + [pad_id] * max_len)[:max_len]
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# 2. DATA PREP
|
||||
# ------------------------------------------------------------------------------
|
||||
|
||||
|
||||
# def prepare_training_data(file_path):
|
||||
# """Load → build vocab → encode ke numpy array."""
|
||||
# recs = load_jsonl(file_path)
|
||||
|
||||
# ctx, ner, srl, ques = [], [], [], []
|
||||
# for r in recs:
|
||||
# ctx.append(r["context_tokens"])
|
||||
# ner.append(r["ner_tags"])
|
||||
# srl.append(r["srl_tags"])
|
||||
# # tambahkan <bos>, <eos>
|
||||
# ques.append(["<bos>"] + r["question_tokens"] + ["<eos>"])
|
||||
|
||||
# # 2.1 vocab
|
||||
# w2i_ctx, i2w_ctx = build_vocab(ctx, SPECIALS_WORD[:2]) # <pad>,<unk>
|
||||
# w2i_q, i2w_q = build_vocab(ques, SPECIALS_WORD) # 4 specials
|
||||
# t2i_ner, _ = build_vocab(ner, SPECIALS_TAG)
|
||||
# t2i_srl, _ = build_vocab(srl, SPECIALS_TAG)
|
||||
|
||||
# # 2.2 encode & pad
|
||||
# X_tok = np.array([encode(s, w2i_ctx, MAX_CTX_LEN) for s in ctx])
|
||||
# X_ner = np.array([encode(s, t2i_ner, MAX_CTX_LEN) for s in ner])
|
||||
# X_srl = np.array([encode(s, t2i_srl, MAX_CTX_LEN) for s in srl])
|
||||
|
||||
# Y_in = np.array([encode(s[:-1], w2i_q, MAX_Q_LEN) for s in ques]) # bos..last-1
|
||||
# Y_out = np.array([encode(s[1:], w2i_q, MAX_Q_LEN) for s in ques]) # 2..eos
|
||||
|
||||
# return (
|
||||
# X_tok,
|
||||
# X_ner,
|
||||
# X_srl,
|
||||
# Y_in,
|
||||
# Y_out,
|
||||
# w2i_ctx,
|
||||
# i2w_ctx,
|
||||
# w2i_q,
|
||||
# i2w_q,
|
||||
# t2i_ner,
|
||||
# t2i_srl,
|
||||
# )
|
||||
|
||||
|
||||
# --- ganti fungsi lama ---
|
||||
def _find_answer_span(tokens, ans_toks):
    """Return the inclusive ``(start, end)`` of the first match of ``ans_toks`` in ``tokens``.

    Returns ``None`` when ``ans_toks`` is empty or does not occur as a
    contiguous sub-list of ``tokens``.
    """
    width = len(ans_toks)
    if width == 0:
        # An empty answer would otherwise "match" at index 0 and give (0, -1).
        return None
    for start in range(len(tokens) - width + 1):
        if tokens[start : start + width] == ans_toks:
            return start, start + width - 1
    return None


def prepare_training_data(file_path):
    """Load the records in ``file_path``, build vocabularies and encode everything.

    Each record is read through its ``"tokens"``, ``"ner"``, ``"srl"``,
    ``"question"`` and ``"answer"`` keys. The question and the answer are
    tokenised with ``str.split()``; the question is wrapped in ``<bos>`` /
    ``<eos>``.

    Returns a 12-tuple:
        X_tok, X_ner, X_srl : context tokens / NER tags / SRL tags encoded to
            ``MAX_CTX_LEN`` ids per record.
        Y_in, Y_out : decoder input (question without its last token) and
            decoder target (question without its first token), encoded to
            ``MAX_Q_LEN`` ids per record.
        spans : array of inclusive ``(start, end)`` answer token positions,
            one row per record (kept for a possible copy mechanism).
        w2i_ctx, i2w_ctx, w2i_q, i2w_q, t2i_ner, t2i_srl : the vocabularies
            produced by ``build_vocab``.

    Raises:
        ValueError: if a record's answer is empty or is not a contiguous
            sub-sequence of its tokens.
    """
    recs = load_jsonl(file_path)

    ctx, ner, srl, ques, span_st, span_ed = [], [], [], [], [], []
    for r in recs:
        tokens = r["tokens"]
        ctx.append(tokens)  # context tokens
        ner.append(r["ner"])
        srl.append(r["srl"])

        # Locate the answer span automatically from the token list.
        span = _find_answer_span(tokens, r["answer"].split())
        if span is None:
            raise ValueError(
                f"Jawaban '{r['answer']}' tidak cocok dengan tokens {tokens}"
            )
        start, end = span
        span_st.append(start)
        span_ed.append(end)

        # Question tokens: simple whitespace tokenisation.
        ques.append(["<bos>"] + r["question"].split() + ["<eos>"])

    # ---------- build vocabularies ----------
    w2i_ctx, i2w_ctx = build_vocab(ctx, SPECIALS_WORD[:2])
    w2i_q, i2w_q = build_vocab(ques, SPECIALS_WORD)
    t2i_ner, _ = build_vocab(ner, SPECIALS_TAG)
    t2i_srl, _ = build_vocab(srl, SPECIALS_TAG)

    # ---------- encode ----------
    X_tok = np.array([encode(s, w2i_ctx, MAX_CTX_LEN) for s in ctx])
    X_ner = np.array([encode(s, t2i_ner, MAX_CTX_LEN) for s in ner])
    X_srl = np.array([encode(s, t2i_srl, MAX_CTX_LEN) for s in srl])

    # Teacher forcing: the input drops the final token, the target drops the first.
    Y_in = np.array([encode(s[:-1], w2i_q, MAX_Q_LEN) for s in ques])
    Y_out = np.array([encode(s[1:], w2i_q, MAX_Q_LEN) for s in ques])

    # Keep the spans in case a copy mechanism is added later.
    spans = np.array(list(zip(span_st, span_ed)))  # (N, 2)

    return (
        X_tok,
        X_ner,
        X_srl,
        Y_in,
        Y_out,
        spans,
        w2i_ctx,
        i2w_ctx,
        w2i_q,
        i2w_q,
        t2i_ner,
        t2i_srl,
    )
|
||||
|
||||
|
||||
print("> Loading dataset …")

# prepare_training_data() returns 12 values; `spans` sits between Y_out and
# w2i_ctx and must be unpacked too, otherwise this raises
# "ValueError: too many values to unpack".
(
    X_tok,
    X_ner,
    X_srl,
    Y_in,
    Y_out,
    spans,
    w2i_ctx,
    i2w_ctx,
    w2i_q,
    i2w_q,
    t2i_ner,
    t2i_srl,
) = prepare_training_data(TRAIN_FILE)

# Split row indices (not the arrays) so every array is sliced consistently.
train_idx, valid_idx = train_test_split(
    np.arange(len(X_tok)), test_size=VALID_RATIO, random_state=SEED
)
|
||||
|
||||
|
||||
def pick(arr, idx):
    """Return ``arr`` indexed by ``idx`` (``arr[idx]``)."""
    subset = arr[idx]
    return subset
|
||||
|
||||
|
||||
# Slice every model array with the same index sets. The order
# (tokens, NER, SRL, decoder input, decoder target) is the order in which
# generator() unpacks `data`.
train_data = [
    pick(array, train_idx)
    for array in (X_tok, X_ner, X_srl, Y_in, Y_out)
]
valid_data = [
    pick(array, valid_idx)
    for array in (X_tok, X_ner, X_srl, Y_in, Y_out)
]
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# 3. MODEL
|
||||
# ------------------------------------------------------------------------------
|
||||
|
||||
|
||||
def build_model(vocab_ctx, vocab_q, n_ner, n_srl):
    """Build and compile the encoder-decoder question-generation model.

    Args:
        vocab_ctx: input dimension of the context-token embedding.
        vocab_q: input dimension of the decoder embedding and size of the
            output softmax.
        n_ner: input dimension of the NER-tag embedding.
        n_srl: input dimension of the SRL-tag embedding.

    Returns:
        A compiled ``tf.keras.Model`` taking ``[tok, ner, srl, dec]`` inputs
        and producing a per-timestep softmax over ``vocab_q`` entries.
    """
    layers = tf.keras.layers

    # --- inputs: three context-aligned sequences plus the decoder input ---
    tok_in = layers.Input((MAX_CTX_LEN,), name="tok")
    ner_in = layers.Input((MAX_CTX_LEN,), name="ner")
    srl_in = layers.Input((MAX_CTX_LEN,), name="srl")
    dec_in = layers.Input((MAX_Q_LEN,), name="dec")

    # --- encoder: concatenate word/NER/SRL embeddings, run a BiLSTM ---
    tok_emb = layers.Embedding(vocab_ctx, WORD_EMB_DIM, mask_zero=True)(tok_in)
    ner_emb = layers.Embedding(n_ner, 32, mask_zero=True)(ner_in)
    srl_emb = layers.Embedding(n_srl, 32, mask_zero=True)(srl_in)
    enc_in = layers.Concatenate()([tok_emb, ner_emb, srl_emb])

    encoder = layers.Bidirectional(
        layers.LSTM(WORD_EMB_DIM, return_sequences=True, return_state=True)
    )
    enc_out, h_fwd, c_fwd, h_bwd, c_bwd = encoder(enc_in)

    # Join both directions so the states match the decoder's
    # 2 * WORD_EMB_DIM units.
    state_h = layers.Concatenate()([h_fwd, h_bwd])
    state_c = layers.Concatenate()([c_fwd, c_bwd])

    # --- decoder: LSTM initialised with the encoder's final states ---
    dec_emb = layers.Embedding(vocab_q, WORD_EMB_DIM, mask_zero=True)(dec_in)
    dec_lstm = layers.LSTM(
        WORD_EMB_DIM * 2, return_sequences=True, return_state=True
    )
    dec_out, _, _ = dec_lstm(dec_emb, initial_state=[state_h, state_c])

    # --- dot-product attention of decoder steps over encoder steps ---
    score = layers.Dot(axes=[2, 2])([dec_out, enc_out])
    attn_weights = layers.Activation("softmax")(score)
    context_vec = layers.Dot(axes=[2, 1])([attn_weights, enc_out])

    # --- output: decoder state + attention context -> vocabulary softmax ---
    dec_cat = layers.Concatenate()([dec_out, context_vec])
    outputs = layers.TimeDistributed(
        layers.Dense(vocab_q, activation="softmax")
    )(dec_cat)

    mdl = tf.keras.Model([tok_in, ner_in, srl_in, dec_in], outputs)
    mdl.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return mdl
|
||||
|
||||
|
||||
print("> Building model …")
# Each embedding is sized from the length of its vocabulary table.
model = build_model(
    vocab_ctx=len(w2i_ctx),
    vocab_q=len(w2i_q),
    n_ner=len(t2i_ner),
    n_srl=len(t2i_srl),
)
model.summary(line_length=120)
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# 4. DATA GENERATOR
|
||||
# ------------------------------------------------------------------------------
|
||||
|
||||
|
||||
def generator(data, batch=BATCH):
    """Yield shuffled ``(inputs, targets)`` mini-batches forever.

    Args:
        data: sequence of five arrays, unpacked as context tokens, NER tags,
            SRL tags, decoder input and decoder target.
        batch: maximum number of rows per yielded batch.

    Yields:
        ``([tok, ner, srl, dec_in], dec_out[..., None])``. A new random
        permutation of the rows is drawn at the start of every pass, and the
        last batch of a pass may be smaller than ``batch``.
    """
    tok_arr, ner_arr, srl_arr, dec_in_arr, dec_out_arr = data
    total = len(tok_arr)
    while True:
        order = np.random.permutation(total)
        for offset in range(0, total, batch):
            rows = order[offset : offset + batch]
            inputs = [tok_arr[rows], ner_arr[rows], srl_arr[rows], dec_in_arr[rows]]
            # Add a trailing axis to the target ids.
            targets = dec_out_arr[rows][..., np.newaxis]
            yield inputs, targets
|
||||
|
||||
|
||||
# Ceil division: generator() also yields the partial final batch of each pass,
# so one epoch must cover ceil(n / BATCH) steps. Floor division gave 0 steps
# whenever a split held fewer than BATCH rows (likely for the validation split
# of a small dataset). max(1, ...) guards against a zero step count.
steps_train = max(1, -(-len(train_idx) // BATCH))
steps_valid = max(1, -(-len(valid_idx) // BATCH))

# ------------------------------------------------------------------------------
# 5. TRAIN
# ------------------------------------------------------------------------------

print("> Training …")
_ = model.fit(
    generator(train_data),
    steps_per_epoch=steps_train,
    validation_data=generator(valid_data),
    validation_steps=steps_valid,
    epochs=EPOCHS,
)

model.save("qg_lstm_static.h5")
print("✓ Model saved to qg_lstm_static.h5")
|
|
@ -0,0 +1,58 @@
|
|||
MAX_CTX_LEN = 50
|
||||
|
||||
|
||||
# -- dummy placeholder untuk model NER/SRL Anda -------------------------------
|
||||
def predict_ner(tokens):  # replace with the real implementation
    """Placeholder NER tagger: return one ``"O"`` tag per token."""
    return ["O" for _ in range(len(tokens))]
|
||||
|
||||
|
||||
def predict_srl(tokens):  # replace with the real implementation
    """Placeholder SRL tagger: return one ``"O"`` tag per token."""
    return ["O" for _ in range(len(tokens))]
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
|
||||
|
||||
def greedy_decode(context_tokens):
    """Generate one question for ``context_tokens`` using greedy decoding.

    The context is tagged with ``predict_ner`` / ``predict_srl``, encoded to
    ``MAX_CTX_LEN`` ids, and the decoder is then run one step at a time,
    always taking the arg-max token, until ``<eos>`` is predicted or
    ``MAX_Q_LEN`` decoder positions are filled.

    Args:
        context_tokens: list of context token strings.

    Returns:
        The generated question tokens joined by single spaces (without
        ``<bos>`` / ``<eos>``).
    """
    # 6.1 Tagging
    ner_tags = predict_ner(context_tokens)
    srl_tags = predict_srl(context_tokens)

    # 6.2 Encode. encode() returns a plain Python list, and indexing a list
    # with None raises TypeError, so convert to an array before adding the
    # leading batch axis.
    ctx_ids = np.asarray(encode(context_tokens, w2i_ctx, MAX_CTX_LEN))[None]
    ner_ids = np.asarray(encode(ner_tags, t2i_ner, MAX_CTX_LEN))[None]
    srl_ids = np.asarray(encode(srl_tags, t2i_srl, MAX_CTX_LEN))[None]

    dec_seq = [w2i_q["<bos>"]]
    for _ in range(MAX_Q_LEN - 1):
        # Right-pad the partial question to the fixed decoder length.
        dec_pad = dec_seq + [w2i_q["<pad>"]] * (MAX_Q_LEN - len(dec_seq))
        pred = model.predict(
            [ctx_ids, ner_ids, srl_ids, np.array([dec_pad])], verbose=0
        )
        # The distribution at the last filled position predicts the next token.
        next_id = int(pred[0, len(dec_seq) - 1].argmax())
        if i2w_q[next_id] == "<eos>":
            break
        dec_seq.append(next_id)

    # Drop the leading <bos> before turning ids back into words.
    tokens_q = [i2w_q[t] for t in dec_seq[1:]]
    return " ".join(tokens_q)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Demo: generate a question for one pre-tokenised Indonesian sentence.
    sample = [
        "Keberagaman", "potensi", "sumber", "daya", "alam", "Indonesia",
        "tidak", "lepas", "dari", "proses", "geografis", ".",
    ]
    context_text = " ".join(sample)
    print("\n[CTX]", context_text)
    question_text = greedy_decode(sample)
    print("[Q]  ", question_text)
|
|
@ -1,8 +1,136 @@
|
|||
[
|
||||
{
|
||||
"tokens": ["Barack", "Obama", "lahir", "di", "Hawaii", "."],
|
||||
"ner": ["B-PER", "I-PER", "O", "O", "B-LOC", "O"],
|
||||
"srl": ["B-ARG0", "I-ARG0", "B-V", "B-ARGM-LOC", "I-ARGM-LOC", "O"],
|
||||
"question": "___ lahir di Hawaii.",
|
||||
"answer": "Barack Obama",
|
||||
"tokens": [
|
||||
"R.",
|
||||
"Soewardi",
|
||||
"Soerjaningrat",
|
||||
"adalah",
|
||||
"putra",
|
||||
"GPH",
|
||||
"Soerjaningrat",
|
||||
"dan",
|
||||
"cucu",
|
||||
"Pakualam",
|
||||
"III",
|
||||
"."
|
||||
],
|
||||
"ner": [
|
||||
"B-PER",
|
||||
"I-PER",
|
||||
"I-PER",
|
||||
"O",
|
||||
"O",
|
||||
"B-PER",
|
||||
"I-PER",
|
||||
"O",
|
||||
"O",
|
||||
"B-PER",
|
||||
"I-PER",
|
||||
"O"
|
||||
],
|
||||
"srl": [
|
||||
"ARG0",
|
||||
"ARG0",
|
||||
"ARG0",
|
||||
"V",
|
||||
"ARG1",
|
||||
"ARG1",
|
||||
"ARG1",
|
||||
"ARG1",
|
||||
"ARG1",
|
||||
"ARG1",
|
||||
"ARG1",
|
||||
"O"
|
||||
],
|
||||
"question": "___ adalah putra GPH Soerjaningrat dan cucu Pakualam III.",
|
||||
"answer": "R. Soewardi Soerjaningrat",
|
||||
"type": "isian"
|
||||
},
|
||||
{
|
||||
"tokens": ["Ia", "lantas", "diterima", "belajar", "di", "STOVIA", "."],
|
||||
"ner": ["O", "O", "O", "O", "O", "B-ORG", "O"],
|
||||
"srl": ["ARG0", "O", "V", "ARG1", "O", "ARGM-LOC", "O"],
|
||||
"question": "Ia diterima belajar di ___.",
|
||||
"answer": "STOVIA",
|
||||
"type": "isian"
|
||||
},
|
||||
{
|
||||
"tokens": [
|
||||
"Ia",
|
||||
"bersama",
|
||||
"Douwes",
|
||||
"Dekker",
|
||||
"dan",
|
||||
"dr.",
|
||||
"Cipto",
|
||||
"Mangoenkoesoemo",
|
||||
"lantas",
|
||||
"mendirikan",
|
||||
"Indische",
|
||||
"Partij",
|
||||
"pada",
|
||||
"25",
|
||||
"Desember",
|
||||
"1912",
|
||||
"."
|
||||
],
|
||||
"ner": [
|
||||
"O",
|
||||
"O",
|
||||
"B-PER",
|
||||
"I-PER",
|
||||
"O",
|
||||
"B-PER",
|
||||
"I-PER",
|
||||
"I-PER",
|
||||
"O",
|
||||
"O",
|
||||
"B-ORG",
|
||||
"I-ORG",
|
||||
"O",
|
||||
"B-DATE",
|
||||
"I-DATE",
|
||||
"I-DATE",
|
||||
"O"
|
||||
],
|
||||
"srl": [
|
||||
"ARG0",
|
||||
"ARG0",
|
||||
"ARG0",
|
||||
"ARG0",
|
||||
"ARG0",
|
||||
"ARG0",
|
||||
"ARG0",
|
||||
"ARG0",
|
||||
"O",
|
||||
"V",
|
||||
"ARG1",
|
||||
"ARG1",
|
||||
"O",
|
||||
"ARGM-TMP",
|
||||
"ARGM-TMP",
|
||||
"ARGM-TMP",
|
||||
"O"
|
||||
],
|
||||
"question": "Ia bersama Douwes Dekker dan dr. Cipto Mangoenkoesoemo lantas mendirikan ___ pada 25 Desember 1912.",
|
||||
"answer": "Indische Partij",
|
||||
"type": "isian"
|
||||
},
|
||||
{
|
||||
"tokens": [
|
||||
"Indische",
|
||||
"Partij",
|
||||
"didirikan",
|
||||
"pada",
|
||||
"25",
|
||||
"Desember",
|
||||
"1912",
|
||||
"."
|
||||
],
|
||||
"ner": ["B-ORG", "I-ORG", "O", "O", "B-DATE", "I-DATE", "I-DATE", "O"],
|
||||
"srl": ["ARG1", "ARG1", "V", "O", "ARGM-TMP", "ARGM-TMP", "ARGM-TMP", "O"],
|
||||
"question": "Indische Partij didirikan pada tanggal ___.",
|
||||
"answer": "25 Desember 1912",
|
||||
"type": "isian"
|
||||
}
|
||||
]
|
||||
|
|
|
@ -2010,3 +2010,43 @@ nasi O ARG1
|
|||
di O O
|
||||
dapur B-LOC ARGM-LOC
|
||||
. O O
|
||||
|
||||
R. B-PER ARG0
|
||||
Soewardi I-PER ARG0
|
||||
Soerjaningrat I-PER ARG0
|
||||
adalah O V
|
||||
putra O ARG1
|
||||
GPH B-PER ARG1
|
||||
Soerjaningrat I-PER ARG1
|
||||
dan O ARG1
|
||||
cucu O ARG1
|
||||
Pakualam B-PER ARG1
|
||||
III I-PER ARG1
|
||||
. O O
|
||||
|
||||
Ia O ARG0
|
||||
bersama O ARG0
|
||||
Douwes B-PER ARG0
|
||||
Dekker I-PER ARG0
|
||||
dan O ARG0
|
||||
dr. B-PER ARG0
|
||||
Cipto I-PER ARG0
|
||||
Mangoenkoesoemo I-PER ARG0
|
||||
lantas O O
|
||||
mendirikan O V
|
||||
Indische B-ORG ARG1
|
||||
Partij I-ORG ARG1
|
||||
pada O O
|
||||
25 B-DATE ARGM-TMP
|
||||
Desember I-DATE ARGM-TMP
|
||||
1912 I-DATE ARGM-TMP
|
||||
. O O
|
||||
|
||||
Indische B-ORG ARG1
|
||||
Partij I-ORG ARG1
|
||||
didirikan O V
|
||||
pada O O
|
||||
25 B-DATE ARGM-TMP
|
||||
Desember I-DATE ARGM-TMP
|
||||
1912 I-DATE ARGM-TMP
|
||||
. O O
|
Can't render this file because it has a wrong number of fields in line 2025.
|
Loading…
Reference in New Issue