feat: add QC model and dataset

This commit is contained in:
akhdanre 2025-04-22 22:53:18 +07:00
parent 3a04f94fb3
commit 1c270d4e75
7 changed files with 602 additions and 65 deletions

View File

@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 118,
"execution_count": 13,
"id": "fb106e20",
"metadata": {},
"outputs": [],
@ -19,7 +19,7 @@
},
{
"cell_type": "code",
"execution_count": 119,
"execution_count": 14,
"id": "00347a5f",
"metadata": {},
"outputs": [
@ -54,7 +54,6 @@
" for line in f:\n",
" line = line.strip()\n",
" if not line:\n",
" # Jika baris kosong → akhir kalimat\n",
" if tokens:\n",
" data.append({\n",
" \"tokens\": tokens,\n",
@ -82,7 +81,15 @@
},
{
"cell_type": "code",
"execution_count": 120,
"execution_count": null,
"id": "3793950a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 15,
"id": "ac8eb374",
"metadata": {},
"outputs": [],
@ -103,7 +110,7 @@
},
{
"cell_type": "code",
"execution_count": 121,
"execution_count": 16,
"id": "80356f1f",
"metadata": {},
"outputs": [],
@ -130,7 +137,7 @@
},
{
"cell_type": "code",
"execution_count": 122,
"execution_count": 17,
"id": "fe219c96",
"metadata": {},
"outputs": [],
@ -138,25 +145,25 @@
"X_train, X_test, y_ner_train, y_ner_test, y_srl_train, y_srl_test = train_test_split(\n",
" X, y_ner, y_srl, \n",
" test_size=0.20, \n",
" random_state=42, # supaya reproducible\n",
" shuffle=True # acak baris\n",
" random_state=42,\n",
" shuffle=True \n",
")"
]
},
{
"cell_type": "code",
"execution_count": 123,
"execution_count": 18,
"id": "7a9636b6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">Model: \"functional_13\"</span>\n",
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">Model: \"functional_1\"</span>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1mModel: \"functional_13\"\u001b[0m\n"
"\u001b[1mModel: \"functional_1\"\u001b[0m\n"
]
},
"metadata": {},
@ -168,19 +175,19 @@
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
"┃<span style=\"font-weight: bold\"> Layer (type) </span>┃<span style=\"font-weight: bold\"> Output Shape </span>┃<span style=\"font-weight: bold\"> Param # </span>┃<span style=\"font-weight: bold\"> Connected to </span>┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
"│ input_layer_13 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ - │\n",
"│ input_layer_1 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ - │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">InputLayer</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ embedding_13 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">64</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">44,544</span> │ input_layer_13[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>… │\n",
"│ embedding_1 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">64</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">44,544</span> │ input_layer_1[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>]… │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ bidirectional_13 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">128</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">66,048</span> │ embedding_13[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">…</span> │\n",
"│ bidirectional_1 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">128</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">66,048</span> │ embedding_1[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Bidirectional</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ ner_output │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">25</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">3,225</span> │ bidirectional_13… │\n",
"│ ner_output │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">25</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">3,225</span> │ bidirectional_1[<span style=\"color: #00af00; text-decoration-color: #00af00\">…</span> │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">TimeDistributed</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ srl_output │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">20</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">2,580</span> │ bidirectional_13… │\n",
"│ srl_output │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">18</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">2,322</span> │ bidirectional_1[<span style=\"color: #00af00; text-decoration-color: #00af00\">…</span> │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">TimeDistributed</span>) │ │ │ │\n",
"└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n",
"</pre>\n"
@ -189,19 +196,19 @@
"┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
"┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mConnected to \u001b[0m\u001b[1m \u001b[0m┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
"│ input_layer_13 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
"│ input_layer_1 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
"│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ embedding_13 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m44,544\u001b[0m │ input_layer_13[\u001b[38;5;34m0\u001b[0m… │\n",
"│ embedding_1 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m44,544\u001b[0m │ input_layer_1[\u001b[38;5;34m0\u001b[0m]… │\n",
"│ (\u001b[38;5;33mEmbedding\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ bidirectional_13 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m66,048\u001b[0m │ embedding_13[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n",
"│ bidirectional_1 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m66,048\u001b[0m │ embedding_1[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ (\u001b[38;5;33mBidirectional\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ ner_output │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m25\u001b[0m) │ \u001b[38;5;34m3,225\u001b[0m │ bidirectional_13… │\n",
"│ ner_output │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m25\u001b[0m) │ \u001b[38;5;34m3,225\u001b[0m │ bidirectional_1[\u001b[38;5;34m\u001b[0m │\n",
"│ (\u001b[38;5;33mTimeDistributed\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ srl_output │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m20\u001b[0m) │ \u001b[38;5;34m2,580\u001b[0m │ bidirectional_13… │\n",
"│ srl_output │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m18\u001b[0m) │ \u001b[38;5;34m2,322\u001b[0m │ bidirectional_1[\u001b[38;5;34m…\u001b[0m │\n",
"│ (\u001b[38;5;33mTimeDistributed\u001b[0m) │ │ │ │\n",
"└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n"
]
@ -212,11 +219,11 @@
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Total params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">116,397</span> (454.68 KB)\n",
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Total params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">116,139</span> (453.67 KB)\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1m Total params: \u001b[0m\u001b[38;5;34m116,397\u001b[0m (454.68 KB)\n"
"\u001b[1m Total params: \u001b[0m\u001b[38;5;34m116,139\u001b[0m (453.67 KB)\n"
]
},
"metadata": {},
@ -225,11 +232,11 @@
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">116,397</span> (454.68 KB)\n",
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">116,139</span> (453.67 KB)\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m116,397\u001b[0m (454.68 KB)\n"
"\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m116,139\u001b[0m (453.67 KB)\n"
]
},
"metadata": {},
@ -253,25 +260,25 @@
"output_type": "stream",
"text": [
"Epoch 1/10\n",
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 19ms/step - loss: 3.3010 - ner_output_accuracy: 0.8807 - ner_output_loss: 1.5617 - srl_output_accuracy: 0.7456 - srl_output_loss: 1.7393 - val_loss: 0.7284 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2466 - val_srl_output_accuracy: 0.8300 - val_srl_output_loss: 0.4818\n",
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 19ms/step - loss: 3.2850 - ner_output_accuracy: 0.8700 - ner_output_loss: 1.6767 - srl_output_accuracy: 0.7518 - srl_output_loss: 1.6083 - val_loss: 0.7275 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2555 - val_srl_output_accuracy: 0.8450 - val_srl_output_loss: 0.4720\n",
"Epoch 2/10\n",
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.7355 - ner_output_accuracy: 0.9569 - ner_output_loss: 0.2279 - srl_output_accuracy: 0.8297 - srl_output_loss: 0.5076 - val_loss: 0.6655 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2323 - val_srl_output_accuracy: 0.8506 - val_srl_output_loss: 0.4332\n",
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.7622 - ner_output_accuracy: 0.9528 - ner_output_loss: 0.2458 - srl_output_accuracy: 0.8296 - srl_output_loss: 0.5163 - val_loss: 0.6534 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2296 - val_srl_output_accuracy: 0.8531 - val_srl_output_loss: 0.4238\n",
"Epoch 3/10\n",
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.7041 - ner_output_accuracy: 0.9522 - ner_output_loss: 0.2219 - srl_output_accuracy: 0.8488 - srl_output_loss: 0.4822 - val_loss: 0.6368 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2232 - val_srl_output_accuracy: 0.8744 - val_srl_output_loss: 0.4135\n",
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.6875 - ner_output_accuracy: 0.9572 - ner_output_loss: 0.2126 - srl_output_accuracy: 0.8496 - srl_output_loss: 0.4750 - val_loss: 0.6327 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2273 - val_srl_output_accuracy: 0.8688 - val_srl_output_loss: 0.4054\n",
"Epoch 4/10\n",
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.6864 - ner_output_accuracy: 0.9520 - ner_output_loss: 0.2184 - srl_output_accuracy: 0.8548 - srl_output_loss: 0.4680 - val_loss: 0.6078 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2193 - val_srl_output_accuracy: 0.8769 - val_srl_output_loss: 0.3885\n",
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 11ms/step - loss: 0.6103 - ner_output_accuracy: 0.9533 - ner_output_loss: 0.2114 - srl_output_accuracy: 0.8772 - srl_output_loss: 0.3988 - val_loss: 0.6009 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2137 - val_srl_output_accuracy: 0.8662 - val_srl_output_loss: 0.3872\n",
"Epoch 5/10\n",
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.6304 - ner_output_accuracy: 0.9545 - ner_output_loss: 0.2009 - srl_output_accuracy: 0.8675 - srl_output_loss: 0.4295 - val_loss: 0.5727 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2015 - val_srl_output_accuracy: 0.8812 - val_srl_output_loss: 0.3711\n",
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.6757 - ner_output_accuracy: 0.9486 - ner_output_loss: 0.2281 - srl_output_accuracy: 0.8582 - srl_output_loss: 0.4476 - val_loss: 0.5690 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2040 - val_srl_output_accuracy: 0.8781 - val_srl_output_loss: 0.3650\n",
"Epoch 6/10\n",
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 11ms/step - loss: 0.5679 - ner_output_accuracy: 0.9557 - ner_output_loss: 0.1749 - srl_output_accuracy: 0.8783 - srl_output_loss: 0.3930 - val_loss: 0.5471 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.1956 - val_srl_output_accuracy: 0.8831 - val_srl_output_loss: 0.3515\n",
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 11ms/step - loss: 0.5864 - ner_output_accuracy: 0.9477 - ner_output_loss: 0.2198 - srl_output_accuracy: 0.8898 - srl_output_loss: 0.3666 - val_loss: 0.5458 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.1961 - val_srl_output_accuracy: 0.8875 - val_srl_output_loss: 0.3497\n",
"Epoch 7/10\n",
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.5000 - ner_output_accuracy: 0.9587 - ner_output_loss: 0.1634 - srl_output_accuracy: 0.8917 - srl_output_loss: 0.3366 - val_loss: 0.5364 - val_ner_output_accuracy: 0.9513 - val_ner_output_loss: 0.1899 - val_srl_output_accuracy: 0.8850 - val_srl_output_loss: 0.3465\n",
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.5877 - ner_output_accuracy: 0.9506 - ner_output_loss: 0.1914 - srl_output_accuracy: 0.8773 - srl_output_loss: 0.3963 - val_loss: 0.5260 - val_ner_output_accuracy: 0.9525 - val_ner_output_loss: 0.1898 - val_srl_output_accuracy: 0.8875 - val_srl_output_loss: 0.3362\n",
"Epoch 8/10\n",
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.5526 - ner_output_accuracy: 0.9541 - ner_output_loss: 0.1791 - srl_output_accuracy: 0.8840 - srl_output_loss: 0.3735 - val_loss: 0.5054 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.1799 - val_srl_output_accuracy: 0.8963 - val_srl_output_loss: 0.3256\n",
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 11ms/step - loss: 0.5046 - ner_output_accuracy: 0.9536 - ner_output_loss: 0.1756 - srl_output_accuracy: 0.8912 - srl_output_loss: 0.3290 - val_loss: 0.5094 - val_ner_output_accuracy: 0.9531 - val_ner_output_loss: 0.1829 - val_srl_output_accuracy: 0.8881 - val_srl_output_loss: 0.3265\n",
"Epoch 9/10\n",
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.5094 - ner_output_accuracy: 0.9561 - ner_output_loss: 0.1701 - srl_output_accuracy: 0.8915 - srl_output_loss: 0.3393 - val_loss: 0.4881 - val_ner_output_accuracy: 0.9512 - val_ner_output_loss: 0.1707 - val_srl_output_accuracy: 0.9013 - val_srl_output_loss: 0.3174\n",
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.4807 - ner_output_accuracy: 0.9539 - ner_output_loss: 0.1704 - srl_output_accuracy: 0.9021 - srl_output_loss: 0.3103 - val_loss: 0.4876 - val_ner_output_accuracy: 0.9531 - val_ner_output_loss: 0.1719 - val_srl_output_accuracy: 0.9025 - val_srl_output_loss: 0.3156\n",
"Epoch 10/10\n",
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.4633 - ner_output_accuracy: 0.9524 - ner_output_loss: 0.1675 - srl_output_accuracy: 0.9092 - srl_output_loss: 0.2959 - val_loss: 0.4804 - val_ner_output_accuracy: 0.9531 - val_ner_output_loss: 0.1597 - val_srl_output_accuracy: 0.9050 - val_srl_output_loss: 0.3206\n"
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.4134 - ner_output_accuracy: 0.9634 - ner_output_loss: 0.1350 - srl_output_accuracy: 0.9245 - srl_output_loss: 0.2784 - val_loss: 0.4587 - val_ner_output_accuracy: 0.9550 - val_ner_output_loss: 0.1598 - val_srl_output_accuracy: 0.9087 - val_srl_output_loss: 0.2989\n"
]
}
],
@ -317,7 +324,7 @@
},
{
"cell_type": "code",
"execution_count": 124,
"execution_count": 19,
"id": "3a55990b",
"metadata": {},
"outputs": [
@ -325,23 +332,32 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{'loss': 0.48035523295402527, 'compile_metrics': 0.15973526239395142, 'ner_output_loss': 0.32061997056007385, 'srl_output_loss': 0.953125}\n",
"{'loss': 0.45865434408187866, 'compile_metrics': 0.159775510430336, 'ner_output_loss': 0.29887881875038147, 'srl_output_loss': 0.9550000429153442}\n",
"{0: 'B-DATE', 1: 'B-ETH', 2: 'B-EVENT', 3: 'B-LOC', 4: 'B-MIN', 5: 'B-MISC', 6: 'B-ORG', 7: 'B-PER', 8: 'B-QUANT', 9: 'B-REL', 10: 'B-RES', 11: 'B-TERM', 12: 'B-TIME', 13: 'I-DATE', 14: 'I-ETH', 15: 'I-EVENT', 16: 'I-LOC', 17: 'I-MISC', 18: 'I-ORG', 19: 'I-PER', 20: 'I-QUANT', 21: 'I-RES', 22: 'I-TERM', 23: 'I-TIME', 24: 'O'}\n",
"\n",
"📊 [NER] Classification Report (test set):\n",
" precision recall f1-score support\n",
"\n",
" DATE 0.25 0.12 0.17 8\n",
" DATE 0.33 0.12 0.18 8\n",
" EVENT 0.00 0.00 0.00 1\n",
" LOC 0.50 0.04 0.07 28\n",
" LOC 1.00 0.04 0.07 28\n",
" ORG 0.00 0.00 0.00 4\n",
" PER 0.00 0.00 0.00 2\n",
" TIME 0.20 0.10 0.13 10\n",
" TIME 0.50 0.30 0.37 10\n",
"\n",
" micro avg 0.27 0.06 0.09 53\n",
" macro avg 0.16 0.04 0.06 53\n",
"weighted avg 0.34 0.06 0.09 53\n",
" micro avg 0.50 0.09 0.16 53\n",
" macro avg 0.31 0.08 0.10 53\n",
"weighted avg 0.67 0.09 0.13 53\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, msg_start, len(result))\n"
]
}
],
"source": [
@ -375,7 +391,7 @@
"y_pred_ner, y_pred_srl = model.predict(X_test, verbose=0)\n",
"\n",
"true_ner, pred_ner = decode(y_pred_ner, y_ner_test, idx2tag_ner)\n",
"\n",
"print(idx2tag_ner)\n",
"print(\"\\n📊 [NER] Classification Report (test set):\")\n",
"print(classification_report(true_ner, pred_ner, digits=2))\n",
"\n",
@ -400,7 +416,7 @@
},
{
"cell_type": "code",
"execution_count": 125,
"execution_count": 20,
"id": "547d1533",
"metadata": {},
"outputs": [
@ -408,28 +424,53 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{0: 'ARG0', 1: 'ARG1', 2: 'ARG2', 3: 'ARG3', 4: 'ARGM-BNF', 5: 'ARGM-CAU', 6: 'ARGM-COM', 7: 'ARGM-FRQ', 8: 'ARGM-LOC', 9: 'ARGM-MNR', 10: 'ARGM-MOD', 11: 'ARGM-NEG', 12: 'ARGM-PNC', 13: 'ARGM-PRD', 14: 'ARGM-PRP', 15: 'ARGM-SRC', 16: 'ARGM-TMP', 17: 'O', 18: 'R-ARG1', 19: 'V'}\n",
"{0: 'ARG0', 1: 'ARG1', 2: 'ARG2', 3: 'ARG3', 4: 'ARGM-BNF', 5: 'ARGM-CAU', 6: 'ARGM-COM', 7: 'ARGM-FRQ', 8: 'ARGM-LOC', 9: 'ARGM-MNR', 10: 'ARGM-MOD', 11: 'ARGM-NEG', 12: 'ARGM-PRP', 13: 'ARGM-SRC', 14: 'ARGM-TMP', 15: 'O', 16: 'R-ARG1', 17: 'V'}\n",
"\n",
"📊 [SRL] Classification Report (test set):\n",
" precision recall f1-score support\n",
"\n",
" CAU 0.00 0.00 0.00 1\n",
" FRQ 0.00 0.00 0.00 1\n",
" LOC 0.36 0.40 0.38 10\n",
" LOC 0.31 0.50 0.38 10\n",
" MNR 0.00 0.00 0.00 4\n",
" PNC 0.00 0.00 0.00 1\n",
" PRP 0.00 0.00 0.00 1\n",
" RG0 0.31 0.21 0.25 19\n",
" RG1 0.21 0.15 0.17 46\n",
" RG2 0.19 0.40 0.26 10\n",
" TMP 0.41 0.53 0.46 17\n",
" _ 0.10 0.06 0.07 33\n",
" RG0 0.50 0.11 0.17 19\n",
" RG1 0.18 0.20 0.19 46\n",
" RG2 0.27 0.40 0.32 10\n",
" TMP 0.50 0.59 0.54 17\n",
" _ 0.12 0.03 0.05 33\n",
"\n",
" micro avg 0.25 0.21 0.23 143\n",
" macro avg 0.14 0.16 0.15 143\n",
"weighted avg 0.22 0.21 0.21 143\n",
" micro avg 0.28 0.22 0.24 142\n",
" macro avg 0.19 0.18 0.17 142\n",
"weighted avg 0.26 0.22 0.21 142\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARG1 seems not to be NE tag.\n",
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
"/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: V seems not to be NE tag.\n",
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
"/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARGM-TMP seems not to be NE tag.\n",
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
"/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARG0 seems not to be NE tag.\n",
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
"/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARGM-LOC seems not to be NE tag.\n",
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
"/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARGM-MNR seems not to be NE tag.\n",
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
"/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARGM-FRQ seems not to be NE tag.\n",
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
"/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARG2 seems not to be NE tag.\n",
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
"/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARGM-PRP seems not to be NE tag.\n",
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
"/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARGM-CAU seems not to be NE tag.\n",
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n"
]
}
],
"source": [

Binary file not shown.

View File

@ -0,0 +1,270 @@
"""
qg_pipeline_static.py
~~~~~~~~~~~~~~~~~~~~~
Question Generation with an Encoder-Decoder LSTM
and symbolic NER & SRL features (static pipeline).
Dataset:
    TRAIN_FILE (JSON / JSON Lines), split into train/valid in-script; see `load_jsonl` for the record format.
"""
import json, random, numpy as np, tensorflow as tf
from collections import Counter
from pathlib import Path
from sklearn.model_selection import train_test_split
# ------------------------------------------------------------------------------
# 1. BASIC UTILITIES
# ------------------------------------------------------------------------------
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
TRAIN_FILE = "../dataset/dataset_qc.json"
VALID_RATIO = 0.10
MAX_CTX_LEN = 50
MAX_Q_LEN = 30
WORD_EMB_DIM = 128
BATCH = 32
EPOCHS = 15
SPECIALS_WORD = ("<pad>", "<unk>", "<bos>", "<eos>")
SPECIALS_TAG = ("<pad>",)
def load_jsonl(path):
    """Load records from a JSON Lines file or a plain JSON array (one dict per record)."""
    with open(path, encoding="utf-8") as f:
        text = f.read().strip()
    # dataset_qc.json in this commit is a pretty-printed JSON array, so accept
    # both that form and true JSONL (one JSON object per line).
    if text.startswith("["):
        return json.loads(text)
    records = []
    for line in text.splitlines():
        line = line.strip()
        if line:
            records.append(json.loads(line))
    return records
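# For reference, each record is expected to look like the entries committed in
# dataset/dataset_qc.json, e.g.:
#   {"tokens": ["Ia", "lantas", "diterima", "belajar", "di", "STOVIA", "."],
#    "ner":    ["O", "O", "O", "O", "O", "B-ORG", "O"],
#    "srl":    ["ARG0", "O", "V", "ARG1", "O", "ARGM-LOC", "O"],
#    "question": "Ia diterima belajar di ___.",
#    "answer": "STOVIA",
#    "type": "isian"}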
def build_vocab(list_of_seq, specials):
"""Bangun (token->id, id->token) dict dari kumpulan sekuens."""
counter = Counter(tok for seq in list_of_seq for tok in seq)
itos = list(specials) + [tok for tok, _ in counter.most_common()]
stoi = {tok: i for i, tok in enumerate(itos)}
return stoi, itos
def encode(seq, tbl, max_len):
ids = [tbl.get(tok, tbl["<unk>"]) for tok in seq]
return (ids + [tbl["<pad>"]] * max_len)[:max_len]
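# A minimal sketch of encode() (illustrative values, not from the dataset):
#   tbl = {"<pad>": 0, "<unk>": 1, "ia": 2, "makan": 3}
#   encode(["ia", "makan", "nasi"], tbl, 5)  ->  [2, 3, 1, 0, 0]
# Unknown tokens fall back to <unk>; the result is right-padded with <pad>
# (or truncated) to exactly max_len ids.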
# ------------------------------------------------------------------------------
# 2. DATA PREP
# ------------------------------------------------------------------------------
# def prepare_training_data(file_path):
# """Load → build vocab → encode to numpy arrays."""
# recs = load_jsonl(file_path)
# ctx, ner, srl, ques = [], [], [], []
# for r in recs:
# ctx.append(r["context_tokens"])
# ner.append(r["ner_tags"])
# srl.append(r["srl_tags"])
# # add <bos>, <eos>
# ques.append(["<bos>"] + r["question_tokens"] + ["<eos>"])
# # 2.1 vocab
# w2i_ctx, i2w_ctx = build_vocab(ctx, SPECIALS_WORD[:2]) # <pad>,<unk>
# w2i_q, i2w_q = build_vocab(ques, SPECIALS_WORD) # 4 specials
# t2i_ner, _ = build_vocab(ner, SPECIALS_TAG)
# t2i_srl, _ = build_vocab(srl, SPECIALS_TAG)
# # 2.2 encode & pad
# X_tok = np.array([encode(s, w2i_ctx, MAX_CTX_LEN) for s in ctx])
# X_ner = np.array([encode(s, t2i_ner, MAX_CTX_LEN) for s in ner])
# X_srl = np.array([encode(s, t2i_srl, MAX_CTX_LEN) for s in srl])
# Y_in = np.array([encode(s[:-1], w2i_q, MAX_Q_LEN) for s in ques]) # bos..last-1
# Y_out = np.array([encode(s[1:], w2i_q, MAX_Q_LEN) for s in ques]) # 2..eos
# return (
# X_tok,
# X_ner,
# X_srl,
# Y_in,
# Y_out,
# w2i_ctx,
# i2w_ctx,
# w2i_q,
# i2w_q,
# t2i_ner,
# t2i_srl,
# )
# --- replaces the old function above ---
def prepare_training_data(file_path):
recs = load_jsonl(file_path)
ctx, ner, srl, ques, span_st, span_ed = [], [], [], [], [], []
for r in recs:
tokens = r["tokens"]
ctx.append(tokens) # context_tokens
ner.append(r["ner"])
srl.append(r["srl"])
# --- compute the answer span automatically ---
ans_toks = r["answer"].split()
try:
start = next(
i
for i in range(len(tokens))
if tokens[i : i + len(ans_toks)] == ans_toks
)
end = start + len(ans_toks) - 1
except StopIteration:
raise ValueError(
f"Jawaban '{r['answer']}' tidak cocok dengan tokens {tokens}"
)
span_st.append(start)
span_ed.append(end)
# question tokens: simple whitespace tokenization
ques.append(["<bos>"] + r["question"].split() + ["<eos>"])
# ---------- build vocab exactly as before ----------
w2i_ctx, i2w_ctx = build_vocab(ctx, SPECIALS_WORD[:2])
w2i_q, i2w_q = build_vocab(ques, SPECIALS_WORD)
t2i_ner, _ = build_vocab(ner, SPECIALS_TAG)
t2i_srl, _ = build_vocab(srl, SPECIALS_TAG)
# ---------- encode ----------
X_tok = np.array([encode(s, w2i_ctx, MAX_CTX_LEN) for s in ctx])
X_ner = np.array([encode(s, t2i_ner, MAX_CTX_LEN) for s in ner])
X_srl = np.array([encode(s, t2i_srl, MAX_CTX_LEN) for s in srl])
Y_in = np.array([encode(s[:-1], w2i_q, MAX_Q_LEN) for s in ques])
Y_out = np.array([encode(s[1:], w2i_q, MAX_Q_LEN) for s in ques])
# keep the spans in case a copy mechanism is added later
spans = np.array(list(zip(span_st, span_ed))) # (N, 2)
return (
X_tok,
X_ner,
X_srl,
Y_in,
Y_out,
spans,
w2i_ctx,
i2w_ctx,
w2i_q,
i2w_q,
t2i_ner,
t2i_srl,
)
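# Worked example of the span search above, using a record from dataset_qc.json:
# for tokens ["Indische", "Partij", "didirikan", "pada", "25", "Desember", "1912", "."]
# and answer "Indische Partij", the answer splits into 2 tokens and the first
# window of length 2 that matches starts at index 0, so (start, end) = (0, 1).
# Note the match relies on the answer being an exact, whitespace-tokenized
# subsequence of `tokens`; otherwise the ValueError above is raised.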
print("> Loading dataset …")
(X_tok, X_ner, X_srl, Y_in, Y_out, w2i_ctx, i2w_ctx, w2i_q, i2w_q, t2i_ner, t2i_srl) = (
prepare_training_data(TRAIN_FILE)
)
train_idx, valid_idx = train_test_split(
np.arange(len(X_tok)), test_size=VALID_RATIO, random_state=SEED
)
def pick(arr, idx):
return arr[idx]
train_data = [pick(a, train_idx) for a in (X_tok, X_ner, X_srl, Y_in, Y_out)]
valid_data = [pick(a, valid_idx) for a in (X_tok, X_ner, X_srl, Y_in, Y_out)]
# ------------------------------------------------------------------------------
# 3. MODEL
# ------------------------------------------------------------------------------
def build_model(vocab_ctx, vocab_q, n_ner, n_srl):
tok_in = tf.keras.layers.Input((MAX_CTX_LEN,), name="tok")
ner_in = tf.keras.layers.Input((MAX_CTX_LEN,), name="ner")
srl_in = tf.keras.layers.Input((MAX_CTX_LEN,), name="srl")
dec_in = tf.keras.layers.Input((MAX_Q_LEN,), name="dec")
tok_emb = tf.keras.layers.Embedding(vocab_ctx, WORD_EMB_DIM, mask_zero=True)(tok_in)
ner_emb = tf.keras.layers.Embedding(n_ner, 32, mask_zero=True)(ner_in)
srl_emb = tf.keras.layers.Embedding(n_srl, 32, mask_zero=True)(srl_in)
enc_in = tf.keras.layers.Concatenate()([tok_emb, ner_emb, srl_emb])
enc_out, fwd_h, fwd_c, bwd_h, bwd_c = tf.keras.layers.Bidirectional(
tf.keras.layers.LSTM(WORD_EMB_DIM, return_sequences=True, return_state=True)
)(enc_in)
state_h = tf.keras.layers.Concatenate()([fwd_h, bwd_h])
state_c = tf.keras.layers.Concatenate()([fwd_c, bwd_c])
dec_emb = tf.keras.layers.Embedding(vocab_q, WORD_EMB_DIM, mask_zero=True)(dec_in)
dec_lstm = tf.keras.layers.LSTM(
WORD_EMB_DIM * 2, return_sequences=True, return_state=True
)
dec_out, _, _ = dec_lstm(dec_emb, initial_state=[state_h, state_c])
# Attention (dot)
score = tf.keras.layers.Dot(axes=[2, 2])([dec_out, enc_out])
attn_weights = tf.keras.layers.Activation("softmax")(score)
context_vec = tf.keras.layers.Dot(axes=[2, 1])([attn_weights, enc_out])
dec_cat = tf.keras.layers.Concatenate()([dec_out, context_vec])
outputs = tf.keras.layers.TimeDistributed(
tf.keras.layers.Dense(vocab_q, activation="softmax")
)(dec_cat)
mdl = tf.keras.Model([tok_in, ner_in, srl_in, dec_in], outputs)
mdl.compile(
optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
return mdl
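# Shape walk-through of the dot attention above (a sketch; 256 = 2 * WORD_EMB_DIM):
#   enc_out: (batch, MAX_CTX_LEN, 256)    dec_out: (batch, MAX_Q_LEN, 256)
#   score   = Dot([dec_out, enc_out])  -> (batch, MAX_Q_LEN, MAX_CTX_LEN)
#   attn    = softmax(score) over the encoder positions
#   context = Dot([attn, enc_out])     -> (batch, MAX_Q_LEN, 256)
#   dec_cat = concat(dec_out, context) -> (batch, MAX_Q_LEN, 512)
# so every decoder step attends over all encoder positions before the softmax
# projection onto the question vocabulary.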
print("> Building model …")
model = build_model(len(w2i_ctx), len(w2i_q), len(t2i_ner), len(t2i_srl))
model.summary(line_length=120)
# ------------------------------------------------------------------------------
# 4. DATA GENERATOR
# ------------------------------------------------------------------------------
def generator(data, batch=BATCH):
X_tok, X_ner, X_srl, Y_inp, Y_outp = data
n = len(X_tok)
while True:
idx = np.random.permutation(n)
for i in range(0, n, batch):
b = idx[i : i + batch]
yield [X_tok[b], X_ner[b], X_srl[b], Y_inp[b]], Y_outp[b][..., None]
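# Note: the trailing [..., None] in the yield above reshapes the targets to
# (batch, MAX_Q_LEN, 1) so each target id lines up with the
# (batch, MAX_Q_LEN, vocab_q) softmax output under sparse_categorical_crossentropy.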
steps_train = len(train_idx) // BATCH
steps_valid = len(valid_idx) // BATCH
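# Integer division drops any final partial batch from the per-epoch step count;
# since the generator keeps cycling (and reshuffles on each full pass), the
# leftover samples are still consumed on later epochs.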
# ------------------------------------------------------------------------------
# 5. TRAIN
# ------------------------------------------------------------------------------
print("> Training …")
_ = model.fit(
generator(train_data),
steps_per_epoch=steps_train,
validation_data=generator(valid_data),
validation_steps=steps_valid,
epochs=EPOCHS,
)
model.save("qg_lstm_static.h5")
print("✓ Model saved to qg_lstm_static.h5")

58
QC/test_model_qc.py Normal file
View File

@ -0,0 +1,58 @@
import numpy as np

MAX_CTX_LEN = 50
MAX_Q_LEN = 30
# NOTE: this script assumes `model`, `encode`, and the vocab tables
# (w2i_ctx, w2i_q, i2w_q, t2i_ner, t2i_srl) from the training pipeline are in scope.
# -- dummy placeholders for your NER/SRL models --------------------------------
def predict_ner(tokens):  # replace with your implementation
    return ["O"] * len(tokens)
def predict_srl(tokens):  # replace with your implementation
    return ["O"] * len(tokens)
# ------------------------------------------------------------------------------
def greedy_decode(context_tokens):
    """Generate one question (greedy decoding)."""
    # 6.1 Tagging
    ner_tags = predict_ner(context_tokens)
    srl_tags = predict_srl(context_tokens)
    # 6.2 Encode and add a batch dimension (encode returns a plain list,
    # so wrap it in np.array before indexing with [None])
    ctx_ids = np.array(encode(context_tokens, w2i_ctx, MAX_CTX_LEN))[None]
    ner_ids = np.array(encode(ner_tags, t2i_ner, MAX_CTX_LEN))[None]
    srl_ids = np.array(encode(srl_tags, t2i_srl, MAX_CTX_LEN))[None]
dec_seq = [w2i_q["<bos>"]]
for _ in range(MAX_Q_LEN - 1):
dec_pad = dec_seq + [w2i_q["<pad>"]] * (MAX_Q_LEN - len(dec_seq))
pred = model.predict(
[ctx_ids, ner_ids, srl_ids, np.array([dec_pad])], verbose=0
)
next_id = int(pred[0, len(dec_seq) - 1].argmax())
if i2w_q[next_id] == "<eos>":
break
dec_seq.append(next_id)
tokens_q = [i2w_q[t] for t in dec_seq[1:]]
return " ".join(tokens_q)
if __name__ == "__main__":
sample = [
"Keberagaman",
"potensi",
"sumber",
"daya",
"alam",
"Indonesia",
"tidak",
"lepas",
"dari",
"proses",
"geografis",
".",
]
print("\n[CTX]", " ".join(sample))
print("[Q] ", greedy_decode(sample))

View File

@ -1,8 +1,136 @@
{
"tokens": ["Barack", "Obama", "lahir", "di", "Hawaii", "."],
"ner": ["B-PER", "I-PER", "O", "O", "B-LOC", "O"],
"srl": ["B-ARG0", "I-ARG0", "B-V", "B-ARGM-LOC", "I-ARGM-LOC", "O"],
"question": "___ lahir di Hawaii.",
"answer": "Barack Obama",
"type": "isian"
}
[
{
"tokens": [
"R.",
"Soewardi",
"Soerjaningrat",
"adalah",
"putra",
"GPH",
"Soerjaningrat",
"dan",
"cucu",
"Pakualam",
"III",
"."
],
"ner": [
"B-PER",
"I-PER",
"I-PER",
"O",
"O",
"B-PER",
"I-PER",
"O",
"O",
"B-PER",
"I-PER",
"O"
],
"srl": [
"ARG0",
"ARG0",
"ARG0",
"V",
"ARG1",
"ARG1",
"ARG1",
"ARG1",
"ARG1",
"ARG1",
"ARG1",
"O"
],
"question": "___ adalah putra GPH Soerjaningrat dan cucu Pakualam III.",
"answer": "R. Soewardi Soerjaningrat",
"type": "isian"
},
{
"tokens": ["Ia", "lantas", "diterima", "belajar", "di", "STOVIA", "."],
"ner": ["O", "O", "O", "O", "O", "B-ORG", "O"],
"srl": ["ARG0", "O", "V", "ARG1", "O", "ARGM-LOC", "O"],
"question": "Ia diterima belajar di ___.",
"answer": "STOVIA",
"type": "isian"
},
{
"tokens": [
"Ia",
"bersama",
"Douwes",
"Dekker",
"dan",
"dr.",
"Cipto",
"Mangoenkoesoemo",
"lantas",
"mendirikan",
"Indische",
"Partij",
"pada",
"25",
"Desember",
"1912",
"."
],
"ner": [
"O",
"O",
"B-PER",
"I-PER",
"O",
"B-PER",
"I-PER",
"I-PER",
"O",
"O",
"B-ORG",
"I-ORG",
"O",
"B-DATE",
"I-DATE",
"I-DATE",
"O"
],
"srl": [
"ARG0",
"ARG0",
"ARG0",
"ARG0",
"ARG0",
"ARG0",
"ARG0",
"ARG0",
"O",
"V",
"ARG1",
"ARG1",
"O",
"ARGM-TMP",
"ARGM-TMP",
"ARGM-TMP",
"O"
],
"question": "Ia bersama Douwes Dekker dan dr. Cipto Mangoenkoesoemo lantas mendirikan ___ pada 25 Desember 1912.",
"answer": "Indische Partij",
"type": "isian"
},
{
"tokens": [
"Indische",
"Partij",
"didirikan",
"pada",
"25",
"Desember",
"1912",
"."
],
"ner": ["B-ORG", "I-ORG", "O", "O", "B-DATE", "I-DATE", "I-DATE", "O"],
"srl": ["ARG1", "ARG1", "V", "O", "ARGM-TMP", "ARGM-TMP", "ARGM-TMP", "O"],
"question": "Indische Partij didirikan pada tanggal ___.",
"answer": "25 Desember 1912",
"type": "isian"
}
]

View File

@ -2009,4 +2009,44 @@ memasak O V
nasi O ARG1
di O O
dapur B-LOC ARGM-LOC
. O O
. O O
R. B-PER ARG0
Soewardi I-PER ARG0
Soerjaningrat I-PER ARG0
adalah O V
putra O ARG1
GPH B-PER ARG1
Soerjaningrat I-PER ARG1
dan O ARG1
cucu O ARG1
Pakualam B-PER ARG1
III I-PER ARG1
. O O
Ia O ARG0
bersama O ARG0
Douwes B-PER ARG0
Dekker I-PER ARG0
dan O ARG0
dr. B-PER ARG0
Cipto I-PER ARG0
Mangoenkoesoemo I-PER ARG0
lantas O O
mendirikan O V
Indische B-ORG ARG1
Partij I-ORG ARG1
pada O O
25 B-DATE ARGM-TMP
Desember I-DATE ARGM-TMP
1912 I-DATE ARGM-TMP
. O O
Indische B-ORG ARG1
Partij I-ORG ARG1
didirikan O V
pada O O
25 B-DATE ARGM-TMP
Desember I-DATE ARGM-TMP
1912 I-DATE ARGM-TMP
. O O