diff --git a/NER_SRL/multi_task_lstm_ner_srl_model.keras b/NER_SRL/multi_task_lstm_ner_srl_model.keras index 6058696..e3f1449 100644 Binary files a/NER_SRL/multi_task_lstm_ner_srl_model.keras and b/NER_SRL/multi_task_lstm_ner_srl_model.keras differ diff --git a/NER_SRL/new_lstm_ner_srl.ipynb b/NER_SRL/new_lstm_ner_srl.ipynb index 3a3fbcb..dc0a8b5 100644 --- a/NER_SRL/new_lstm_ner_srl.ipynb +++ b/NER_SRL/new_lstm_ner_srl.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 118, + "execution_count": 13, "id": "fb106e20", "metadata": {}, "outputs": [], @@ -19,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 14, "id": "00347a5f", "metadata": {}, "outputs": [ @@ -54,7 +54,6 @@ " for line in f:\n", " line = line.strip()\n", " if not line:\n", - " # Jika baris kosong → akhir kalimat\n", " if tokens:\n", " data.append({\n", " \"tokens\": tokens,\n", @@ -82,7 +81,15 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": null, + "id": "3793950a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 15, "id": "ac8eb374", "metadata": {}, "outputs": [], @@ -103,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 16, "id": "80356f1f", "metadata": {}, "outputs": [], @@ -130,7 +137,7 @@ }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 17, "id": "fe219c96", "metadata": {}, "outputs": [], @@ -138,25 +145,25 @@ "X_train, X_test, y_ner_train, y_ner_test, y_srl_train, y_srl_test = train_test_split(\n", " X, y_ner, y_srl, \n", " test_size=0.20, \n", - " random_state=42, # supaya reproducible\n", - " shuffle=True # acak baris\n", + " random_state=42,\n", + " shuffle=True \n", ")" ] }, { "cell_type": "code", - "execution_count": 123, + "execution_count": 18, "id": "7a9636b6", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
Model: \"functional_13\"\n", + "Model: \"functional_1\"\n", "
\n" ], "text/plain": [ - "\u001b[1mModel: \"functional_13\"\u001b[0m\n" + "\u001b[1mModel: \"functional_1\"\u001b[0m\n" ] }, "metadata": {}, @@ -168,19 +175,19 @@ "┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n", "┃ Layer (type) ┃ Output Shape ┃ Param # ┃ Connected to ┃\n", "┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n", - "│ input_layer_13 │ (None, 50) │ 0 │ - │\n", + "│ input_layer_1 │ (None, 50) │ 0 │ - │\n", "│ (InputLayer) │ │ │ │\n", "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", - "│ embedding_13 │ (None, 50, 64) │ 44,544 │ input_layer_13[0… │\n", + "│ embedding_1 │ (None, 50, 64) │ 44,544 │ input_layer_1[0]… │\n", "│ (Embedding) │ │ │ │\n", "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", - "│ bidirectional_13 │ (None, 50, 128) │ 66,048 │ embedding_13[0][… │\n", + "│ bidirectional_1 │ (None, 50, 128) │ 66,048 │ embedding_1[0][0] │\n", "│ (Bidirectional) │ │ │ │\n", "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", - "│ ner_output │ (None, 50, 25) │ 3,225 │ bidirectional_13… │\n", + "│ ner_output │ (None, 50, 25) │ 3,225 │ bidirectional_1[… │\n", "│ (TimeDistributed) │ │ │ │\n", "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", - "│ srl_output │ (None, 50, 20) │ 2,580 │ bidirectional_13… │\n", + "│ srl_output │ (None, 50, 18) │ 2,322 │ bidirectional_1[… │\n", "│ (TimeDistributed) │ │ │ │\n", "└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n", "\n" @@ -189,19 +196,19 @@ "┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n", "┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mConnected to \u001b[0m\u001b[1m \u001b[0m┃\n", "┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n", - "│ input_layer_13 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n", + "│ input_layer_1 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n", "│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n", "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", - "│ embedding_13 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m44,544\u001b[0m │ input_layer_13[\u001b[38;5;34m0\u001b[0m… │\n", + "│ embedding_1 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m44,544\u001b[0m │ input_layer_1[\u001b[38;5;34m0\u001b[0m]… │\n", "│ (\u001b[38;5;33mEmbedding\u001b[0m) │ │ │ │\n", "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", - "│ bidirectional_13 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m66,048\u001b[0m │ embedding_13[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n", + "│ bidirectional_1 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m66,048\u001b[0m │ embedding_1[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n", "│ (\u001b[38;5;33mBidirectional\u001b[0m) │ │ │ │\n", "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", - "│ ner_output │ 
(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m25\u001b[0m) │ \u001b[38;5;34m3,225\u001b[0m │ bidirectional_13… │\n", + "│ ner_output │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m25\u001b[0m) │ \u001b[38;5;34m3,225\u001b[0m │ bidirectional_1[\u001b[38;5;34m…\u001b[0m │\n", "│ (\u001b[38;5;33mTimeDistributed\u001b[0m) │ │ │ │\n", "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", - "│ srl_output │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m20\u001b[0m) │ \u001b[38;5;34m2,580\u001b[0m │ bidirectional_13… │\n", + "│ srl_output │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m18\u001b[0m) │ \u001b[38;5;34m2,322\u001b[0m │ bidirectional_1[\u001b[38;5;34m…\u001b[0m │\n", "│ (\u001b[38;5;33mTimeDistributed\u001b[0m) │ │ │ │\n", "└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n" ] @@ -212,11 +219,11 @@ { "data": { "text/html": [ - "Total params: 116,397 (454.68 KB)\n", + "Total params: 116,139 (453.67 KB)\n", "\n" ], "text/plain": [ - "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m116,397\u001b[0m (454.68 KB)\n" + "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m116,139\u001b[0m (453.67 KB)\n" ] }, "metadata": {}, @@ -225,11 +232,11 @@ { "data": { "text/html": [ - "Trainable params: 116,397 (454.68 KB)\n", + "Trainable params: 116,139 (453.67 KB)\n", "\n" ], "text/plain": [ - "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m116,397\u001b[0m (454.68 KB)\n" + "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m116,139\u001b[0m (453.67 KB)\n" ] }, "metadata": {}, @@ -253,25 +260,25 @@ "output_type": "stream", "text": [ "Epoch 1/10\n", - "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 19ms/step - loss: 3.3010 - ner_output_accuracy: 0.8807 - ner_output_loss: 1.5617 - srl_output_accuracy: 0.7456 - srl_output_loss: 1.7393 - val_loss: 0.7284 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2466 - val_srl_output_accuracy: 0.8300 - val_srl_output_loss: 0.4818\n", + "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 19ms/step - loss: 3.2850 - ner_output_accuracy: 0.8700 - ner_output_loss: 1.6767 - srl_output_accuracy: 0.7518 - srl_output_loss: 1.6083 - val_loss: 0.7275 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2555 - val_srl_output_accuracy: 0.8450 - val_srl_output_loss: 0.4720\n", "Epoch 2/10\n", - "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.7355 - ner_output_accuracy: 0.9569 - ner_output_loss: 0.2279 - srl_output_accuracy: 0.8297 - srl_output_loss: 0.5076 - val_loss: 0.6655 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2323 - val_srl_output_accuracy: 0.8506 - val_srl_output_loss: 0.4332\n", + "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.7622 - ner_output_accuracy: 0.9528 - ner_output_loss: 0.2458 - srl_output_accuracy: 0.8296 - srl_output_loss: 0.5163 - val_loss: 0.6534 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2296 - val_srl_output_accuracy: 0.8531 - val_srl_output_loss: 0.4238\n", "Epoch 3/10\n", - "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.7041 - ner_output_accuracy: 0.9522 - ner_output_loss: 0.2219 - 
srl_output_accuracy: 0.8488 - srl_output_loss: 0.4822 - val_loss: 0.6368 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2232 - val_srl_output_accuracy: 0.8744 - val_srl_output_loss: 0.4135\n", + "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.6875 - ner_output_accuracy: 0.9572 - ner_output_loss: 0.2126 - srl_output_accuracy: 0.8496 - srl_output_loss: 0.4750 - val_loss: 0.6327 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2273 - val_srl_output_accuracy: 0.8688 - val_srl_output_loss: 0.4054\n", "Epoch 4/10\n", - "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.6864 - ner_output_accuracy: 0.9520 - ner_output_loss: 0.2184 - srl_output_accuracy: 0.8548 - srl_output_loss: 0.4680 - val_loss: 0.6078 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2193 - val_srl_output_accuracy: 0.8769 - val_srl_output_loss: 0.3885\n", + "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 11ms/step - loss: 0.6103 - ner_output_accuracy: 0.9533 - ner_output_loss: 0.2114 - srl_output_accuracy: 0.8772 - srl_output_loss: 0.3988 - val_loss: 0.6009 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2137 - val_srl_output_accuracy: 0.8662 - val_srl_output_loss: 0.3872\n", "Epoch 5/10\n", - "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.6304 - ner_output_accuracy: 0.9545 - ner_output_loss: 0.2009 - srl_output_accuracy: 0.8675 - srl_output_loss: 0.4295 - val_loss: 0.5727 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2015 - val_srl_output_accuracy: 0.8812 - val_srl_output_loss: 0.3711\n", + "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.6757 - ner_output_accuracy: 0.9486 - ner_output_loss: 0.2281 - srl_output_accuracy: 0.8582 - srl_output_loss: 0.4476 - val_loss: 0.5690 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2040 - val_srl_output_accuracy: 0.8781 - val_srl_output_loss: 0.3650\n", "Epoch 6/10\n", - "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 11ms/step - loss: 0.5679 - ner_output_accuracy: 0.9557 - ner_output_loss: 0.1749 - srl_output_accuracy: 0.8783 - srl_output_loss: 0.3930 - val_loss: 0.5471 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.1956 - val_srl_output_accuracy: 0.8831 - val_srl_output_loss: 0.3515\n", + "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 11ms/step - loss: 0.5864 - ner_output_accuracy: 0.9477 - ner_output_loss: 0.2198 - srl_output_accuracy: 0.8898 - srl_output_loss: 0.3666 - val_loss: 0.5458 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.1961 - val_srl_output_accuracy: 0.8875 - val_srl_output_loss: 0.3497\n", "Epoch 7/10\n", - "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.5000 - ner_output_accuracy: 0.9587 - ner_output_loss: 0.1634 - srl_output_accuracy: 0.8917 - srl_output_loss: 0.3366 - val_loss: 0.5364 - val_ner_output_accuracy: 0.9513 - val_ner_output_loss: 0.1899 - val_srl_output_accuracy: 0.8850 - val_srl_output_loss: 0.3465\n", + "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 
0.5877 - ner_output_accuracy: 0.9506 - ner_output_loss: 0.1914 - srl_output_accuracy: 0.8773 - srl_output_loss: 0.3963 - val_loss: 0.5260 - val_ner_output_accuracy: 0.9525 - val_ner_output_loss: 0.1898 - val_srl_output_accuracy: 0.8875 - val_srl_output_loss: 0.3362\n", "Epoch 8/10\n", - "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.5526 - ner_output_accuracy: 0.9541 - ner_output_loss: 0.1791 - srl_output_accuracy: 0.8840 - srl_output_loss: 0.3735 - val_loss: 0.5054 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.1799 - val_srl_output_accuracy: 0.8963 - val_srl_output_loss: 0.3256\n", + "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 11ms/step - loss: 0.5046 - ner_output_accuracy: 0.9536 - ner_output_loss: 0.1756 - srl_output_accuracy: 0.8912 - srl_output_loss: 0.3290 - val_loss: 0.5094 - val_ner_output_accuracy: 0.9531 - val_ner_output_loss: 0.1829 - val_srl_output_accuracy: 0.8881 - val_srl_output_loss: 0.3265\n", "Epoch 9/10\n", - "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.5094 - ner_output_accuracy: 0.9561 - ner_output_loss: 0.1701 - srl_output_accuracy: 0.8915 - srl_output_loss: 0.3393 - val_loss: 0.4881 - val_ner_output_accuracy: 0.9512 - val_ner_output_loss: 0.1707 - val_srl_output_accuracy: 0.9013 - val_srl_output_loss: 0.3174\n", + "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.4807 - ner_output_accuracy: 0.9539 - ner_output_loss: 0.1704 - srl_output_accuracy: 0.9021 - srl_output_loss: 0.3103 - val_loss: 0.4876 - val_ner_output_accuracy: 0.9531 - val_ner_output_loss: 0.1719 - val_srl_output_accuracy: 0.9025 - val_srl_output_loss: 0.3156\n", "Epoch 10/10\n", - "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.4633 - ner_output_accuracy: 0.9524 - ner_output_loss: 0.1675 - srl_output_accuracy: 0.9092 - srl_output_loss: 0.2959 - val_loss: 0.4804 - val_ner_output_accuracy: 0.9531 - val_ner_output_loss: 0.1597 - val_srl_output_accuracy: 0.9050 - val_srl_output_loss: 0.3206\n" + "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.4134 - ner_output_accuracy: 0.9634 - ner_output_loss: 0.1350 - srl_output_accuracy: 0.9245 - srl_output_loss: 0.2784 - val_loss: 0.4587 - val_ner_output_accuracy: 0.9550 - val_ner_output_loss: 0.1598 - val_srl_output_accuracy: 0.9087 - val_srl_output_loss: 0.2989\n" ] } ], @@ -317,7 +324,7 @@ }, { "cell_type": "code", - "execution_count": 124, + "execution_count": 19, "id": "3a55990b", "metadata": {}, "outputs": [ @@ -325,23 +332,32 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'loss': 0.48035523295402527, 'compile_metrics': 0.15973526239395142, 'ner_output_loss': 0.32061997056007385, 'srl_output_loss': 0.953125}\n", + "{'loss': 0.45865434408187866, 'compile_metrics': 0.159775510430336, 'ner_output_loss': 0.29887881875038147, 'srl_output_loss': 0.9550000429153442}\n", + "{0: 'B-DATE', 1: 'B-ETH', 2: 'B-EVENT', 3: 'B-LOC', 4: 'B-MIN', 5: 'B-MISC', 6: 'B-ORG', 7: 'B-PER', 8: 'B-QUANT', 9: 'B-REL', 10: 'B-RES', 11: 'B-TERM', 12: 'B-TIME', 13: 'I-DATE', 14: 'I-ETH', 15: 'I-EVENT', 16: 'I-LOC', 17: 'I-MISC', 18: 'I-ORG', 19: 'I-PER', 20: 'I-QUANT', 21: 'I-RES', 22: 'I-TERM', 23: 'I-TIME', 24: 'O'}\n", "\n", 
"📊 [NER] Classification Report (test set):\n", " precision recall f1-score support\n", "\n", - " DATE 0.25 0.12 0.17 8\n", + " DATE 0.33 0.12 0.18 8\n", " EVENT 0.00 0.00 0.00 1\n", - " LOC 0.50 0.04 0.07 28\n", + " LOC 1.00 0.04 0.07 28\n", " ORG 0.00 0.00 0.00 4\n", " PER 0.00 0.00 0.00 2\n", - " TIME 0.20 0.10 0.13 10\n", + " TIME 0.50 0.30 0.37 10\n", "\n", - " micro avg 0.27 0.06 0.09 53\n", - " macro avg 0.16 0.04 0.06 53\n", - "weighted avg 0.34 0.06 0.09 53\n", + " micro avg 0.50 0.09 0.16 53\n", + " macro avg 0.31 0.08 0.10 53\n", + "weighted avg 0.67 0.09 0.13 53\n", "\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n" + ] } ], "source": [ @@ -375,7 +391,7 @@ "y_pred_ner, y_pred_srl = model.predict(X_test, verbose=0)\n", "\n", "true_ner, pred_ner = decode(y_pred_ner, y_ner_test, idx2tag_ner)\n", - "\n", + "print(idx2tag_ner)\n", "print(\"\\n📊 [NER] Classification Report (test set):\")\n", "print(classification_report(true_ner, pred_ner, digits=2))\n", "\n", @@ -400,7 +416,7 @@ }, { "cell_type": "code", - "execution_count": 125, + "execution_count": 20, "id": "547d1533", "metadata": {}, "outputs": [ @@ -408,28 +424,53 @@ "name": "stdout", "output_type": "stream", "text": [ - "{0: 'ARG0', 1: 'ARG1', 2: 'ARG2', 3: 'ARG3', 4: 'ARGM-BNF', 5: 'ARGM-CAU', 6: 'ARGM-COM', 7: 'ARGM-FRQ', 8: 'ARGM-LOC', 9: 'ARGM-MNR', 10: 'ARGM-MOD', 11: 'ARGM-NEG', 12: 'ARGM-PNC', 13: 'ARGM-PRD', 14: 'ARGM-PRP', 15: 'ARGM-SRC', 16: 'ARGM-TMP', 17: 'O', 18: 'R-ARG1', 19: 'V'}\n", + "{0: 'ARG0', 1: 'ARG1', 2: 'ARG2', 3: 'ARG3', 4: 'ARGM-BNF', 5: 'ARGM-CAU', 6: 'ARGM-COM', 7: 'ARGM-FRQ', 8: 'ARGM-LOC', 9: 'ARGM-MNR', 10: 'ARGM-MOD', 11: 'ARGM-NEG', 12: 'ARGM-PRP', 13: 'ARGM-SRC', 14: 'ARGM-TMP', 15: 'O', 16: 'R-ARG1', 17: 'V'}\n", "\n", "📊 [SRL] Classification Report (test set):\n", " precision recall f1-score support\n", "\n", " CAU 0.00 0.00 0.00 1\n", " FRQ 0.00 0.00 0.00 1\n", - " LOC 0.36 0.40 0.38 10\n", + " LOC 0.31 0.50 0.38 10\n", " MNR 0.00 0.00 0.00 4\n", - " PNC 0.00 0.00 0.00 1\n", " PRP 0.00 0.00 0.00 1\n", - " RG0 0.31 0.21 0.25 19\n", - " RG1 0.21 0.15 0.17 46\n", - " RG2 0.19 0.40 0.26 10\n", - " TMP 0.41 0.53 0.46 17\n", - " _ 0.10 0.06 0.07 33\n", + " RG0 0.50 0.11 0.17 19\n", + " RG1 0.18 0.20 0.19 46\n", + " RG2 0.27 0.40 0.32 10\n", + " TMP 0.50 0.59 0.54 17\n", + " _ 0.12 0.03 0.05 33\n", "\n", - " micro avg 0.25 0.21 0.23 143\n", - " macro avg 0.14 0.16 0.15 143\n", - "weighted avg 0.22 0.21 0.21 143\n", + " micro avg 0.28 0.22 0.24 142\n", + " macro avg 0.19 0.18 0.17 142\n", + "weighted avg 0.26 0.22 0.21 142\n", "\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARG1 seems not to be NE tag.\n", + " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n", + "/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: V seems not to be NE tag.\n", + " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n", + 
"/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARGM-TMP seems not to be NE tag.\n", + " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n", + "/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARG0 seems not to be NE tag.\n", + " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n", + "/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARGM-LOC seems not to be NE tag.\n", + " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n", + "/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARGM-MNR seems not to be NE tag.\n", + " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n", + "/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARGM-FRQ seems not to be NE tag.\n", + " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n", + "/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARG2 seems not to be NE tag.\n", + " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n", + "/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARGM-PRP seems not to be NE tag.\n", + " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n", + "/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: ARGM-CAU seems not to be NE tag.\n", + " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n" + ] } ], "source": [ diff --git a/NER_SRL/tag2idx_srl.pkl b/NER_SRL/tag2idx_srl.pkl index 4d7390e..0d09d56 100644 Binary files a/NER_SRL/tag2idx_srl.pkl and b/NER_SRL/tag2idx_srl.pkl differ diff --git a/QC/question_generation_train.py b/QC/question_generation_train.py new file mode 100644 index 0000000..ed95d29 --- /dev/null +++ b/QC/question_generation_train.py @@ -0,0 +1,270 @@ +""" +qg_pipeline_static.py +~~~~~~~~~~~~~~~~~~~~~ +Question Generation Encoder‑Decoder LSTM +dengan fitur simbolik NER & SRL (pipeline statis). + +Datasets: + – train.jsonl / valid.jsonl (lihat format di fungsi `load_jsonl`) +""" + +import json, random, numpy as np, tensorflow as tf +from collections import Counter +from pathlib import Path +from sklearn.model_selection import train_test_split + +# ------------------------------------------------------------------------------ +# 1. UTILITAS DASAR +# ------------------------------------------------------------------------------ + +SEED = 42 +random.seed(SEED) +np.random.seed(SEED) +tf.random.set_seed(SEED) + +TRAIN_FILE = "../dataset/dataset_qc.json" +VALID_RATIO = 0.10 +MAX_CTX_LEN = 50 +MAX_Q_LEN = 30 +WORD_EMB_DIM = 128 +BATCH = 32 +EPOCHS = 15 + +SPECIALS_WORD = ("", " ", " ", " ") +SPECIALS_TAG = (" ",) + + +def load_jsonl(path): + """Muatt satu file JSON‑Lines. 
+
+
+def load_jsonl(path):
+    """Load the dataset: either JSON-Lines (one dict per line) or a single
+    pretty-printed JSON array, which is what dataset _qc.json uses."""
+    with open(path, encoding="utf-8") as f:
+        text = f.read().strip()
+    if text.startswith("["):
+        return json.loads(text)
+    return [json.loads(line) for line in text.splitlines() if line.strip()]
+
+
+def build_vocab(list_of_seq, specials):
+    """Build (token->id, id->token) tables from a collection of sequences."""
+    counter = Counter(tok for seq in list_of_seq for tok in seq)
+    itos = list(specials) + [tok for tok, _ in counter.most_common()]
+    stoi = {tok: i for i, tok in enumerate(itos)}
+    return stoi, itos
+
+
+def encode(seq, tbl, max_len):
+    """Ids plus right-padding to max_len; PAD is index 0 so mask_zero works.
+    Unseen tokens fall back to UNK (word tables) or PAD (tag tables)."""
+    ids = [tbl.get(tok, tbl.get(UNK, tbl[PAD])) for tok in seq]
+    return (ids + [tbl[PAD]] * max_len)[:max_len]
+
+
+# ------------------------------------------------------------------------------
+# 2. DATA PREP
+# ------------------------------------------------------------------------------
+
+
+def prepare_training_data(file_path):
+    """Load -> locate answer spans -> build vocabularies -> encode to numpy."""
+    recs = load_jsonl(file_path)
+
+    ctx, ner, srl, ques, span_st, span_ed = [], [], [], [], [], []
+    for r in recs:
+        tokens = r["tokens"]
+        ctx.append(tokens)  # context tokens
+
+        ner.append(r["ner"])
+        srl.append(r["srl"])
+
+        # --- locate the answer span automatically ---
+        ans_toks = r["answer"].split()
+        try:
+            start = next(
+                i
+                for i in range(len(tokens))
+                if tokens[i : i + len(ans_toks)] == ans_toks
+            )
+            end = start + len(ans_toks) - 1
+        except StopIteration:
+            raise ValueError(
+                f"Answer '{r['answer']}' does not match tokens {tokens}"
+            )
+        span_st.append(start)
+        span_ed.append(end)
+
+        # question tokens: simple whitespace tokenisation, wrapped in BOS/EOS
+        ques.append([BOS] + r["question"].split() + [EOS])
+
+    # ---------- build vocabularies ----------
+    w2i_ctx, i2w_ctx = build_vocab(ctx, SPECIALS_WORD[:2])  # PAD, UNK only
+    w2i_q, i2w_q = build_vocab(ques, SPECIALS_WORD)
+    t2i_ner, _ = build_vocab(ner, SPECIALS_TAG)
+    t2i_srl, _ = build_vocab(srl, SPECIALS_TAG)
+
+    # ---------- encode ----------
+    X_tok = np.array([encode(s, w2i_ctx, MAX_CTX_LEN) for s in ctx])
+    X_ner = np.array([encode(s, t2i_ner, MAX_CTX_LEN) for s in ner])
+    X_srl = np.array([encode(s, t2i_srl, MAX_CTX_LEN) for s in srl])
+
+    Y_in = np.array([encode(s[:-1], w2i_q, MAX_Q_LEN) for s in ques])  # BOS .. last-1
+    Y_out = np.array([encode(s[1:], w2i_q, MAX_Q_LEN) for s in ques])  # 2nd .. EOS
+
+    # keep the spans in case a copy mechanism is added later
+    spans = np.array(list(zip(span_st, span_ed)))  # (N, 2)
+
+    return (
+        X_tok,
+        X_ner,
+        X_srl,
+        Y_in,
+        Y_out,
+        spans,
+        w2i_ctx,
+        i2w_ctx,
+        w2i_q,
+        i2w_q,
+        t2i_ner,
+        t2i_srl,
+    )
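+
+# Teacher forcing in one line: for the question "Ia diterima belajar di ___."
+# the decoder INPUT (Y_in) is the BOS-prefixed sequence without EOS, and the
+# TARGET (Y_out) is the same sequence shifted one step left, without BOS:
+#
+#   Y_in  : <bos> Ia       diterima belajar di   ___.
+#   Y_out : Ia    diterima belajar  di      ___. <eos>
+#
+# At every time step the model sees the gold previous token and predicts the
+# next one; the s[:-1] / s[1:] slicing above implements exactly this.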
+
+
+print("> Loading dataset …")
+
+(
+    X_tok, X_ner, X_srl, Y_in, Y_out, spans,
+    w2i_ctx, i2w_ctx, w2i_q, i2w_q, t2i_ner, t2i_srl,
+) = prepare_training_data(TRAIN_FILE)
+
+train_idx, valid_idx = train_test_split(
+    np.arange(len(X_tok)), test_size=VALID_RATIO, random_state=SEED
+)
+
+
+def pick(arr, idx):
+    return arr[idx]
+
+
+train_data = [pick(a, train_idx) for a in (X_tok, X_ner, X_srl, Y_in, Y_out)]
+valid_data = [pick(a, valid_idx) for a in (X_tok, X_ner, X_srl, Y_in, Y_out)]
+
+# ------------------------------------------------------------------------------
+# 3. MODEL
+# ------------------------------------------------------------------------------
+
+
+def build_model(vocab_ctx, vocab_q, n_ner, n_srl):
+    tok_in = tf.keras.layers.Input((MAX_CTX_LEN,), name="tok")
+    ner_in = tf.keras.layers.Input((MAX_CTX_LEN,), name="ner")
+    srl_in = tf.keras.layers.Input((MAX_CTX_LEN,), name="srl")
+    dec_in = tf.keras.layers.Input((MAX_Q_LEN,), name="dec")
+
+    # word + tag embeddings are concatenated into one encoder input
+    tok_emb = tf.keras.layers.Embedding(vocab_ctx, WORD_EMB_DIM, mask_zero=True)(tok_in)
+    ner_emb = tf.keras.layers.Embedding(n_ner, 32, mask_zero=True)(ner_in)
+    srl_emb = tf.keras.layers.Embedding(n_srl, 32, mask_zero=True)(srl_in)
+
+    enc_in = tf.keras.layers.Concatenate()([tok_emb, ner_emb, srl_emb])
+    enc_out, fwd_h, fwd_c, bwd_h, bwd_c = tf.keras.layers.Bidirectional(
+        tf.keras.layers.LSTM(WORD_EMB_DIM, return_sequences=True, return_state=True)
+    )(enc_in)
+
+    # concatenated forward/backward states initialise the decoder
+    state_h = tf.keras.layers.Concatenate()([fwd_h, bwd_h])
+    state_c = tf.keras.layers.Concatenate()([fwd_c, bwd_c])
+
+    dec_emb = tf.keras.layers.Embedding(vocab_q, WORD_EMB_DIM, mask_zero=True)(dec_in)
+    dec_lstm = tf.keras.layers.LSTM(
+        WORD_EMB_DIM * 2, return_sequences=True, return_state=True
+    )
+    dec_out, _, _ = dec_lstm(dec_emb, initial_state=[state_h, state_c])
+
+    # dot-product attention over the encoder states
+    score = tf.keras.layers.Dot(axes=[2, 2])([dec_out, enc_out])
+    attn_weights = tf.keras.layers.Activation("softmax")(score)
+    context_vec = tf.keras.layers.Dot(axes=[2, 1])([attn_weights, enc_out])
+
+    dec_cat = tf.keras.layers.Concatenate()([dec_out, context_vec])
+    outputs = tf.keras.layers.TimeDistributed(
+        tf.keras.layers.Dense(vocab_q, activation="softmax")
+    )(dec_cat)
+
+    mdl = tf.keras.Model([tok_in, ner_in, srl_in, dec_in], outputs)
+    mdl.compile(
+        optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
+    )
+    return mdl
+
+
+print("> Building model …")
+model = build_model(len(w2i_ctx), len(w2i_q), len(t2i_ner), len(t2i_srl))
+model.summary(line_length=120)
+
+# ------------------------------------------------------------------------------
+# 4. DATA GENERATOR
+# ------------------------------------------------------------------------------
+
+
+def generator(data, batch=BATCH):
+    X_tok, X_ner, X_srl, Y_inp, Y_outp = data
+    n = len(X_tok)
+    while True:
+        idx = np.random.permutation(n)
+        for i in range(0, n, batch):
+            b = idx[i : i + batch]
+            yield [X_tok[b], X_ner[b], X_srl[b], Y_inp[b]], Y_outp[b][..., None]
+
+
+steps_train = max(1, len(train_idx) // BATCH)  # guard against tiny datasets
+steps_valid = max(1, len(valid_idx) // BATCH)
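+
+# Optional: the hand-rolled generator above can be swapped for tf.data. A
+# minimal sketch (not used below); it yields the same
+# ((tok, ner, srl, dec_in), target) structure that model.fit expects.
+#
+# def make_dataset(data, batch=BATCH):
+#     X_tok, X_ner, X_srl, Y_inp, Y_outp = data
+#     ds = tf.data.Dataset.from_tensor_slices(
+#         ((X_tok, X_ner, X_srl, Y_inp), Y_outp[..., None])
+#     )
+#     return ds.shuffle(len(X_tok), seed=SEED).batch(batch).repeat()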
+
+# ------------------------------------------------------------------------------
+# 5. TRAIN
+# ------------------------------------------------------------------------------
+
+print("> Training …")
+_ = model.fit(
+    generator(train_data),
+    steps_per_epoch=steps_train,
+    validation_data=generator(valid_data),
+    validation_steps=steps_valid,
+    epochs=EPOCHS,
+)
+
+model.save("qg_lstm_static.h5")
+print("✓ Model saved to qg_lstm_static.h5")
diff --git a/QC/test_model_qc.py b/QC/test_model_qc.py
new file mode 100644
index 0000000..f36c74f
--- /dev/null
+++ b/QC/test_model_qc.py
@@ -0,0 +1,58 @@
+import numpy as np
+
+MAX_CTX_LEN = 50
+MAX_Q_LEN = 30
+
+# This script assumes the objects built in question_generation_train.py
+# (`model`, `encode`, `w2i_ctx`, `w2i_q`, `i2w_q`, `t2i_ner`, `t2i_srl`)
+# are already in scope, e.g. when run in the same interactive session.
+
+
+# -- dummy placeholders for the NER/SRL taggers --------------------------------
+def predict_ner(tokens):  # replace with the real implementation
+    return ["O"] * len(tokens)
+
+
+def predict_srl(tokens):  # replace with the real implementation
+    return ["O"] * len(tokens)
+
+
+# ------------------------------------------------------------------------------
+
+
+def greedy_decode(context_tokens):
+    """Generate one question (greedy decoding)."""
+    # 6.1 Tagging
+    ner_tags = predict_ner(context_tokens)
+    srl_tags = predict_srl(context_tokens)
+
+    # 6.2 Encode (np.array so [None] adds a batch dimension)
+    ctx_ids = np.array(encode(context_tokens, w2i_ctx, MAX_CTX_LEN))[None]
+    ner_ids = np.array(encode(ner_tags, t2i_ner, MAX_CTX_LEN))[None]
+    srl_ids = np.array(encode(srl_tags, t2i_srl, MAX_CTX_LEN))[None]
+
+    dec_seq = [w2i_q["<bos>"]]
+    for _ in range(MAX_Q_LEN - 1):
+        dec_pad = dec_seq + [w2i_q["<pad>"]] * (MAX_Q_LEN - len(dec_seq))
+        pred = model.predict(
+            [ctx_ids, ner_ids, srl_ids, np.array([dec_pad])], verbose=0
+        )
+        next_id = int(pred[0, len(dec_seq) - 1].argmax())
+        if i2w_q[next_id] == "<eos>":
+            break
+        dec_seq.append(next_id)
+
+    tokens_q = [i2w_q[t] for t in dec_seq[1:]]
+    return " ".join(tokens_q)
+
+
+if __name__ == "__main__":
+    sample = [
+        "Keberagaman",
+        "potensi",
+        "sumber",
+        "daya",
+        "alam",
+        "Indonesia",
+        "tidak",
+        "lepas",
+        "dari",
+        "proses",
+        "geografis",
+        ".",
+    ]
+    print("\n[CTX]", " ".join(sample))
+    print("[Q]  ", greedy_decode(sample))
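+
+# Hedged sketch: wiring in the real multi-task tagger. The repo ships
+# NER_SRL/multi_task_lstm_ner_srl_model.keras (updated in this commit); the
+# word2idx / idx2tag_ner / idx2tag_srl lookup tables are assumptions here --
+# adapt the names and paths to however the notebook saves them.
+#
+# import tensorflow as tf
+# tagger = tf.keras.models.load_model("../NER_SRL/multi_task_lstm_ner_srl_model.keras")
+#
+# def predict_ner_srl(tokens):
+#     ids = [word2idx.get(t, 1) for t in tokens]            # 1 = OOV (assumed)
+#     x = np.array([(ids + [0] * MAX_CTX_LEN)[:MAX_CTX_LEN]])
+#     p_ner, p_srl = tagger.predict(x, verbose=0)
+#     ner = [idx2tag_ner[i] for i in p_ner[0].argmax(-1)][: len(tokens)]
+#     srl = [idx2tag_srl[i] for i in p_srl[0].argmax(-1)][: len(tokens)]
+#     return ner, srl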
diff --git a/dataset/dataset _qc.json b/dataset/dataset _qc.json
index 6af892a..61de299 100644
--- a/dataset/dataset _qc.json
+++ b/dataset/dataset _qc.json
@@ -1,8 +1,136 @@
-{
-  "tokens": ["Barack", "Obama", "lahir", "di", "Hawaii", "."],
-  "ner": ["B-PER", "I-PER", "O", "O", "B-LOC", "O"],
-  "srl": ["B-ARG0", "I-ARG0", "B-V", "B-ARGM-LOC", "I-ARGM-LOC", "O"],
-  "question": "___ lahir di Hawaii.",
-  "answer": "Barack Obama",
-  "type": "isian"
-}
+[
+  {
+    "tokens": [
+      "R.",
+      "Soewardi",
+      "Soerjaningrat",
+      "adalah",
+      "putra",
+      "GPH",
+      "Soerjaningrat",
+      "dan",
+      "cucu",
+      "Pakualam",
+      "III",
+      "."
+    ],
+    "ner": [
+      "B-PER",
+      "I-PER",
+      "I-PER",
+      "O",
+      "O",
+      "B-PER",
+      "I-PER",
+      "O",
+      "O",
+      "B-PER",
+      "I-PER",
+      "O"
+    ],
+    "srl": [
+      "ARG0",
+      "ARG0",
+      "ARG0",
+      "V",
+      "ARG1",
+      "ARG1",
+      "ARG1",
+      "ARG1",
+      "ARG1",
+      "ARG1",
+      "ARG1",
+      "O"
+    ],
+    "question": "___ adalah putra GPH Soerjaningrat dan cucu Pakualam III.",
+    "answer": "R. Soewardi Soerjaningrat",
+    "type": "isian"
+  },
+  {
+    "tokens": ["Ia", "lantas", "diterima", "belajar", "di", "STOVIA", "."],
+    "ner": ["O", "O", "O", "O", "O", "B-ORG", "O"],
+    "srl": ["ARG0", "O", "V", "ARG1", "O", "ARGM-LOC", "O"],
+    "question": "Ia diterima belajar di ___.",
+    "answer": "STOVIA",
+    "type": "isian"
+  },
+  {
+    "tokens": [
+      "Ia",
+      "bersama",
+      "Douwes",
+      "Dekker",
+      "dan",
+      "dr.",
+      "Cipto",
+      "Mangoenkoesoemo",
+      "lantas",
+      "mendirikan",
+      "Indische",
+      "Partij",
+      "pada",
+      "25",
+      "Desember",
+      "1912",
+      "."
+    ],
+    "ner": [
+      "O",
+      "O",
+      "B-PER",
+      "I-PER",
+      "O",
+      "B-PER",
+      "I-PER",
+      "I-PER",
+      "O",
+      "O",
+      "B-ORG",
+      "I-ORG",
+      "O",
+      "B-DATE",
+      "I-DATE",
+      "I-DATE",
+      "O"
+    ],
+    "srl": [
+      "ARG0",
+      "ARG0",
+      "ARG0",
+      "ARG0",
+      "ARG0",
+      "ARG0",
+      "ARG0",
+      "ARG0",
+      "O",
+      "V",
+      "ARG1",
+      "ARG1",
+      "O",
+      "ARGM-TMP",
+      "ARGM-TMP",
+      "ARGM-TMP",
+      "O"
+    ],
+    "question": "Ia bersama Douwes Dekker dan dr. Cipto Mangoenkoesoemo lantas mendirikan ___ pada 25 Desember 1912.",
+    "answer": "Indische Partij",
+    "type": "isian"
+  },
+  {
+    "tokens": [
+      "Indische",
+      "Partij",
+      "didirikan",
+      "pada",
+      "25",
+      "Desember",
+      "1912",
+      "."
+    ],
+    "ner": ["B-ORG", "I-ORG", "O", "O", "B-DATE", "I-DATE", "I-DATE", "O"],
+    "srl": ["ARG1", "ARG1", "V", "O", "ARGM-TMP", "ARGM-TMP", "ARGM-TMP", "O"],
+    "question": "Indische Partij didirikan pada tanggal ___.",
+    "answer": "25 Desember 1912",
+    "type": "isian"
+  }
+]
diff --git a/dataset/dataset_ner_srl.tsv b/dataset/dataset_ner_srl.tsv
index 70f227a..598d9ba 100644
--- a/dataset/dataset_ner_srl.tsv
+++ b/dataset/dataset_ner_srl.tsv
@@ -2009,4 +2009,44 @@ memasak	O	V
 nasi	O	ARG1
 di	O	O
 dapur	B-LOC	ARGM-LOC
-.	O	O
+.	O	O
+
+R.	B-PER	ARG0
+Soewardi	I-PER	ARG0
+Soerjaningrat	I-PER	ARG0
+adalah	O	V
+putra	O	ARG1
+GPH	B-PER	ARG1
+Soerjaningrat	I-PER	ARG1
+dan	O	ARG1
+cucu	O	ARG1
+Pakualam	B-PER	ARG1
+III	I-PER	ARG1
+.	O	O
+
+Ia	O	ARG0
+bersama	O	ARG0
+Douwes	B-PER	ARG0
+Dekker	I-PER	ARG0
+dan	O	ARG0
+dr.	B-PER	ARG0
+Cipto	I-PER	ARG0
+Mangoenkoesoemo	I-PER	ARG0
+lantas	O	O
+mendirikan	O	V
+Indische	B-ORG	ARG1
+Partij	I-ORG	ARG1
+pada	O	O
+25	B-DATE	ARGM-TMP
+Desember	I-DATE	ARGM-TMP
+1912	I-DATE	ARGM-TMP
+.	O	O
+
+Indische	B-ORG	ARG1
+Partij	I-ORG	ARG1
+didirikan	O	V
+pada	O	O
+25	B-DATE	ARGM-TMP
+Desember	I-DATE	ARGM-TMP
+1912	I-DATE	ARGM-TMP
+.	O	O
\ No newline at end of file