diff --git a/NER_SRL/multi_task_bilstm_model.keras b/NER_SRL/multi_task_bilstm_model.keras
index cc9d560..27cc11e 100644
Binary files a/NER_SRL/multi_task_bilstm_model.keras and b/NER_SRL/multi_task_bilstm_model.keras differ
diff --git a/NER_SRL/multi_task_lstm_ner_srl_model.keras b/NER_SRL/multi_task_lstm_ner_srl_model.keras
new file mode 100644
index 0000000..6058696
Binary files /dev/null and b/NER_SRL/multi_task_lstm_ner_srl_model.keras differ
diff --git a/NER_SRL/new_lstm_ner_srl.ipynb b/NER_SRL/new_lstm_ner_srl.ipynb
new file mode 100644
index 0000000..3a3fbcb
--- /dev/null
+++ b/NER_SRL/new_lstm_ner_srl.ipynb
@@ -0,0 +1,468 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 118,
+   "id": "fb106e20",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json, pickle\n",
+    "import numpy as np\n",
+    "from keras.models import Model\n",
+    "from keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense\n",
+    "from keras.preprocessing.sequence import pad_sequences\n",
+    "from keras.utils import to_categorical\n",
+    "from seqeval.metrics import classification_report\n",
+    "from sklearn.model_selection import train_test_split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 119,
+   "id": "00347a5f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total sentences: 156\n",
+      "Total tokens: 1850\n"
+     ]
+    }
+   ],
+   "source": [
+    "# load the data\n",
+    "# with open(\"../dataset/dataset_ner_srl.json\", encoding=\"utf-8\") as f:\n",
+    "#     data = json.load(f)\n",
+    "\n",
+    "# sentences = [[tok.lower() for tok in item[\"tokens\"]] for item in data]\n",
+    "# labels_ner = [item[\"labels_ner\"] for item in data]\n",
+    "# labels_srl = [item[\"labels_srl\"] for item in data]\n",
+    "\n",
+    "# for i, label_seq in enumerate(labels_ner):\n",
+    "#     if \"V\" in label_seq:\n",
+    "#         print(f\"Label 'V' found at index {i}: {label_seq}\")\n",
+    "\n",
+    "\n",
+    "data = []\n",
+    "\n",
+    "with open(\"../dataset/dataset_ner_srl.tsv\", encoding=\"utf-8\") as f:\n",
+    "    tokens, ner_labels, srl_labels = [], [], []\n",
+    "\n",
+    "    for line in f:\n",
+    "        line = line.strip()\n",
+    "        if not line:\n",
+    "            # blank line → end of sentence\n",
+    "            if tokens:\n",
+    "                data.append({\n",
+    "                    \"tokens\": tokens,\n",
+    "                    \"labels_ner\": ner_labels,\n",
+    "                    \"labels_srl\": srl_labels\n",
+    "                })\n",
+    "                tokens, ner_labels, srl_labels = [], [], []\n",
+    "        else:\n",
+    "            token, ner, srl = line.split(\"\\t\")\n",
+    "            tokens.append(token)\n",
+    "            ner_labels.append(ner)\n",
+    "            srl_labels.append(srl)\n",
+    "\n",
+    "# same preprocessing as before\n",
+    "sentences = [[tok.lower() for tok in item[\"tokens\"]] for item in data]\n",
+    "labels_ner = [item[\"labels_ner\"] for item in data]\n",
+    "labels_srl = [item[\"labels_srl\"] for item in data]\n",
+    "\n",
+    "total_kalimat = len(data)\n",
+    "total_token = sum(len(item[\"tokens\"]) for item in data)\n",
+    "\n",
+    "print(\"Total sentences:\", total_kalimat)\n",
+    "print(\"Total tokens:\", total_token)"
+   ]
+  },
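+  {
+   "cell_type": "markdown",
+   "id": "c0ffee01",
+   "metadata": {},
+   "source": [
+    "The loader above expects a CoNLL-style TSV: one `token<TAB>ner<TAB>srl` triple per line, with a blank line closing each sentence. A purely illustrative fragment (not actual rows from `dataset_ner_srl.tsv`):\n",
+    "\n",
+    "```\n",
+    "aku\tO\tARG0\n",
+    "lahir\tO\tV\n",
+    "di\tO\tO\n",
+    "indonesia\tB-LOC\tARGM-LOC\n",
+    "```"
+   ]
+  },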
+  {
+   "cell_type": "code",
+   "execution_count": 120,
+   "id": "ac8eb374",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# build the vocabulary and tag indices\n",
+    "words = sorted({w for s in sentences for w in s})\n",
+    "ner_tags = sorted({t for seq in labels_ner for t in seq})\n",
+    "srl_tags = sorted({t for seq in labels_srl for t in seq})\n",
+    "\n",
+    "word2idx = {w: i + 2 for i, w in enumerate(words)}\n",
+    "word2idx[\"PAD\"], word2idx[\"UNK\"] = 0, 1\n",
+    "\n",
+    "tag2idx_ner = {t: i for i, t in enumerate(ner_tags)}\n",
+    "tag2idx_srl = {t: i for i, t in enumerate(srl_tags)}\n",
+    "idx2tag_ner = {i: t for t, i in tag2idx_ner.items()}\n",
+    "idx2tag_srl = {i: t for t, i in tag2idx_srl.items()}"
+   ]
+  },
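+  {
+   "cell_type": "markdown",
+   "id": "c0ffee02",
+   "metadata": {},
+   "source": [
+    "Ids 0 and 1 are reserved for `PAD` and `UNK`, so real words start at index 2, and anything outside the training vocabulary falls back to `UNK` at encoding time. A minimal sketch of the round trip (using the mappings defined above; `zzz` is a made-up out-of-vocabulary example):\n",
+    "\n",
+    "```python\n",
+    "idx2word = {i: w for w, i in word2idx.items()}\n",
+    "ids = [word2idx.get(w, word2idx[\"UNK\"]) for w in [\"aku\", \"zzz\"]]\n",
+    "print([idx2word[i] for i in ids])  # e.g. ['aku', 'UNK']\n",
+    "```"
+   ]
+  },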
+  {
+   "cell_type": "code",
+   "execution_count": 121,
+   "id": "80356f1f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# encode tokens and labels as integer ids, then pad to a fixed length\n",
+    "\n",
+    "X = [[word2idx.get(w, word2idx[\"UNK\"]) for w in s] for s in sentences]\n",
+    "y_ner = [[tag2idx_ner[t] for t in seq] for seq in labels_ner]\n",
+    "y_srl = [[tag2idx_srl[t] for t in seq] for seq in labels_srl]\n",
+    "\n",
+    "maxlen = 50\n",
+    "\n",
+    "X = pad_sequences(X, maxlen=maxlen, padding=\"post\", value=word2idx[\"PAD\"])\n",
+    "y_ner = pad_sequences(y_ner, maxlen=maxlen, padding=\"post\", value=tag2idx_ner[\"O\"])\n",
+    "y_srl = pad_sequences(y_srl, maxlen=maxlen, padding=\"post\", value=tag2idx_srl[\"O\"])\n",
+    "\n",
+    "y_ner = [to_categorical(seq, num_classes=len(tag2idx_ner)) for seq in y_ner]\n",
+    "y_srl = [to_categorical(seq, num_classes=len(tag2idx_srl)) for seq in y_srl]\n",
+    "\n",
+    "X = np.array(X)\n",
+    "y_ner = np.array(y_ner)\n",
+    "y_srl = np.array(y_srl)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 122,
+   "id": "fe219c96",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train, X_test, y_ner_train, y_ner_test, y_srl_train, y_srl_test = train_test_split(\n",
+    "    X, y_ner, y_srl,\n",
+    "    test_size=0.20,\n",
+    "    random_state=42,  # for reproducibility\n",
+    "    shuffle=True  # shuffle the rows\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 123,
+   "id": "7a9636b6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+ "\u001b[1mModel: \"functional_13\"\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+     "data": {
+      "text/plain": [
+       "┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
+       "┃\u001b[1m \u001b[0m\u001b[1mLayer (type)       \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape      \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m   Param #\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mConnected to      \u001b[0m\u001b[1m \u001b[0m┃\n",
+       "┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
+       "│ input_layer_13      │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m)        │          \u001b[38;5;34m0\u001b[0m │ -                 │\n",
+       "│ (\u001b[38;5;33mInputLayer\u001b[0m)        │                   │            │                   │\n",
+       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
+       "│ embedding_13        │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m64\u001b[0m)    │     \u001b[38;5;34m44,544\u001b[0m │ input_layer_13[\u001b[38;5;34m0\u001b[0m… │\n",
+       "│ (\u001b[38;5;33mEmbedding\u001b[0m)         │                   │            │                   │\n",
+       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
+       "│ bidirectional_13    │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m128\u001b[0m)   │     \u001b[38;5;34m66,048\u001b[0m │ embedding_13[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n",
+       "│ (\u001b[38;5;33mBidirectional\u001b[0m)     │                   │            │                   │\n",
+       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
+       "│ ner_output          │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m25\u001b[0m)    │      \u001b[38;5;34m3,225\u001b[0m │ bidirectional_13… │\n",
+       "│ (\u001b[38;5;33mTimeDistributed\u001b[0m)   │                   │            │                   │\n",
+       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
+       "│ srl_output          │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m20\u001b[0m)    │      \u001b[38;5;34m2,580\u001b[0m │ bidirectional_13… │\n",
+       "│ (\u001b[38;5;33mTimeDistributed\u001b[0m)   │                   │            │                   │\n",
+       "└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m116,397\u001b[0m (454.68 KB)\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m116,397\u001b[0m (454.68 KB)\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoch 1/10\n",
+      "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 19ms/step - loss: 3.3010 - ner_output_accuracy: 0.8807 - ner_output_loss: 1.5617 - srl_output_accuracy: 0.7456 - srl_output_loss: 1.7393 - val_loss: 0.7284 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2466 - val_srl_output_accuracy: 0.8300 - val_srl_output_loss: 0.4818\n",
+      "Epoch 2/10\n",
+      "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.7355 - ner_output_accuracy: 0.9569 - ner_output_loss: 0.2279 - srl_output_accuracy: 0.8297 - srl_output_loss: 0.5076 - val_loss: 0.6655 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2323 - val_srl_output_accuracy: 0.8506 - val_srl_output_loss: 0.4332\n",
+      "Epoch 3/10\n",
+      "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.7041 - ner_output_accuracy: 0.9522 - ner_output_loss: 0.2219 - srl_output_accuracy: 0.8488 - srl_output_loss: 0.4822 - val_loss: 0.6368 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2232 - val_srl_output_accuracy: 0.8744 - val_srl_output_loss: 0.4135\n",
+      "Epoch 4/10\n",
+      "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.6864 - ner_output_accuracy: 0.9520 - ner_output_loss: 0.2184 - srl_output_accuracy: 0.8548 - srl_output_loss: 0.4680 - val_loss: 0.6078 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2193 - val_srl_output_accuracy: 0.8769 - val_srl_output_loss: 0.3885\n",
+      "Epoch 5/10\n",
+      "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.6304 - ner_output_accuracy: 0.9545 - ner_output_loss: 0.2009 - srl_output_accuracy: 0.8675 - srl_output_loss: 0.4295 - val_loss: 0.5727 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2015 - val_srl_output_accuracy: 0.8812 - val_srl_output_loss: 0.3711\n",
+      "Epoch 6/10\n",
+      "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 11ms/step - loss: 0.5679 - ner_output_accuracy: 0.9557 - ner_output_loss: 0.1749 - srl_output_accuracy: 0.8783 - srl_output_loss: 0.3930 - val_loss: 0.5471 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.1956 - val_srl_output_accuracy: 0.8831 - val_srl_output_loss: 0.3515\n",
+      "Epoch 7/10\n",
+      "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.5000 - ner_output_accuracy: 0.9587 - ner_output_loss: 0.1634 - srl_output_accuracy: 0.8917 - srl_output_loss: 0.3366 - val_loss: 0.5364 - val_ner_output_accuracy: 0.9513 - val_ner_output_loss: 0.1899 - val_srl_output_accuracy: 0.8850 - val_srl_output_loss: 0.3465\n",
+      "Epoch 8/10\n",
+      "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.5526 - ner_output_accuracy: 0.9541 - ner_output_loss: 0.1791 - srl_output_accuracy: 0.8840 - srl_output_loss: 0.3735 - val_loss: 0.5054 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.1799 - val_srl_output_accuracy: 0.8963 - 
val_srl_output_loss: 0.3256\n",
+      "Epoch 9/10\n",
+      "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.5094 - ner_output_accuracy: 0.9561 - ner_output_loss: 0.1701 - srl_output_accuracy: 0.8915 - srl_output_loss: 0.3393 - val_loss: 0.4881 - val_ner_output_accuracy: 0.9512 - val_ner_output_loss: 0.1707 - val_srl_output_accuracy: 0.9013 - val_srl_output_loss: 0.3174\n",
+      "Epoch 10/10\n",
+      "\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.4633 - ner_output_accuracy: 0.9524 - ner_output_loss: 0.1675 - srl_output_accuracy: 0.9092 - srl_output_loss: 0.2959 - val_loss: 0.4804 - val_ner_output_accuracy: 0.9531 - val_ner_output_loss: 0.1597 - val_srl_output_accuracy: 0.9050 - val_srl_output_loss: 0.3206\n"
+     ]
+    }
+   ],
+   "source": [
+    "input_layer = Input(shape=(maxlen,))\n",
+    "embed = Embedding(len(word2idx), 64)(input_layer)\n",
+    "bilstm = Bidirectional(LSTM(64, return_sequences=True))(embed)\n",
+    "\n",
+    "ner_output = TimeDistributed(\n",
+    "    Dense(len(tag2idx_ner), activation=\"softmax\"), name=\"ner_output\"\n",
+    ")(bilstm)\n",
+    "srl_output = TimeDistributed(\n",
+    "    Dense(len(tag2idx_srl), activation=\"softmax\"), name=\"srl_output\"\n",
+    ")(bilstm)\n",
+    "\n",
+    "model = Model(inputs=input_layer, outputs=[ner_output, srl_output])\n",
+    "model.compile(\n",
+    "    optimizer=\"adam\",\n",
+    "    loss={\n",
+    "        \"ner_output\": \"categorical_crossentropy\",\n",
+    "        \"srl_output\": \"categorical_crossentropy\",\n",
+    "    },\n",
+    "    metrics={\"ner_output\": \"accuracy\", \"srl_output\": \"accuracy\"},\n",
+    ")\n",
+    "model.summary()\n",
+    "model.fit(\n",
+    "    X_train, {\"ner_output\": y_ner_train, \"srl_output\": y_srl_train},\n",
+    "    validation_data=(X_test, {\"ner_output\": y_ner_test, \"srl_output\": y_srl_test}),\n",
+    "    batch_size=2,\n",
+    "    epochs=10,\n",
+    "    verbose=1\n",
+    ")\n",
+    "\n",
+    "# ---------- Save the artifacts ----------\n",
+    "model.save(\"multi_task_lstm_ner_srl_model.keras\")\n",
+    "with open(\"word2idx.pkl\", \"wb\") as f:\n",
+    "    pickle.dump(word2idx, f)\n",
+    "with open(\"tag2idx_ner.pkl\", \"wb\") as f:\n",
+    "    pickle.dump(tag2idx_ner, f)\n",
+    "with open(\"tag2idx_srl.pkl\", \"wb\") as f:\n",
+    "    pickle.dump(tag2idx_srl, f)\n"
+   ]
+  },
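+  {
+   "cell_type": "markdown",
+   "id": "c0ffee03",
+   "metadata": {},
+   "source": [
+    "A caveat on the numbers above: every sequence is padded to length 50 and the padded positions carry the `O` label, so the per-token accuracies are inflated by easy padding targets. One way to exclude them (a minimal sketch, not what this notebook runs): set `mask_zero=True` on the embedding, so that index 0 — already reserved for `PAD` — produces a mask that Keras propagates to the losses and metrics of both heads:\n",
+    "\n",
+    "```python\n",
+    "embed = Embedding(len(word2idx), 64, mask_zero=True)(input_layer)  # mask PAD (id 0)\n",
+    "```"
+   ]
+  },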
+  {
+   "cell_type": "code",
+   "execution_count": 124,
+   "id": "3a55990b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'loss': 0.48035523295402527, 'compile_metrics': 0.15973526239395142, 'ner_output_loss': 0.32061997056007385, 'srl_output_loss': 0.953125}\n",
+      "\n",
+      "📊 [NER] Classification Report (test set):\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "        DATE       0.25      0.12      0.17         8\n",
+      "       EVENT       0.00      0.00      0.00         1\n",
+      "         LOC       0.50      0.04      0.07        28\n",
+      "         ORG       0.00      0.00      0.00         4\n",
+      "         PER       0.00      0.00      0.00         2\n",
+      "        TIME       0.20      0.10      0.13        10\n",
+      "\n",
+      "   micro avg       0.27      0.06      0.09        53\n",
+      "   macro avg       0.16      0.04      0.06        53\n",
+      "weighted avg       0.34      0.06      0.09        53\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# evaluation\n",
+    "\n",
+    "results = model.evaluate(\n",
+    "    X_test,\n",
+    "    {\"ner_output\": y_ner_test, \"srl_output\": y_srl_test},\n",
+    "    verbose=0\n",
+    ")\n",
+    "\n",
+    "# NOTE: in Keras 3, `model.metrics_names` no longer lines up one-to-one with\n",
+    "# the values returned by `evaluate` (see the stray 'compile_metrics' key in\n",
+    "# the output above), so treat this dict as a rough view only.\n",
+    "print(dict(zip(model.metrics_names, results)))\n",
+    "\n",
+    "def decode(pred, true, idx2tag):\n",
+    "    out_true, out_pred = [], []\n",
+    "    for p_seq, t_seq in zip(pred, true):\n",
+    "        t_labels, p_labels = [], []\n",
+    "        for p_tok, t_tok in zip(p_seq, t_seq):\n",
+    "            if t_tok.sum() == 0:  # meant to skip PAD tokens; padding was encoded as 'O', so this rarely fires\n",
+    "                continue\n",
+    "            t_labels.append(idx2tag[t_tok.argmax()])\n",
+    "            p_labels.append(idx2tag[p_tok.argmax()])\n",
+    "        out_true.append(t_labels)\n",
+    "        out_pred.append(p_labels)\n",
+    "    return out_true, out_pred\n",
+    "\n",
+    "# predict on the test set only\n",
+    "y_pred_ner, y_pred_srl = model.predict(X_test, verbose=0)\n",
+    "\n",
+    "true_ner, pred_ner = decode(y_pred_ner, y_ner_test, idx2tag_ner)\n",
+    "\n",
+    "print(\"\\n📊 [NER] Classification Report (test set):\")\n",
+    "print(classification_report(true_ner, pred_ner, digits=2))"
+   ]
+  },
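+  {
+   "cell_type": "markdown",
+   "id": "c0ffee04",
+   "metadata": {},
+   "source": [
+    "A caveat on the SRL report below: `seqeval` expects IOB-style tags (`B-LOC`, `I-LOC`, ...). The SRL labels here are bare (`ARG0`, `V`, ...), so seqeval treats the first character as a chunk prefix — which is why the report lists `RG0`/`RG1` instead of `ARG0`/`ARG1`, and `_` for `V`. A plain token-level report sidesteps this; a minimal sketch, runnable after the next cell defines `true_srl`/`pred_srl`:\n",
+    "\n",
+    "```python\n",
+    "from sklearn.metrics import classification_report as token_report\n",
+    "flat_true = [t for seq in true_srl for t in seq]\n",
+    "flat_pred = [p for seq in pred_srl for p in seq]\n",
+    "print(token_report(flat_true, flat_pred, zero_division=0))\n",
+    "```"
+   ]
+  },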
+  {
+   "cell_type": "code",
+   "execution_count": 125,
+   "id": "547d1533",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{0: 'ARG0', 1: 'ARG1', 2: 'ARG2', 3: 'ARG3', 4: 'ARGM-BNF', 5: 'ARGM-CAU', 6: 'ARGM-COM', 7: 'ARGM-FRQ', 8: 'ARGM-LOC', 9: 'ARGM-MNR', 10: 'ARGM-MOD', 11: 'ARGM-NEG', 12: 'ARGM-PNC', 13: 'ARGM-PRD', 14: 'ARGM-PRP', 15: 'ARGM-SRC', 16: 'ARGM-TMP', 17: 'O', 18: 'R-ARG1', 19: 'V'}\n",
+      "\n",
+      "📊 [SRL] Classification Report (test set):\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "         CAU       0.00      0.00      0.00         1\n",
+      "         FRQ       0.00      0.00      0.00         1\n",
+      "         LOC       0.36      0.40      0.38        10\n",
+      "         MNR       0.00      0.00      0.00         4\n",
+      "         PNC       0.00      0.00      0.00         1\n",
+      "         PRP       0.00      0.00      0.00         1\n",
+      "         RG0       0.31      0.21      0.25        19\n",
+      "         RG1       0.21      0.15      0.17        46\n",
+      "         RG2       0.19      0.40      0.26        10\n",
+      "         TMP       0.41      0.53      0.46        17\n",
+      "           _       0.10      0.06      0.07        33\n",
+      "\n",
+      "   micro avg       0.25      0.21      0.23       143\n",
+      "   macro avg       0.14      0.16      0.15       143\n",
+      "weighted avg       0.22      0.21      0.21       143\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "true_srl, pred_srl = decode(y_pred_srl, y_srl_test, idx2tag_srl)\n",
+    "print(idx2tag_srl)\n",
+    "print(\"\\n📊 [SRL] Classification Report (test set):\")\n",
+    "print(classification_report(true_srl, pred_srl, digits=2))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "myenv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/NER_SRL/tag2idx_ner.pkl b/NER_SRL/tag2idx_ner.pkl
index d80508f..0981659 100644
Binary files a/NER_SRL/tag2idx_ner.pkl and b/NER_SRL/tag2idx_ner.pkl differ
diff --git a/NER_SRL/tag2idx_srl.pkl b/NER_SRL/tag2idx_srl.pkl
index 9468de9..4d7390e 100644
Binary files a/NER_SRL/tag2idx_srl.pkl and b/NER_SRL/tag2idx_srl.pkl differ
diff --git a/NER_SRL/test_model.py b/NER_SRL/test_model.py
index 55c879c..e5d22e4 100644
--- a/NER_SRL/test_model.py
+++ b/NER_SRL/test_model.py
@@ -6,7 +6,7 @@
 from keras.models import load_model
 from keras.preprocessing.sequence import pad_sequences
 
-model = load_model("multi_task_bilstm_model.keras")
+model = load_model("multi_task_lstm_ner_srl_model.keras")
 
 with open("word2idx.pkl", "rb") as f:
     word2idx = pickle.load(f)
@@ -28,6 +28,7 @@ max = 50
 def predict_sentence(sentence):
     tokens = sentence.strip().lower().split()
     print(tokens)
+
     x = [word2idx.get(w.lower(), word2idx["UNK"]) for w in tokens]
     x = pad_sequences([x], maxlen=50, padding="post", value=word2idx["PAD"])
 
@@ -35,18 +36,22 @@ def predict_sentence(sentence):
     pred_labels_ner = np.argmax(preds[0], axis=-1)[0]
     pred_labels_srl = np.argmax(preds[1], axis=-1)[0]
 
-    print("Hasil prediksi NER:")
-    for token, label_idx in zip(tokens, pred_labels_ner[: len(tokens)]):
-        print(f"{token}\t{idx2tag_ner[int(label_idx)]}")
+    result = {
+        "tokens": tokens,
+        "labels_ner": [
+            idx2tag_ner[int(label)] for label in pred_labels_ner[: len(tokens)]
+        ],
+        "labels_srl": [
+            idx2tag_srl[int(label)] for label in pred_labels_srl[: len(tokens)]
+        ],
+    }
 
-    print("\nHasil prediksi SRL:")
-    for token, label_idx in zip(tokens, pred_labels_srl[: len(tokens)]):
-        print(f"{token}\t{idx2tag_srl[int(label_idx)]}")
+    return result
 
 
 if __name__ == "__main__":
     try:
-        sentence = "aku lahir di indonesia"
-        predict_sentence(sentence)
+        sentence = "sore ini aku pergi ke indonesia"
+        print(predict_sentence(sentence))
     except KeyboardInterrupt:
         print("\n\nSelesai.")
diff --git a/NER_SRL/word2idx.pkl b/NER_SRL/word2idx.pkl
index fae6223..7d4857e 100644
Binary files a/NER_SRL/word2idx.pkl and b/NER_SRL/word2idx.pkl differ