{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "fb106e20",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json, pickle\n",
    "import numpy as np\n",
    "from keras.models import Model\n",
    "from keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "from keras.utils import to_categorical\n",
    "from seqeval.metrics import classification_report\n",
    "from sklearn.model_selection import train_test_split\n",
    "from tensorflow.keras.metrics import CategoricalAccuracy\n",
    "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
    "import matplotlib.pyplot as plt\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "00347a5f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total kalimat: 159\n",
      "Total token: 1891\n"
     ]
    }
   ],
   "source": [
    "# Read the token-level TSV: each row is token<TAB>ner<TAB>srl, and a blank\n",
    "# line closes a sentence. Every sentence becomes one dict in `data`.\n",
    "data = []\n",
    "\n",
    "with open(\"../dataset/dataset_ner_srl.tsv\", encoding=\"utf-8\") as f:\n",
    "    tokens, ner_labels, srl_labels = [], [], []\n",
    "\n",
    "    for line_no, line in enumerate(f, start=1):\n",
    "        line = line.strip()\n",
    "        if not line:\n",
    "            # Blank line: close the current sentence. Repeated blank lines\n",
    "            # are ignored because `tokens` is already empty by then.\n",
    "            if tokens:\n",
    "                data.append({\n",
    "                    \"tokens\": tokens,\n",
    "                    \"labels_ner\": ner_labels,\n",
    "                    \"labels_srl\": srl_labels\n",
    "                })\n",
    "                tokens, ner_labels, srl_labels = [], [], []\n",
    "            continue\n",
    "\n",
    "        fields = line.split(\"\\t\")\n",
    "        if len(fields) != 3:\n",
    "            # Fail loudly and point at the offending row instead of a bare\n",
    "            # tuple-unpacking error.\n",
    "            raise ValueError(\n",
    "                f\"line {line_no}: expected 3 tab-separated fields \"\n",
    "                f\"(token, ner, srl), got {len(fields)}: {line!r}\"\n",
    "            )\n",
    "        token, ner, srl = fields\n",
    "        tokens.append(token)\n",
    "        ner_labels.append(ner)\n",
    "        srl_labels.append(srl)\n",
    "\n",
    "    # Flush the last sentence when the file does not end with a blank line;\n",
    "    # otherwise that sentence would be silently dropped.\n",
    "    if tokens:\n",
    "        data.append({\n",
    "            \"tokens\": tokens,\n",
    "            \"labels_ner\": ner_labels,\n",
    "            \"labels_srl\": srl_labels\n",
    "        })\n",
    "\n",
    "# Same preprocessing as before: lowercase the tokens, keep labels unchanged.\n",
    "sentences = [[tok.lower() for tok in item[\"tokens\"]] for item in data]\n",
    "labels_ner = [item[\"labels_ner\"] for item in data]\n",
    "labels_srl = [item[\"labels_srl\"] for item in data]\n",
    "\n",
    "# Corpus size: number of sentences and total number of tokens.\n",
    "total_kalimat = len(data)\n",
    "total_token = sum(len(item[\"tokens\"]) for item in data)\n",
    "\n",
    "print(\"Total kalimat:\", total_kalimat)\n",
    "print(\"Total token:\", total_token)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3793950a",
   "metadata": {},
   "outputs": [],
"source": [] }, { "cell_type": "code", "execution_count": 43, "id": "ac8eb374", "metadata": {}, "outputs": [], "source": [ "# tagging \n", "words = sorted({w for s in sentences for w in s})\n", "ner_tags = sorted({t for seq in labels_ner for t in seq})\n", "srl_tags = sorted({t for seq in labels_srl for t in seq})\n", "\n", "word2idx = {w: i + 2 for i, w in enumerate(words)}\n", "word2idx[\"PAD\"], word2idx[\"UNK\"] = 0, 1\n", "\n", "tag2idx_ner = {t: i for i, t in enumerate(ner_tags)}\n", "tag2idx_srl = {t: i for i, t in enumerate(srl_tags)}\n", "idx2tag_ner = {i: t for t, i in tag2idx_ner.items()}\n", "idx2tag_srl = {i: t for t, i in tag2idx_srl.items()}" ] }, { "cell_type": "code", "execution_count": 44, "id": "80356f1f", "metadata": {}, "outputs": [], "source": [ "# encoding\n", "\n", "X = [[word2idx.get(w, word2idx[\"UNK\"]) for w in s] for s in sentences]\n", "y_ner = [[tag2idx_ner[t] for t in seq] for seq in labels_ner]\n", "y_srl = [[tag2idx_srl[t] for t in seq] for seq in labels_srl]\n", "\n", "maxlen = 50 \n", "\n", "X = pad_sequences(X, maxlen=maxlen, padding=\"post\", value=word2idx[\"PAD\"])\n", "y_ner = pad_sequences(y_ner, maxlen=maxlen, padding=\"post\", value=tag2idx_ner[\"O\"])\n", "y_srl = pad_sequences(y_srl, maxlen=maxlen, padding=\"post\", value=tag2idx_srl[\"O\"])\n", "\n", "y_ner = [to_categorical(seq, num_classes=len(tag2idx_ner)) for seq in y_ner]\n", "y_srl = [to_categorical(seq, num_classes=len(tag2idx_srl)) for seq in y_srl]\n", "\n", "X = np.array(X)\n", "y_ner = np.array(y_ner)\n", "y_srl = np.array(y_srl)" ] }, { "cell_type": "code", "execution_count": 45, "id": "fe219c96", "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_ner_train, y_ner_test, y_srl_train, y_srl_test = train_test_split(\n", " X, y_ner, y_srl, \n", " test_size=0.20, \n", " random_state=42,\n", " shuffle=True \n", ")" ] }, { "cell_type": "code", "execution_count": 46, "id": "7a9636b6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Model: \"functional_4\"\n",
"
\n"
],
"text/plain": [
"\u001b[1mModel: \"functional_4\"\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n", "┃ Layer (type) ┃ Output Shape ┃ Param # ┃ Connected to ┃\n", "┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n", "│ input_layer_4 │ (None, 50) │ 0 │ - │\n", "│ (InputLayer) │ │ │ │\n", "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", "│ embedding_4 │ (None, 50, 64) │ 46,016 │ input_layer_4[0]… │\n", "│ (Embedding) │ │ │ │\n", "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", "│ bidirectional_4 │ (None, 50, 128) │ 66,048 │ embedding_4[0][0] │\n", "│ (Bidirectional) │ │ │ │\n", "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", "│ ner_output │ (None, 50, 25) │ 3,225 │ bidirectional_4[… │\n", "│ (TimeDistributed) │ │ │ │\n", "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", "│ srl_output │ (None, 50, 18) │ 2,322 │ bidirectional_4[… │\n", "│ (TimeDistributed) │ │ │ │\n", "└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n", "\n" ], "text/plain": [ "┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n", "┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mConnected to \u001b[0m\u001b[1m \u001b[0m┃\n", "┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n", "│ input_layer_4 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n", "│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n", "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", "│ embedding_4 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m46,016\u001b[0m │ 
input_layer_4[\u001b[38;5;34m0\u001b[0m]… │\n", "│ (\u001b[38;5;33mEmbedding\u001b[0m) │ │ │ │\n", "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", "│ bidirectional_4 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m66,048\u001b[0m │ embedding_4[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n", "│ (\u001b[38;5;33mBidirectional\u001b[0m) │ │ │ │\n", "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", "│ ner_output │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m25\u001b[0m) │ \u001b[38;5;34m3,225\u001b[0m │ bidirectional_4[\u001b[38;5;34m…\u001b[0m │\n", "│ (\u001b[38;5;33mTimeDistributed\u001b[0m) │ │ │ │\n", "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", "│ srl_output │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m18\u001b[0m) │ \u001b[38;5;34m2,322\u001b[0m │ bidirectional_4[\u001b[38;5;34m…\u001b[0m │\n", "│ (\u001b[38;5;33mTimeDistributed\u001b[0m) │ │ │ │\n", "└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
Total params: 117,611 (459.42 KB)\n", "\n" ], "text/plain": [ "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m117,611\u001b[0m (459.42 KB)\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
Trainable params: 117,611 (459.42 KB)\n", "\n" ], "text/plain": [ "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m117,611\u001b[0m (459.42 KB)\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
Non-trainable params: 0 (0.00 B)\n", "\n" ], "text/plain": [ "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/10\n", "\u001b[1m64/64\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 31ms/step - loss: 3.2183 - ner_output_accuracy: 0.8700 - ner_output_loss: 1.5503 - srl_output_accuracy: 0.7278 - srl_output_loss: 1.6679 - val_loss: 0.7435 - val_ner_output_accuracy: 0.9488 - val_ner_output_loss: 0.2671 - val_srl_output_accuracy: 0.8413 - val_srl_output_loss: 0.4764\n", "Epoch 2/10\n", "\u001b[1m64/64\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 17ms/step - loss: 0.7484 - ner_output_accuracy: 0.9476 - ner_output_loss: 0.2574 - srl_output_accuracy: 0.8292 - srl_output_loss: 0.4911 - val_loss: 0.6928 - val_ner_output_accuracy: 0.9488 - val_ner_output_loss: 0.2566 - val_srl_output_accuracy: 0.8538 - val_srl_output_loss: 0.4362\n", "Epoch 3/10\n", "\u001b[1m64/64\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 17ms/step - loss: 0.7115 - ner_output_accuracy: 0.9501 - ner_output_loss: 0.2363 - srl_output_accuracy: 0.8392 - srl_output_loss: 0.4751 - val_loss: 0.6501 - val_ner_output_accuracy: 0.9488 - val_ner_output_loss: 0.2476 - val_srl_output_accuracy: 0.8556 - val_srl_output_loss: 0.4026\n", "Epoch 4/10\n", "\u001b[1m64/64\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 17ms/step - loss: 0.6596 - ner_output_accuracy: 0.9499 - ner_output_loss: 0.2149 - srl_output_accuracy: 0.8501 - srl_output_loss: 0.4448 - val_loss: 0.6068 - val_ner_output_accuracy: 0.9488 - val_ner_output_loss: 0.2321 - val_srl_output_accuracy: 0.8888 - val_srl_output_loss: 0.3746\n", "Epoch 5/10\n", "\u001b[1m64/64\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m 
\u001b[1m1s\u001b[0m 18ms/step - loss: 0.6708 - ner_output_accuracy: 0.9500 - ner_output_loss: 0.2116 - srl_output_accuracy: 0.8565 - srl_output_loss: 0.4591 - val_loss: 0.5745 - val_ner_output_accuracy: 0.9488 - val_ner_output_loss: 0.2257 - val_srl_output_accuracy: 0.8969 - val_srl_output_loss: 0.3489\n", "Epoch 6/10\n", "\u001b[1m64/64\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 16ms/step - loss: 0.5632 - ner_output_accuracy: 0.9518 - ner_output_loss: 0.1934 - srl_output_accuracy: 0.8837 - srl_output_loss: 0.3702 - val_loss: 0.5507 - val_ner_output_accuracy: 0.9488 - val_ner_output_loss: 0.2125 - val_srl_output_accuracy: 0.8981 - val_srl_output_loss: 0.3382\n", "Epoch 7/10\n", "\u001b[1m64/64\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 16ms/step - loss: 0.5163 - ner_output_accuracy: 0.9542 - ner_output_loss: 0.1730 - srl_output_accuracy: 0.8909 - srl_output_loss: 0.3432 - val_loss: 0.5265 - val_ner_output_accuracy: 0.9494 - val_ner_output_loss: 0.2101 - val_srl_output_accuracy: 0.9031 - val_srl_output_loss: 0.3165\n", "Epoch 8/10\n", "\u001b[1m64/64\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 16ms/step - loss: 0.4597 - ner_output_accuracy: 0.9560 - ner_output_loss: 0.1609 - srl_output_accuracy: 0.9077 - srl_output_loss: 0.2989 - val_loss: 0.5069 - val_ner_output_accuracy: 0.9506 - val_ner_output_loss: 0.1971 - val_srl_output_accuracy: 0.9063 - val_srl_output_loss: 0.3097\n", "Epoch 9/10\n", "\u001b[1m64/64\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 16ms/step - loss: 0.4826 - ner_output_accuracy: 0.9573 - ner_output_loss: 0.1635 - srl_output_accuracy: 0.9079 - srl_output_loss: 0.3192 - val_loss: 0.4902 - val_ner_output_accuracy: 0.9506 - val_ner_output_loss: 0.1912 - val_srl_output_accuracy: 0.9125 - val_srl_output_loss: 0.2990\n", "Epoch 10/10\n", "\u001b[1m64/64\u001b[0m 
\u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 17ms/step - loss: 0.4266 - ner_output_accuracy: 0.9513 - ner_output_loss: 0.1734 - srl_output_accuracy: 0.9242 - srl_output_loss: 0.2531 - val_loss: 0.4749 - val_ner_output_accuracy: 0.9550 - val_ner_output_loss: 0.1855 - val_srl_output_accuracy: 0.9138 - val_srl_output_loss: 0.2895\n" ] } ], "source": [
    "# Seed Python, NumPy and the Keras backend so that weight initialisation and\n",
    "# batch shuffling are repeatable across Restart & Run All.\n",
    "from keras.utils import set_random_seed\n",
    "\n",
    "set_random_seed(42)\n",
    "\n",
    "# Shared encoder: embedding -> BiLSTM, producing one vector per token.\n",
    "# NOTE(review): the embedding does not mask the PAD id and the labels are\n",
    "# padded with \"O\", so padded timesteps count towards loss and accuracy.\n",
    "input_layer = Input(shape=(maxlen,))\n",
    "embed = Embedding(len(word2idx), 64)(input_layer)\n",
    "bilstm = Bidirectional(LSTM(64, return_sequences=True))(embed)\n",
    "\n",
    "# Two task heads on the same BiLSTM states: per-token softmax over NER / SRL tags.\n",
    "ner_output = TimeDistributed(\n",
    "    Dense(len(tag2idx_ner), activation=\"softmax\"), name=\"ner_output\"\n",
    ")(bilstm)\n",
    "srl_output = TimeDistributed(\n",
    "    Dense(len(tag2idx_srl), activation=\"softmax\"), name=\"srl_output\"\n",
    ")(bilstm)\n",
    "\n",
    "model = Model(inputs=input_layer, outputs=[ner_output, srl_output])\n",
    "model.compile(\n",
    "    optimizer=\"adam\",\n",
    "    loss={\n",
    "        \"ner_output\": \"categorical_crossentropy\",\n",
    "        \"srl_output\": \"categorical_crossentropy\",\n",
    "    },\n",
    "    metrics={\n",
    "        \"ner_output\": [CategoricalAccuracy(name=\"accuracy\")],\n",
    "        \"srl_output\": [CategoricalAccuracy(name=\"accuracy\")],\n",
    "    },\n",
    ")\n",
    "\n",
    "model.summary()\n",
    "\n",
    "# NOTE(review): the held-out test split doubles as validation data here, so the\n",
    "# val_* numbers are not an independent estimate if they are used for tuning.\n",
    "# The History object is kept so the training curves can be inspected later.\n",
    "history = model.fit(\n",
    "    X_train, {\"ner_output\": y_ner_train, \"srl_output\": y_srl_train},\n",
    "    validation_data=(X_test, {\"ner_output\": y_ner_test, \"srl_output\": y_srl_test}),\n",
    "    batch_size=2,\n",
    "    epochs=10,\n",
    "    verbose=1\n",
    ")\n",
    "\n",
    "# ---------- 6. Save artifacts ----------\n",
    "# The model plus the word and tag lookup tables written below.\n",
    "model.save(\"multi_task_lstm_ner_srl_model.keras\")\n",
    "with open(\"word2idx.pkl\", \"wb\") as f:\n",
    "    pickle.dump(word2idx, f)\n",
    "with open(\"tag2idx_ner.pkl\", \"wb\") as f:\n",
    "    pickle.dump(tag2idx_ner, f)\n",
    "with open(\"tag2idx_srl.pkl\", \"wb\") as f:\n",
    "    pickle.dump(tag2idx_srl, f)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "3a55990b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Metrics names: ['loss', 'compile_metrics', 'ner_output_loss', 'srl_output_loss']\n",
      "loss: 0.47491562366485596\n",
      "compile_metrics: 0.18545177578926086\n",
      "ner_output_loss: 0.2894638478755951\n",
      "srl_output_loss: 0.9550000429153442\n",
      "WARNING:tensorflow:5 out of the last 8 calls to