469 lines
27 KiB
Plaintext
469 lines
27 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 118,
|
|
"id": "fb106e20",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"import json, pickle\n",
"\n",
"import numpy as np\n",
"from keras.models import Model\n",
"from keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense\n",
"from keras.preprocessing.sequence import pad_sequences\n",
"from keras.utils import to_categorical\n",
"from seqeval.metrics import classification_report\n",
"from sklearn.metrics import classification_report as token_classification_report\n",
"from sklearn.model_selection import train_test_split"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 119,
|
|
"id": "00347a5f",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Total kalimat: 156\n",
|
|
"Total token: 1850\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
"# Load the token-level dataset: one `token<TAB>ner<TAB>srl` row per token,\n",
"# with a blank line closing each sentence.\n",
"data = []\n",
"\n",
"with open(\"../dataset/dataset_ner_srl.tsv\", encoding=\"utf-8\") as f:\n",
"    tokens, ner_labels, srl_labels = [], [], []\n",
"\n",
"    for line_no, line in enumerate(f, start=1):\n",
"        line = line.strip()\n",
"        if not line:\n",
"            # Blank line -> end of the current sentence.\n",
"            if tokens:\n",
"                data.append({\n",
"                    \"tokens\": tokens,\n",
"                    \"labels_ner\": ner_labels,\n",
"                    \"labels_srl\": srl_labels\n",
"                })\n",
"                tokens, ner_labels, srl_labels = [], [], []\n",
"            continue\n",
"\n",
"        fields = line.split(\"\\t\")\n",
"        if len(fields) != 3:\n",
"            # Fail with the offending line instead of an opaque unpacking error.\n",
"            raise ValueError(\n",
"                f\"line {line_no}: expected 3 tab-separated fields, \"\n",
"                f\"got {len(fields)}: {line!r}\"\n",
"            )\n",
"        token, ner, srl = fields\n",
"        tokens.append(token)\n",
"        ner_labels.append(ner)\n",
"        srl_labels.append(srl)\n",
"\n",
"    # Flush the final sentence when the file does not end with a blank line;\n",
"    # without this the last sentence would be silently dropped.\n",
"    if tokens:\n",
"        data.append({\n",
"            \"tokens\": tokens,\n",
"            \"labels_ner\": ner_labels,\n",
"            \"labels_srl\": srl_labels\n",
"        })\n",
"\n",
"# Tokens are lower-cased for the vocabulary; labels are used as-is.\n",
"sentences = [[tok.lower() for tok in item[\"tokens\"]] for item in data]\n",
"labels_ner = [item[\"labels_ner\"] for item in data]\n",
"labels_srl = [item[\"labels_srl\"] for item in data]\n",
"\n",
"total_kalimat = len(data)\n",
"total_token = sum(len(item[\"tokens\"]) for item in data)\n",
"\n",
"print(\"Total kalimat:\", total_kalimat)\n",
"print(\"Total token:\", total_token)"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 120,
|
|
"id": "ac8eb374",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Vocabulary and tag inventories, sorted so the index assignment is deterministic.\n",
"words = sorted(set(w for s in sentences for w in s))\n",
"ner_tags = sorted(set(t for seq in labels_ner for t in seq))\n",
"srl_tags = sorted(set(t for seq in labels_srl for t in seq))\n",
"\n",
"# Real words start at index 2; 0 and 1 are reserved for padding / unknown words.\n",
"word2idx = dict(zip(words, range(2, len(words) + 2)))\n",
"word2idx.update({\"PAD\": 0, \"UNK\": 1})\n",
"\n",
"# Tag -> index and the inverse index -> tag lookups for both tasks.\n",
"tag2idx_ner = dict(zip(ner_tags, range(len(ner_tags))))\n",
"tag2idx_srl = dict(zip(srl_tags, range(len(srl_tags))))\n",
"idx2tag_ner = dict(enumerate(ner_tags))\n",
"idx2tag_srl = dict(enumerate(srl_tags))"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 121,
|
|
"id": "80356f1f",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Encode tokens and tags as integer ids, pad every sentence to `maxlen`,\n",
"# then one-hot encode the tag ids.\n",
"maxlen = 50\n",
"\n",
"\n",
"def _pad(seqs, fill):\n",
"    \"\"\"Pad each sequence in `seqs` at the end up to `maxlen` using `fill`.\"\"\"\n",
"    # NOTE(review): `truncating` is left at its default; confirm which end of\n",
"    # a sentence longer than `maxlen` gets cut.\n",
"    return pad_sequences(seqs, maxlen=maxlen, padding=\"post\", value=fill)\n",
"\n",
"\n",
"unk_id = word2idx[\"UNK\"]\n",
"word_ids = [[word2idx.get(tok, unk_id) for tok in sent] for sent in sentences]\n",
"ner_ids = [[tag2idx_ner[tag] for tag in seq] for seq in labels_ner]\n",
"srl_ids = [[tag2idx_srl[tag] for tag in seq] for seq in labels_srl]\n",
"\n",
"X = np.array(_pad(word_ids, word2idx[\"PAD\"]))\n",
"\n",
"# Padded label positions receive the \"O\" class, so after one-hot encoding\n",
"# they look exactly like real \"O\" tokens.\n",
"n_ner, n_srl = len(tag2idx_ner), len(tag2idx_srl)\n",
"y_ner = np.array(\n",
"    [to_categorical(row, num_classes=n_ner) for row in _pad(ner_ids, tag2idx_ner[\"O\"])]\n",
")\n",
"y_srl = np.array(\n",
"    [to_categorical(row, num_classes=n_srl) for row in _pad(srl_ids, tag2idx_srl[\"O\"])]\n",
")"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 122,
|
|
"id": "fe219c96",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Hold out 20% of the sentences for testing. The inputs and both label arrays\n",
"# are split together so rows stay aligned; the fixed seed keeps the split\n",
"# reproducible.\n",
"(\n",
"    X_train, X_test,\n",
"    y_ner_train, y_ner_test,\n",
"    y_srl_train, y_srl_test,\n",
") = train_test_split(X, y_ner, y_srl, test_size=0.20, random_state=42, shuffle=True)"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 123,
|
|
"id": "7a9636b6",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">Model: \"functional_13\"</span>\n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[1mModel: \"functional_13\"\u001b[0m\n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
|
|
"┃<span style=\"font-weight: bold\"> Layer (type) </span>┃<span style=\"font-weight: bold\"> Output Shape </span>┃<span style=\"font-weight: bold\"> Param # </span>┃<span style=\"font-weight: bold\"> Connected to </span>┃\n",
|
|
"┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
|
|
"│ input_layer_13 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ - │\n",
|
|
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">InputLayer</span>) │ │ │ │\n",
|
|
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
|
"│ embedding_13 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">64</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">44,544</span> │ input_layer_13[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>… │\n",
|
|
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>) │ │ │ │\n",
|
|
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
|
"│ bidirectional_13 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">128</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">66,048</span> │ embedding_13[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">…</span> │\n",
|
|
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Bidirectional</span>) │ │ │ │\n",
|
|
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
|
"│ ner_output │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">25</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">3,225</span> │ bidirectional_13… │\n",
|
|
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">TimeDistributed</span>) │ │ │ │\n",
|
|
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
|
"│ srl_output │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">50</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">20</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">2,580</span> │ bidirectional_13… │\n",
|
|
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">TimeDistributed</span>) │ │ │ │\n",
|
|
"└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
|
|
"┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mConnected to \u001b[0m\u001b[1m \u001b[0m┃\n",
|
|
"┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
|
|
"│ input_layer_13 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
|
|
"│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n",
|
|
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
|
"│ embedding_13 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m44,544\u001b[0m │ input_layer_13[\u001b[38;5;34m0\u001b[0m… │\n",
|
|
"│ (\u001b[38;5;33mEmbedding\u001b[0m) │ │ │ │\n",
|
|
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
|
"│ bidirectional_13 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m66,048\u001b[0m │ embedding_13[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n",
|
|
"│ (\u001b[38;5;33mBidirectional\u001b[0m) │ │ │ │\n",
|
|
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
|
"│ ner_output │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m25\u001b[0m) │ \u001b[38;5;34m3,225\u001b[0m │ bidirectional_13… │\n",
|
|
"│ (\u001b[38;5;33mTimeDistributed\u001b[0m) │ │ │ │\n",
|
|
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
|
|
"│ srl_output │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m50\u001b[0m, \u001b[38;5;34m20\u001b[0m) │ \u001b[38;5;34m2,580\u001b[0m │ bidirectional_13… │\n",
|
|
"│ (\u001b[38;5;33mTimeDistributed\u001b[0m) │ │ │ │\n",
|
|
"└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Total params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">116,397</span> (454.68 KB)\n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[1m Total params: \u001b[0m\u001b[38;5;34m116,397\u001b[0m (454.68 KB)\n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">116,397</span> (454.68 KB)\n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m116,397\u001b[0m (454.68 KB)\n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Non-trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> (0.00 B)\n",
|
|
"</pre>\n"
|
|
],
|
|
"text/plain": [
|
|
"\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Epoch 1/10\n",
|
|
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 19ms/step - loss: 3.3010 - ner_output_accuracy: 0.8807 - ner_output_loss: 1.5617 - srl_output_accuracy: 0.7456 - srl_output_loss: 1.7393 - val_loss: 0.7284 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2466 - val_srl_output_accuracy: 0.8300 - val_srl_output_loss: 0.4818\n",
|
|
"Epoch 2/10\n",
|
|
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.7355 - ner_output_accuracy: 0.9569 - ner_output_loss: 0.2279 - srl_output_accuracy: 0.8297 - srl_output_loss: 0.5076 - val_loss: 0.6655 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2323 - val_srl_output_accuracy: 0.8506 - val_srl_output_loss: 0.4332\n",
|
|
"Epoch 3/10\n",
|
|
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.7041 - ner_output_accuracy: 0.9522 - ner_output_loss: 0.2219 - srl_output_accuracy: 0.8488 - srl_output_loss: 0.4822 - val_loss: 0.6368 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2232 - val_srl_output_accuracy: 0.8744 - val_srl_output_loss: 0.4135\n",
|
|
"Epoch 4/10\n",
|
|
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.6864 - ner_output_accuracy: 0.9520 - ner_output_loss: 0.2184 - srl_output_accuracy: 0.8548 - srl_output_loss: 0.4680 - val_loss: 0.6078 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2193 - val_srl_output_accuracy: 0.8769 - val_srl_output_loss: 0.3885\n",
|
|
"Epoch 5/10\n",
|
|
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.6304 - ner_output_accuracy: 0.9545 - ner_output_loss: 0.2009 - srl_output_accuracy: 0.8675 - srl_output_loss: 0.4295 - val_loss: 0.5727 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.2015 - val_srl_output_accuracy: 0.8812 - val_srl_output_loss: 0.3711\n",
|
|
"Epoch 6/10\n",
|
|
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 11ms/step - loss: 0.5679 - ner_output_accuracy: 0.9557 - ner_output_loss: 0.1749 - srl_output_accuracy: 0.8783 - srl_output_loss: 0.3930 - val_loss: 0.5471 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.1956 - val_srl_output_accuracy: 0.8831 - val_srl_output_loss: 0.3515\n",
|
|
"Epoch 7/10\n",
|
|
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.5000 - ner_output_accuracy: 0.9587 - ner_output_loss: 0.1634 - srl_output_accuracy: 0.8917 - srl_output_loss: 0.3366 - val_loss: 0.5364 - val_ner_output_accuracy: 0.9513 - val_ner_output_loss: 0.1899 - val_srl_output_accuracy: 0.8850 - val_srl_output_loss: 0.3465\n",
|
|
"Epoch 8/10\n",
|
|
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.5526 - ner_output_accuracy: 0.9541 - ner_output_loss: 0.1791 - srl_output_accuracy: 0.8840 - srl_output_loss: 0.3735 - val_loss: 0.5054 - val_ner_output_accuracy: 0.9519 - val_ner_output_loss: 0.1799 - val_srl_output_accuracy: 0.8963 - val_srl_output_loss: 0.3256\n",
|
|
"Epoch 9/10\n",
|
|
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.5094 - ner_output_accuracy: 0.9561 - ner_output_loss: 0.1701 - srl_output_accuracy: 0.8915 - srl_output_loss: 0.3393 - val_loss: 0.4881 - val_ner_output_accuracy: 0.9512 - val_ner_output_loss: 0.1707 - val_srl_output_accuracy: 0.9013 - val_srl_output_loss: 0.3174\n",
|
|
"Epoch 10/10\n",
|
|
"\u001b[1m62/62\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 10ms/step - loss: 0.4633 - ner_output_accuracy: 0.9524 - ner_output_loss: 0.1675 - srl_output_accuracy: 0.9092 - srl_output_loss: 0.2959 - val_loss: 0.4804 - val_ner_output_accuracy: 0.9531 - val_ner_output_loss: 0.1597 - val_srl_output_accuracy: 0.9050 - val_srl_output_loss: 0.3206\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
"# Shared encoder (embedding -> BiLSTM) feeding two per-token softmax heads.\n",
"# NOTE(review): the Embedding is built without mask_zero, so padded time\n",
"# steps appear to count towards the loss and accuracy reported below - confirm.\n",
"input_layer = Input(shape=(maxlen,))\n",
"embed = Embedding(len(word2idx), 64)(input_layer)\n",
"bilstm = Bidirectional(LSTM(64, return_sequences=True))(embed)\n",
"\n",
"ner_head = Dense(len(tag2idx_ner), activation=\"softmax\")\n",
"ner_output = TimeDistributed(ner_head, name=\"ner_output\")(bilstm)\n",
"srl_head = Dense(len(tag2idx_srl), activation=\"softmax\")\n",
"srl_output = TimeDistributed(srl_head, name=\"srl_output\")(bilstm)\n",
"\n",
"model = Model(inputs=input_layer, outputs=[ner_output, srl_output])\n",
"\n",
"# Both heads use the same loss and metric, keyed by output layer name.\n",
"head_names = (\"ner_output\", \"srl_output\")\n",
"model.compile(\n",
"    optimizer=\"adam\",\n",
"    loss={head: \"categorical_crossentropy\" for head in head_names},\n",
"    metrics={head: \"accuracy\" for head in head_names},\n",
")\n",
"model.summary()\n",
"\n",
"train_targets = {\"ner_output\": y_ner_train, \"srl_output\": y_srl_train}\n",
"test_targets = {\"ner_output\": y_ner_test, \"srl_output\": y_srl_test}\n",
"model.fit(\n",
"    X_train,\n",
"    train_targets,\n",
"    validation_data=(X_test, test_targets),\n",
"    batch_size=2,\n",
"    epochs=10,\n",
"    verbose=1,\n",
")\n",
"\n",
"# ---------- 6. Save artifacts ----------\n",
"# The lookup tables are saved next to the model file.\n",
"model.save(\"multi_task_lstm_ner_srl_model.keras\")\n",
"artifacts = (\n",
"    (\"word2idx.pkl\", word2idx),\n",
"    (\"tag2idx_ner.pkl\", tag2idx_ner),\n",
"    (\"tag2idx_srl.pkl\", tag2idx_srl),\n",
")\n",
"for filename, mapping in artifacts:\n",
"    with open(filename, \"wb\") as f:\n",
"        pickle.dump(mapping, f)\n"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 124,
|
|
"id": "3a55990b",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"{'loss': 0.48035523295402527, 'compile_metrics': 0.15973526239395142, 'ner_output_loss': 0.32061997056007385, 'srl_output_loss': 0.953125}\n",
|
|
"\n",
|
|
"📊 [NER] Classification Report (test set):\n",
|
|
" precision recall f1-score support\n",
|
|
"\n",
|
|
" DATE 0.25 0.12 0.17 8\n",
|
|
" EVENT 0.00 0.00 0.00 1\n",
|
|
" LOC 0.50 0.04 0.07 28\n",
|
|
" ORG 0.00 0.00 0.00 4\n",
|
|
" PER 0.00 0.00 0.00 2\n",
|
|
" TIME 0.20 0.10 0.13 10\n",
|
|
"\n",
|
|
" micro avg 0.27 0.06 0.09 53\n",
|
|
" macro avg 0.16 0.04 0.06 53\n",
|
|
"weighted avg 0.34 0.06 0.09 53\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
"# Evaluation on the held-out test set.\n",
"\n",
"# return_dict=True lets Keras label every value itself; zipping the result\n",
"# list with model.metrics_names paired values with the wrong names.\n",
"results = model.evaluate(\n",
"    X_test,\n",
"    {\"ner_output\": y_ner_test, \"srl_output\": y_srl_test},\n",
"    verbose=0,\n",
"    return_dict=True,\n",
")\n",
"print(results)\n",
"\n",
"\n",
"def decode(pred, true, idx2tag, mask=None):\n",
"    \"\"\"Convert per-token score / one-hot arrays into tag sequences.\n",
"\n",
"    Args:\n",
"        pred: predicted class scores, indexed [sentence][token][class].\n",
"        true: one-hot gold labels laid out like `pred`.\n",
"        idx2tag: mapping from class index to tag string.\n",
"        mask: optional boolean array indexed [sentence][token]. Positions\n",
"            that are False are treated as padding and dropped. When omitted,\n",
"            a token is dropped only if its gold row sums to zero.\n",
"\n",
"    Returns:\n",
"        (true_tags, pred_tags): two lists holding one tag list per sentence.\n",
"    \"\"\"\n",
"    out_true, out_pred = [], []\n",
"    for row, (p_seq, t_seq) in enumerate(zip(pred, true)):\n",
"        t_labels, p_labels = [], []\n",
"        for col, (p_tok, t_tok) in enumerate(zip(p_seq, t_seq)):\n",
"            if mask is None:\n",
"                is_token = t_tok.sum() != 0\n",
"            else:\n",
"                is_token = bool(mask[row][col])\n",
"            if not is_token:  # padding position -> skip\n",
"                continue\n",
"            t_labels.append(idx2tag[t_tok.argmax()])\n",
"            p_labels.append(idx2tag[p_tok.argmax()])\n",
"        out_true.append(t_labels)\n",
"        out_pred.append(p_labels)\n",
"    return out_true, out_pred\n",
"\n",
"\n",
"# Labels were padded with the \"O\" class, so padded rows are valid one-hot\n",
"# vectors and never sum to zero; real tokens are identified from the inputs.\n",
"test_mask = X_test != word2idx[\"PAD\"]\n",
"\n",
"# Predict on the test set only.\n",
"y_pred_ner, y_pred_srl = model.predict(X_test, verbose=0)\n",
"\n",
"true_ner, pred_ner = decode(y_pred_ner, y_ner_test, idx2tag_ner, mask=test_mask)\n",
"\n",
"print(\"\\n📊 [NER] Classification Report (test set):\")\n",
"print(classification_report(true_ner, pred_ner, digits=2))"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 125,
|
|
"id": "547d1533",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"{0: 'ARG0', 1: 'ARG1', 2: 'ARG2', 3: 'ARG3', 4: 'ARGM-BNF', 5: 'ARGM-CAU', 6: 'ARGM-COM', 7: 'ARGM-FRQ', 8: 'ARGM-LOC', 9: 'ARGM-MNR', 10: 'ARGM-MOD', 11: 'ARGM-NEG', 12: 'ARGM-PNC', 13: 'ARGM-PRD', 14: 'ARGM-PRP', 15: 'ARGM-SRC', 16: 'ARGM-TMP', 17: 'O', 18: 'R-ARG1', 19: 'V'}\n",
|
|
"\n",
|
|
"📊 [SRL] Classification Report (test set):\n",
|
|
" precision recall f1-score support\n",
|
|
"\n",
|
|
" CAU 0.00 0.00 0.00 1\n",
|
|
" FRQ 0.00 0.00 0.00 1\n",
|
|
" LOC 0.36 0.40 0.38 10\n",
|
|
" MNR 0.00 0.00 0.00 4\n",
|
|
" PNC 0.00 0.00 0.00 1\n",
|
|
" PRP 0.00 0.00 0.00 1\n",
|
|
" RG0 0.31 0.21 0.25 19\n",
|
|
" RG1 0.21 0.15 0.17 46\n",
|
|
" RG2 0.19 0.40 0.26 10\n",
|
|
" TMP 0.41 0.53 0.46 17\n",
|
|
" _ 0.10 0.06 0.07 33\n",
|
|
"\n",
|
|
" micro avg 0.25 0.21 0.23 143\n",
|
|
" macro avg 0.14 0.16 0.15 143\n",
|
|
"weighted avg 0.22 0.21 0.21 143\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
"# Token-level SRL evaluation on the test set.\n",
"#\n",
"# The SRL tags (e.g. \"ARG0\", \"ARGM-LOC\", \"V\") are plain role labels without\n",
"# B-/I- prefixes. seqeval reads the first character of each tag as a chunk\n",
"# prefix, so it reports truncated class names such as \"RG0\" or \"_\". A\n",
"# token-level report keeps the real label names.\n",
"pad_mask = X_test != word2idx[\"PAD\"]  # True only for real (non-padding) tokens\n",
"true_ids = y_srl_test.argmax(axis=-1)[pad_mask]\n",
"pred_ids = y_pred_srl.argmax(axis=-1)[pad_mask]\n",
"\n",
"true_srl = [idx2tag_srl[int(i)] for i in true_ids]\n",
"pred_srl = [idx2tag_srl[int(i)] for i in pred_ids]\n",
"\n",
"print(idx2tag_srl)\n",
"print(\"\\n📊 [SRL] Classification Report (test set):\")\n",
"# zero_division=0 reports 0.0 for labels that are never predicted.\n",
"print(token_classification_report(true_srl, pred_srl, digits=2, zero_division=0))"
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "myenv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.16"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|