fix: adjust file management and the dataset
This commit is contained in:
parent 647505a8e2
commit 20ef6aeaed
.gitignore
@@ -1,2 +1,7 @@
 myenv
 .keras
+# Ignore any directory or file whose name contains 'keras'
+*keras*
+**/*keras*
+# Ignore all files with the .pkl extension
+*.pkl
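As a quick sanity check on the new ignore rules, here is a minimal sketch (not part of the commit) that previews the patterns with Python's fnmatch. Note that fnmatch does not treat '/' specially the way gitignore does, so `git check-ignore -v <path>` remains the authoritative test; the paths below are illustrative.

# Rough preview of which paths the new ignore patterns would catch.
from fnmatch import fnmatch

patterns = ["*keras*", "**/*keras*", "*.pkl"]
paths = ["NER/tag2idx.pkl", ".keras/keras.json", "model.keras", "train.py"]

for p in paths:
    ignored = any(fnmatch(p, pat) for pat in patterns)
    print(f"{p:20} ignored={ignored}")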
BIN NER/tag2idx.pkl (binary file not shown)
BIN NER/word2idx.pkl (binary file not shown)
File diff suppressed because it is too large
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 4,
    "id": "9bf2159a",
    "metadata": {},
    "outputs": [
@@ -10,18 +10,18 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "2025-04-29 19:13:19.968628: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
-     "2025-04-29 19:13:19.975821: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
-     "2025-04-29 19:13:20.061422: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
-     "2025-04-29 19:13:20.109564: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
+     "2025-05-02 15:16:40.916818: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
+     "2025-05-02 15:16:40.923426: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
+     "2025-05-02 15:16:40.983217: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
+     "2025-05-02 15:16:41.024477: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
      "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
-     "E0000 00:00:1745928800.155971 272184 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
-     "E0000 00:00:1745928800.168166 272184 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
-     "W0000 00:00:1745928800.263286 272184 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
-     "W0000 00:00:1745928800.263312 272184 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
-     "W0000 00:00:1745928800.263313 272184 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
-     "W0000 00:00:1745928800.263314 272184 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
-     "2025-04-29 19:13:20.274608: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
+     "E0000 00:00:1746173801.069646 9825 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
+     "E0000 00:00:1746173801.081087 9825 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
+     "W0000 00:00:1746173801.169376 9825 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
+     "W0000 00:00:1746173801.169393 9825 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
+     "W0000 00:00:1746173801.169395 9825 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
+     "W0000 00:00:1746173801.169396 9825 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
+     "2025-05-02 15:16:41.179508: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
     ]
    }
@@ -51,7 +51,7 @@
    },
    {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 5,
    "id": "50118278",
    "metadata": {},
    "outputs": [
@@ -60,9 +60,9 @@
     "output_type": "stream",
     "text": [
      "\n",
-     " Jumlah data valid: 287 / 287\n",
+     " Jumlah data valid: 321 / 321\n",
      " Jumlah data tidak valid: 0\n",
-     "Counter({'ftb': 202, 'tof': 45, 'none': 40})\n"
+     "Counter({'ftb': 235, 'tof': 45, 'none': 41})\n"
     ]
    }
   ],
@@ -450,7 +450,7 @@
    },
    {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "id": "06fd86c7",
    "metadata": {},
    "outputs": [
@@ -501,9 +501,7 @@
    "print(\"\\n=== Akurasi Detail ===\")\n",
    "print(f\"Question Accuracy (Token-level): {acc_q:.4f}\")\n",
    "print(f\"Answer Accuracy (Token-level) : {acc_a:.4f}\")\n",
-   "print(f\"Type Accuracy (Class-level) : {np.mean(y_true_type == y_pred_type):.2f}\")\n",
-   "# print(\"\\n=== Classification Report (TYPE) ===\")\n",
-   "# print(report_type)"
+   "print(f\"Type Accuracy (Class-level) : {np.mean(y_true_type == y_pred_type):.2f}\")"
    ]
   },
   {
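The surviving print line relies on a standard NumPy idiom: comparing two label arrays elementwise yields a boolean array whose mean is the class-level accuracy. A self-contained illustration with made-up values:

import numpy as np

# (y_true == y_pred) is a boolean array; its mean is the fraction of
# matching entries, i.e. class-level accuracy. Values here are invented.
y_true_type = np.array(["ftb", "tof", "none", "ftb"])
y_pred_type = np.array(["ftb", "tof", "ftb", "ftb"])
print(f"Type Accuracy (Class-level) : {np.mean(y_true_type == y_pred_type):.2f}")  # 0.75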
File diff suppressed because it is too large (3 files)
Binary files not shown (3 files)
New image added: 36 KiB
New image added: 61 KiB
@@ -0,0 +1,152 @@
# ner_srl_multitask.py
# ----------------------------------------------------------
# Train a multi-task (Bi)LSTM that predicts NER + SRL tags
# ----------------------------------------------------------
import json, numpy as np, tensorflow as tf
from tensorflow.keras.layers import (Input, Embedding, LSTM, Bidirectional,
                                     TimeDistributed, Dense)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from seqeval.metrics import classification_report

# ----------------------------------------------------------
# 1. Load and prepare data
# ----------------------------------------------------------
DATA = json.load(open("../dataset/dataset_ner_srl.json", "r", encoding="utf8"))

# --- token vocabulary -------------------------------------------------
vocab = {"PAD": 0, "UNK": 1}
for sample in DATA:
    for tok in sample["tokens"]:
        vocab.setdefault(tok.lower(), len(vocab))

# --- label maps -------------------------------------------------------
def build_label_map(key):
    tags = {"PAD": 0}  # keep 0 for padding
    for s in DATA:
        for t in s[key]:
            tags.setdefault(t, len(tags))
    return tags

ner2idx = build_label_map("labels_ner")
srl2idx = build_label_map("labels_srl")
idx2ner = {i: t for t, i in ner2idx.items()}
idx2srl = {i: t for t, i in srl2idx.items()}

# --- sequences --------------------------------------------------------
MAXLEN = max(len(x["tokens"]) for x in DATA)

X = [[vocab.get(tok.lower(), vocab["UNK"]) for tok in s["tokens"]]
     for s in DATA]
y_ner = [[ner2idx[t] for t in s["labels_ner"]]
         for s in DATA]
y_srl = [[srl2idx[t] for t in s["labels_srl"]]
         for s in DATA]

X = pad_sequences(X, maxlen=MAXLEN, padding="post", value=vocab["PAD"])
y_ner = pad_sequences(y_ner, maxlen=MAXLEN, padding="post", value=ner2idx["PAD"])
y_srl = pad_sequences(y_srl, maxlen=MAXLEN, padding="post", value=srl2idx["PAD"])

# --- one-hot for softmax ---------------------------------------------
y_ner = to_categorical(y_ner, num_classes=len(ner2idx))
y_srl = to_categorical(y_srl, num_classes=len(srl2idx))

# ----------------------------------------------------------
# 2. Train / validation split
# ----------------------------------------------------------
# *All* arrays must be passed to train_test_split in one call so they
# stay aligned. The return order is train, test for each array.
X_tr, X_val, y_tr_ner, y_val_ner, y_tr_srl, y_val_srl = train_test_split(
    X, y_ner, y_srl, test_size=0.15, random_state=42
)

# ----------------------------------------------------------
# 3. Model definition
# ----------------------------------------------------------
EMB_DIM = 128
LSTM_UNITS = 128

inp = Input(shape=(MAXLEN,))
emb = Embedding(len(vocab), EMB_DIM, mask_zero=True)(inp)
bilstm = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(emb)

ner_out = TimeDistributed(
    Dense(len(ner2idx), activation="softmax"), name="ner")(bilstm)
srl_out = TimeDistributed(
    Dense(len(srl2idx), activation="softmax"), name="srl")(bilstm)

model = Model(inp, [ner_out, srl_out])
model.compile(
    optimizer="adam",
    loss={"ner": "categorical_crossentropy",
          "srl": "categorical_crossentropy"},
    metrics={"ner": "accuracy",
             "srl": "accuracy"}
)
model.summary()

# ----------------------------------------------------------
# 4. Train
# ----------------------------------------------------------
history = model.fit(
    X_tr,
    {"ner": y_tr_ner, "srl": y_tr_srl},
    validation_data=(X_val, {"ner": y_val_ner, "srl": y_val_srl}),
    epochs=15,
    batch_size=32,
    verbose=2,
)

# ----------------------------------------------------------
# 5. Helper: decode with a mask (so lengths always match)
# ----------------------------------------------------------
def decode(pred, idx2tag, mask):
    """
    pred : [n, MAXLEN, n_tags] (one-hot or probabilities)
    mask : [n, MAXLEN]         (True for real tokens, False for PAD)
    """
    out = []
    for seq, m in zip(pred, mask):
        tags = [idx2tag[np.argmax(tok)] for tok, keep in zip(seq, m) if keep]
        out.append(tags)
    return out

# ----------------------------------------------------------
# 6. Evaluation
# ----------------------------------------------------------
y_pred_ner, y_pred_srl = model.predict(X_val, verbose=0)

mask_val = (X_val != vocab["PAD"])  # True for real tokens

true_ner = decode(y_val_ner, idx2ner, mask_val)
pred_ner = decode(y_pred_ner, idx2ner, mask_val)
true_srl = decode(y_val_srl, idx2srl, mask_val)
pred_srl = decode(y_pred_srl, idx2srl, mask_val)

print("\n📊 NER report")
print(classification_report(true_ner, pred_ner))

print("\n📊 SRL report")
print(classification_report(true_srl, pred_srl))

# # ----------------------------------------------------------
# # 7. Quick inference function
# # ----------------------------------------------------------
# def predict_sentence(sentence: str):
#     tokens = sentence.strip().split()
#     ids = [vocab.get(w.lower(), vocab["UNK"]) for w in tokens]
#     ids = pad_sequences([ids], maxlen=MAXLEN, padding="post",
#                         value=vocab["PAD"])
#     mask = (ids != vocab["PAD"])
#     p_ner, p_srl = model.predict(ids, verbose=0)
#     ner_tags = decode(p_ner, idx2ner, mask)[0]
#     srl_tags = decode(p_srl, idx2srl, mask)[0]
#     return list(zip(tokens, ner_tags, srl_tags))

# # ---- demo ------------------------------------------------
# if __name__ == "__main__":
#     print("\n🔍 Demo:")
#     for tok, ner, srl in predict_sentence(
#             "Keanekaragaman hayati Indonesia sangat dipengaruhi faktor iklim."):
#         print(f"{tok:15} {ner:10} {srl}")
File diff suppressed because one or more lines are too long (2 files)
@@ -0,0 +1,107 @@
import json, pickle
import numpy as np
from keras.models import Model
from keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from seqeval.metrics import classification_report

# ---------- 1. Load the data ----------
with open("dataset/dataset_ner_srl.json", encoding="utf-8") as f:
    data = json.load(f)

sentences = [[tok.lower() for tok in item["tokens"]] for item in data]
labels_ner = [item["labels_ner"] for item in data]
labels_srl = [item["labels_srl"] for item in data]

for i, label_seq in enumerate(labels_ner):
    if "V" in label_seq:
        print(f"Label 'V' found at index {i}: {label_seq}")

# ---------- 2. Build the vocab & label maps ----------
words = sorted({w for s in sentences for w in s})
ner_tags = sorted({t for seq in labels_ner for t in seq})
srl_tags = sorted({t for seq in labels_srl for t in seq})

word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["PAD"], word2idx["UNK"] = 0, 1

tag2idx_ner = {t: i for i, t in enumerate(ner_tags)}
tag2idx_srl = {t: i for i, t in enumerate(srl_tags)}
idx2tag_ner = {i: t for t, i in tag2idx_ner.items()}
idx2tag_srl = {i: t for t, i in tag2idx_srl.items()}

# ---------- 3. Encode tokens & labels ----------
X = [[word2idx.get(w, word2idx["UNK"]) for w in s] for s in sentences]
y_ner = [[tag2idx_ner[t] for t in seq] for seq in labels_ner]
y_srl = [[tag2idx_srl[t] for t in seq] for seq in labels_srl]

maxlen = max(len(seq) for seq in X)

X = pad_sequences(X, maxlen=maxlen, padding="post", value=word2idx["PAD"])
y_ner = pad_sequences(y_ner, maxlen=maxlen, padding="post", value=tag2idx_ner["O"])
y_srl = pad_sequences(y_srl, maxlen=maxlen, padding="post", value=tag2idx_srl["O"])

y_ner = [to_categorical(seq, num_classes=len(tag2idx_ner)) for seq in y_ner]
y_srl = [to_categorical(seq, num_classes=len(tag2idx_srl)) for seq in y_srl]

# cast to np.array to keep Keras happy
X = np.array(X)
y_ner = np.array(y_ner)
y_srl = np.array(y_srl)

# ---------- 4. Multi-task BiLSTM architecture ----------
input_layer = Input(shape=(maxlen,))
embed = Embedding(len(word2idx), 64)(input_layer)
bilstm = Bidirectional(LSTM(64, return_sequences=True))(embed)

ner_output = TimeDistributed(
    Dense(len(tag2idx_ner), activation="softmax"), name="ner_output"
)(bilstm)
srl_output = TimeDistributed(
    Dense(len(tag2idx_srl), activation="softmax"), name="srl_output"
)(bilstm)

model = Model(inputs=input_layer, outputs=[ner_output, srl_output])
model.compile(
    optimizer="adam",
    loss={
        "ner_output": "categorical_crossentropy",
        "srl_output": "categorical_crossentropy",
    },
    metrics={"ner_output": "accuracy", "srl_output": "accuracy"},
)
model.summary()

# ---------- 5. Training ----------
model.fit(
    X, {"ner_output": y_ner, "srl_output": y_srl}, batch_size=2, epochs=10, verbose=1
)

# ---------- 6. Save the artifacts ----------
model.save("NER_SRL/multi_task_bilstm_model.keras")
with open("NER_SRL/word2idx.pkl", "wb") as f:
    pickle.dump(word2idx, f)
with open("NER_SRL/tag2idx_ner.pkl", "wb") as f:
    pickle.dump(tag2idx_ner, f)
with open("NER_SRL/tag2idx_srl.pkl", "wb") as f:
    pickle.dump(tag2idx_srl, f)

# ---------- 7. Evaluation ----------
y_pred_ner, y_pred_srl = model.predict(X, verbose=0)


def decode(pred, true, idx2tag):
    true_tags = [[idx2tag[np.argmax(tok)] for tok in seq] for seq in true]
    pred_tags = [[idx2tag[np.argmax(tok)] for tok in seq] for seq in pred]
    return true_tags, pred_tags


true_ner, pred_ner = decode(y_pred_ner, y_ner, idx2tag_ner)
true_srl, pred_srl = decode(y_pred_srl, y_srl, idx2tag_srl)

print("\n📊 [NER] Classification Report:")
print(classification_report(true_ner, pred_ner))

print("\n📊 [SRL] Classification Report:")
print(classification_report(true_srl, pred_srl))
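For completeness, a minimal inference sketch built on the artifacts this script saves. The model path, pickle names, and the need to reuse the training-time maxlen are taken from the code above; tag_sentence itself is a hypothetical helper, not part of the commit:

import pickle
import numpy as np
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

# Load the model and lookup tables saved by the training script.
model = load_model("NER_SRL/multi_task_bilstm_model.keras")
with open("NER_SRL/word2idx.pkl", "rb") as f:
    word2idx = pickle.load(f)
with open("NER_SRL/tag2idx_ner.pkl", "rb") as f:
    tag2idx_ner = pickle.load(f)
with open("NER_SRL/tag2idx_srl.pkl", "rb") as f:
    tag2idx_srl = pickle.load(f)

idx2tag_ner = {i: t for t, i in tag2idx_ner.items()}
idx2tag_srl = {i: t for t, i in tag2idx_srl.items()}

def tag_sentence(sentence, maxlen):
    # maxlen must equal the padding length used at training time.
    tokens = sentence.lower().split()
    ids = pad_sequences(
        [[word2idx.get(w, word2idx["UNK"]) for w in tokens]],
        maxlen=maxlen, padding="post", value=word2idx["PAD"],
    )
    p_ner, p_srl = model.predict(ids, verbose=0)
    # Argmax over the tag dimension, trimmed back to the real token count.
    ner = [idx2tag_ner[int(np.argmax(v))] for v in p_ner[0][: len(tokens)]]
    srl = [idx2tag_srl[int(np.argmax(v))] for v in p_srl[0][: len(tokens)]]
    return list(zip(tokens, ner, srl))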
BIN tokenizer.pkl (binary file not shown)