fix: adjust NER and SRL model training with LSTM

This commit is contained in:
akhdanre 2025-05-09 22:52:15 +07:00
parent 6318d222e7
commit 647505a8e2
14 changed files with 678 additions and 3883 deletions

BIN
NER_SRL/accuracy_plot.png (new file; binary not shown; 65 KiB)

File diff suppressed because one or more lines are too long

BIN
NER_SRL/loss_plot.png (new file; binary not shown; 63 KiB)

View File

@@ -1,152 +0,0 @@
# ner_srl_multitask.py
# ----------------------------------------------------------
# Train a multitask (Bi)LSTM that predicts NER + SRL tags
# ----------------------------------------------------------
import json, numpy as np, tensorflow as tf
from tensorflow.keras.layers import (Input, Embedding, LSTM, Bidirectional,
                                     TimeDistributed, Dense)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from seqeval.metrics import classification_report
# ----------------------------------------------------------
# 1. Load and prepare data
# ----------------------------------------------------------
DATA = json.load(open("../dataset/dataset_ner_srl.json", "r", encoding="utf8"))
# --- token vocabulary -------------------------------------------------
vocab = {"PAD": 0, "UNK": 1}
for sample in DATA:
    for tok in sample["tokens"]:
        vocab.setdefault(tok.lower(), len(vocab))
# --- label maps -------------------------------------------------------
def build_label_map(key):
    tags = {"PAD": 0}  # keep 0 for padding
    for s in DATA:
        for t in s[key]:
            tags.setdefault(t, len(tags))
    return tags
ner2idx = build_label_map("labels_ner")
srl2idx = build_label_map("labels_srl")
idx2ner = {i: t for t, i in ner2idx.items()}
idx2srl = {i: t for t, i in srl2idx.items()}
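# PAD sits at index 0 in the vocab and in both tag maps: input id 0 is what
# mask_zero=True (below) treats as padding, and a dedicated PAD tag keeps the
# one-hot targets aligned with the padded inputs.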
# --- sequences --------------------------------------------------------
MAXLEN = max(len(x["tokens"]) for x in DATA)
X = [[vocab.get(tok.lower(), vocab["UNK"]) for tok in s["tokens"]]
     for s in DATA]
y_ner = [[ner2idx[t] for t in s["labels_ner"]]
         for s in DATA]
y_srl = [[srl2idx[t] for t in s["labels_srl"]]
         for s in DATA]
X = pad_sequences(X, maxlen=MAXLEN, padding="post", value=vocab["PAD"])
y_ner = pad_sequences(y_ner, maxlen=MAXLEN, padding="post", value=ner2idx["PAD"])
y_srl = pad_sequences(y_srl, maxlen=MAXLEN, padding="post", value=srl2idx["PAD"])
# --- one-hot for softmax ----------------------------------------------
y_ner = to_categorical(y_ner, num_classes=len(ner2idx))
y_srl = to_categorical(y_srl, num_classes=len(srl2idx))
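# Shapes after one-hot encoding, which is what the softmax heads below expect:
# X -> (n_samples, MAXLEN), y_ner -> (n_samples, MAXLEN, len(ner2idx)),
# y_srl -> (n_samples, MAXLEN, len(srl2idx)).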
# ----------------------------------------------------------
# 2. Train / validation split
# ----------------------------------------------------------
# *All* arrays must be passed to train_test_split in one call so they
# stay aligned. Order of return = train, test for each array.
X_tr, X_val, y_tr_ner, y_val_ner, y_tr_srl, y_val_srl = train_test_split(
    X, y_ner, y_srl, test_size=0.15, random_state=42
)
# ----------------------------------------------------------
# 3. Model definition
# ----------------------------------------------------------
EMB_DIM = 128
LSTM_UNITS = 128
inp = Input(shape=(MAXLEN,))
emb = Embedding(len(vocab), EMB_DIM, mask_zero=True)(inp)
bilstm = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(emb)
ner_out = TimeDistributed(
    Dense(len(ner2idx), activation="softmax"), name="ner")(bilstm)
srl_out = TimeDistributed(
    Dense(len(srl2idx), activation="softmax"), name="srl")(bilstm)
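# Because the Embedding sets mask_zero=True, Keras propagates a padding mask
# through the BiLSTM and both TimeDistributed heads, so PAD timesteps are
# ignored by the losses and accuracy metrics.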
model = Model(inp, [ner_out, srl_out])
model.compile(
    optimizer="adam",
    loss={"ner": "categorical_crossentropy",
          "srl": "categorical_crossentropy"},
    metrics={"ner": "accuracy",
             "srl": "accuracy"},
)
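# The total loss is the unweighted sum of the two task losses; if one task
# needs more emphasis, compile with e.g. loss_weights={"ner": 1.0, "srl": 0.5}.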
model.summary()
# ----------------------------------------------------------
# 4. Train
# ----------------------------------------------------------
history = model.fit(
    X_tr,
    {"ner": y_tr_ner, "srl": y_tr_srl},
    validation_data=(X_val, {"ner": y_val_ner, "srl": y_val_srl}),
    epochs=15,
    batch_size=32,
    verbose=2,
)
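# Sketch of how the accuracy_plot.png / loss_plot.png added in this commit
# could be generated from `history` (the key names assume TF2's
# "<output>_accuracy" convention; adjust to what history.history contains):
#   import matplotlib.pyplot as plt
#   for key in ("ner_accuracy", "val_ner_accuracy",
#               "srl_accuracy", "val_srl_accuracy"):
#       plt.plot(history.history[key], label=key)
#   plt.xlabel("epoch"); plt.legend(); plt.savefig("accuracy_plot.png")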
# ----------------------------------------------------------
# 5. Helper: decode with a mask (so lens always match)
# ----------------------------------------------------------
def decode(pred, idx2tag, mask):
    """
    pred : [n, MAXLEN, n_tags]  (one-hot or probabilities)
    mask : [n, MAXLEN]          (True for real tokens, False for PAD)
    """
    out = []
    for seq, m in zip(pred, mask):
        tags = [idx2tag[np.argmax(tok)] for tok, keep in zip(seq, m) if keep]
        out.append(tags)
    return out
# ----------------------------------------------------------
# 6. Evaluation
# ----------------------------------------------------------
y_pred_ner, y_pred_srl = model.predict(X_val, verbose=0)
mask_val = (X_val != vocab["PAD"]) # True for real tokens
true_ner = decode(y_val_ner, idx2ner, mask_val)
pred_ner = decode(y_pred_ner, idx2ner, mask_val)
true_srl = decode(y_val_srl, idx2srl, mask_val)
pred_srl = decode(y_pred_srl, idx2srl, mask_val)
print("\n📊 NER report")
print(classification_report(true_ner, pred_ner))
print("\n📊 SRL report")
print(classification_report(true_srl, pred_srl))
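# Note: seqeval scores at the entity/argument level (exact span and type must
# match), so these reports are stricter than the per-token accuracies above.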
# # ----------------------------------------------------------
# # 7. Quick inference function
# # ----------------------------------------------------------
# def predict_sentence(sentence: str):
#     tokens = sentence.strip().split()
#     ids = [vocab.get(w.lower(), vocab["UNK"]) for w in tokens]
#     ids = pad_sequences([ids], maxlen=MAXLEN, padding="post",
#                         value=vocab["PAD"])
#     mask = (ids != vocab["PAD"])
#     p_ner, p_srl = model.predict(ids, verbose=0)
#     ner_tags = decode(p_ner, idx2ner, mask)[0]
#     srl_tags = decode(p_srl, idx2srl, mask)[0]
#     return list(zip(tokens, ner_tags, srl_tags))
# # ---- demo ------------------------------------------------
# if __name__ == "__main__":
#     print("\n🔍 Demo:")
#     for tok, ner, srl in predict_sentence(
#             "Keanekaragaman hayati Indonesia sangat dipengaruhi faktor iklim."):
#         print(f"{tok:15} {ner:10} {srl}")

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown.

Binary file not shown.

View File

@@ -1,57 +1,52 @@
 import json
 import numpy as np
 import pickle
-from tensorflow.keras.models import load_model  # type: ignore
-from tensorflow.keras.preprocessing.sequence import pad_sequences  # type: ignore
+from keras.models import load_model
+from keras.preprocessing.sequence import pad_sequences

-model = load_model("multi_task_lstm_ner_srl_model_tf.keras")
+# -----------------------------
+# 1. Load artifacts
+# -----------------------------
+MODEL_PATH = "lstm_ner_srl_model.keras"  # ← new file name
+model = load_model(MODEL_PATH)

 with open("word2idx.pkl", "rb") as f:
     word2idx = pickle.load(f)
 with open("tag2idx_ner.pkl", "rb") as f:
     tag2idx_ner = pickle.load(f)
 with open("tag2idx_srl.pkl", "rb") as f:
     tag2idx_srl = pickle.load(f)

 idx2tag_ner = {i: t for t, i in tag2idx_ner.items()}
 idx2tag_srl = {i: t for t, i in tag2idx_srl.items()}

-max = 50
+PAD_WORD_ID = word2idx["PAD"]  # 0
+MAXLEN = model.input_shape[1]  # read straight from the model

-def predict_sentence(sentence):
+# -----------------------------
+# 2. Prediction function
+# -----------------------------
+def predict_sentence(sentence: str) -> dict:
     tokens = sentence.strip().lower().split()
-    print(tokens)
+    seq = [word2idx.get(tok, word2idx["UNK"]) for tok in tokens]
+    seq = pad_sequences([seq], maxlen=MAXLEN, padding="post", value=PAD_WORD_ID)
-    x = [word2idx.get(w.lower(), word2idx["UNK"]) for w in tokens]
-    x = pad_sequences([x], maxlen=50, padding="post", value=word2idx["PAD"])
+    pred_ner_prob, pred_srl_prob = model.predict(seq, verbose=0)
+    pred_ner = pred_ner_prob.argmax(-1)[0][: len(tokens)]
+    pred_srl = pred_srl_prob.argmax(-1)[0][: len(tokens)]
-    preds = model.predict(x)
-    pred_labels_ner = np.argmax(preds[0], axis=-1)[0]
-    pred_labels_srl = np.argmax(preds[1], axis=-1)[0]
-    result = {
+    return {
         "tokens": tokens,
-        "labels_ner": [
-            idx2tag_ner[int(label)] for label in pred_labels_ner[: len(tokens)]
-        ],
-        "labels_srl": [
-            idx2tag_srl[int(label)] for label in pred_labels_srl[: len(tokens)]
-        ],
+        "labels_ner": [idx2tag_ner[int(i)] for i in pred_ner],
+        "labels_srl": [idx2tag_srl[int(i)] for i in pred_srl],
     }
-    return result

+# -----------------------------
+# 3. Demo
+# -----------------------------
 if __name__ == "__main__":
-    try:
-        sentence = "sore ini aku pergi ke indonesia"
-        print(predict_sentence(sentence))
-    except KeyboardInterrupt:
-        print("\n\nSelesai.")
+    sample = "Suku Karo merayakan upacara pada juni"
+    result = predict_sentence(sample)
+    print(json.dumps(result, ensure_ascii=False, indent=2))
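For reference, the demo now prints a JSON object of this shape (the tag values
depend on the trained model's tag inventory, so they are elided here):

{
  "tokens": ["suku", "karo", "merayakan", "upacara", "pada", "juni"],
  "labels_ner": ["...", "...", "...", "...", "...", "..."],
  "labels_srl": ["...", "...", "...", "...", "...", "..."]
}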

View File

@@ -1,107 +0,0 @@
import json, pickle
import numpy as np
from keras.models import Model
from keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from seqeval.metrics import classification_report
# ---------- 1. Load data ----------
with open("dataset/dataset_ner_srl.json", encoding="utf-8") as f:
    data = json.load(f)
sentences = [[tok.lower() for tok in item["tokens"]] for item in data]
labels_ner = [item["labels_ner"] for item in data]
labels_srl = [item["labels_srl"] for item in data]
for i, label_seq in enumerate(labels_ner):
    if "V" in label_seq:
        print(f"Label 'V' found at index {i}: {label_seq}")
# ---------- 2. Build vocab & label maps ----------
words = sorted({w for s in sentences for w in s})
ner_tags = sorted({t for seq in labels_ner for t in seq})
srl_tags = sorted({t for seq in labels_srl for t in seq})
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["PAD"], word2idx["UNK"] = 0, 1
tag2idx_ner = {t: i for i, t in enumerate(ner_tags)}
tag2idx_srl = {t: i for i, t in enumerate(srl_tags)}
idx2tag_ner = {i: t for t, i in tag2idx_ner.items()}
idx2tag_srl = {i: t for t, i in tag2idx_srl.items()}
# ---------- 3. Encode tokens & labels ----------
X = [[word2idx.get(w, word2idx["UNK"]) for w in s] for s in sentences]
y_ner = [[tag2idx_ner[t] for t in seq] for seq in labels_ner]
y_srl = [[tag2idx_srl[t] for t in seq] for seq in labels_srl]
maxlen = max(len(seq) for seq in X)
X = pad_sequences(X, maxlen=maxlen, padding="post", value=word2idx["PAD"])
y_ner = pad_sequences(y_ner, maxlen=maxlen, padding="post", value=tag2idx_ner["O"])
y_srl = pad_sequences(y_srl, maxlen=maxlen, padding="post", value=tag2idx_srl["O"])
y_ner = [to_categorical(seq, num_classes=len(tag2idx_ner)) for seq in y_ner]
y_srl = [to_categorical(seq, num_classes=len(tag2idx_srl)) for seq in y_srl]
# cast to np.array to keep Keras happy
X = np.array(X)
y_ner = np.array(y_ner)
y_srl = np.array(y_srl)
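# Note: padded timesteps are labelled "O" here (there is no dedicated PAD
# tag), so they are included in the training loss and in the evaluation below.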
# ---------- 4. Multitask BiLSTM architecture ----------
input_layer = Input(shape=(maxlen,))
embed = Embedding(len(word2idx), 64)(input_layer)
bilstm = Bidirectional(LSTM(64, return_sequences=True))(embed)
ner_output = TimeDistributed(
    Dense(len(tag2idx_ner), activation="softmax"), name="ner_output"
)(bilstm)
srl_output = TimeDistributed(
    Dense(len(tag2idx_srl), activation="softmax"), name="srl_output"
)(bilstm)
model = Model(inputs=input_layer, outputs=[ner_output, srl_output])
model.compile(
    optimizer="adam",
    loss={
        "ner_output": "categorical_crossentropy",
        "srl_output": "categorical_crossentropy",
    },
    metrics={"ner_output": "accuracy", "srl_output": "accuracy"},
)
model.summary()
# ---------- 5. Training ----------
model.fit(
    X, {"ner_output": y_ner, "srl_output": y_srl}, batch_size=2, epochs=10, verbose=1
)
# ---------- 6. Save artifacts ----------
model.save("NER_SRL/multi_task_bilstm_model.keras")
with open("NER_SRL/word2idx.pkl", "wb") as f:
    pickle.dump(word2idx, f)
with open("NER_SRL/tag2idx_ner.pkl", "wb") as f:
    pickle.dump(tag2idx_ner, f)
with open("NER_SRL/tag2idx_srl.pkl", "wb") as f:
    pickle.dump(tag2idx_srl, f)
# ---------- 7. Evaluation ----------
y_pred_ner, y_pred_srl = model.predict(X, verbose=0)
def decode(pred, true, idx2tag):
    true_tags = [[idx2tag[np.argmax(tok)] for tok in seq] for seq in true]
    pred_tags = [[idx2tag[np.argmax(tok)] for tok in seq] for seq in pred]
    return true_tags, pred_tags
true_ner, pred_ner = decode(y_pred_ner, y_ner, idx2tag_ner)
true_srl, pred_srl = decode(y_pred_srl, y_srl, idx2tag_srl)
print("\n📊 [NER] Classification Report:")
print(classification_report(true_ner, pred_ner))
print("\n📊 [SRL] Classification Report:")
print(classification_report(true_srl, pred_srl))
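# Note: this report is computed on the full training set (no held-out split),
# so the scores will read optimistic compared to validation data.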

Binary file not shown.