fix: adjust NER + SRL LSTM model training
parent 6318d222e7
commit 647505a8e2
[two image files added: 65 KiB and 63 KiB; one file diff suppressed because its lines are too long]
152  NER_SRL/lses.py (deleted)
@@ -1,152 +0,0 @@
# ner_srl_multitask.py
# ----------------------------------------------------------
# Train a multi-task (Bi)LSTM that predicts NER + SRL tags
# ----------------------------------------------------------
import json, numpy as np, tensorflow as tf
from tensorflow.keras.layers import (Input, Embedding, LSTM, Bidirectional,
                                     TimeDistributed, Dense)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from seqeval.metrics import classification_report

# ----------------------------------------------------------
# 1. Load and prepare data
# ----------------------------------------------------------
DATA = json.load(open("../dataset/dataset_ner_srl.json", "r", encoding="utf8"))

# --- token vocabulary -------------------------------------------------
vocab = {"PAD": 0, "UNK": 1}
for sample in DATA:
    for tok in sample["tokens"]:
        vocab.setdefault(tok.lower(), len(vocab))

# --- label maps -------------------------------------------------------
def build_label_map(key):
    tags = {"PAD": 0}  # keep 0 for padding
    for s in DATA:
        for t in s[key]:
            tags.setdefault(t, len(tags))
    return tags

ner2idx = build_label_map("labels_ner")
srl2idx = build_label_map("labels_srl")
idx2ner = {i: t for t, i in ner2idx.items()}
idx2srl = {i: t for t, i in srl2idx.items()}

# --- sequences --------------------------------------------------------
MAXLEN = max(len(x["tokens"]) for x in DATA)

X = [[vocab.get(tok.lower(), vocab["UNK"]) for tok in s["tokens"]]
     for s in DATA]
y_ner = [[ner2idx[t] for t in s["labels_ner"]] for s in DATA]
y_srl = [[srl2idx[t] for t in s["labels_srl"]] for s in DATA]

X = pad_sequences(X, maxlen=MAXLEN, padding="post", value=vocab["PAD"])
y_ner = pad_sequences(y_ner, maxlen=MAXLEN, padding="post", value=ner2idx["PAD"])
y_srl = pad_sequences(y_srl, maxlen=MAXLEN, padding="post", value=srl2idx["PAD"])

# --- one-hot for softmax ----------------------------------------------
y_ner = to_categorical(y_ner, num_classes=len(ner2idx))
y_srl = to_categorical(y_srl, num_classes=len(srl2idx))

# ----------------------------------------------------------
# 2. Train / validation split
# ----------------------------------------------------------
# *All* arrays must be passed to train_test_split in one call so they
# stay aligned. Order of return = train, test for each array.
X_tr, X_val, y_tr_ner, y_val_ner, y_tr_srl, y_val_srl = train_test_split(
    X, y_ner, y_srl, test_size=0.15, random_state=42
)

# ----------------------------------------------------------
# 3. Model definition
# ----------------------------------------------------------
EMB_DIM = 128
LSTM_UNITS = 128

inp = Input(shape=(MAXLEN,))
emb = Embedding(len(vocab), EMB_DIM, mask_zero=True)(inp)
bilstm = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(emb)

ner_out = TimeDistributed(
    Dense(len(ner2idx), activation="softmax"), name="ner")(bilstm)
srl_out = TimeDistributed(
    Dense(len(srl2idx), activation="softmax"), name="srl")(bilstm)

model = Model(inp, [ner_out, srl_out])
model.compile(
    optimizer="adam",
    loss={"ner": "categorical_crossentropy",
          "srl": "categorical_crossentropy"},
    metrics={"ner": "accuracy",
             "srl": "accuracy"},
)
model.summary()

# ----------------------------------------------------------
# 4. Train
# ----------------------------------------------------------
history = model.fit(
    X_tr,
    {"ner": y_tr_ner, "srl": y_tr_srl},
    validation_data=(X_val, {"ner": y_val_ner, "srl": y_val_srl}),
    epochs=15,
    batch_size=32,
    verbose=2,
)

# ----------------------------------------------------------
# 5. Helper: decode with a mask (so lengths always match)
# ----------------------------------------------------------
def decode(pred, idx2tag, mask):
    """
    pred : [n, MAXLEN, n_tags]  (one-hot or probabilities)
    mask : [n, MAXLEN]          (True for real tokens, False for PAD)
    """
    out = []
    for seq, m in zip(pred, mask):
        tags = [idx2tag[np.argmax(tok)] for tok, keep in zip(seq, m) if keep]
        out.append(tags)
    return out

# ----------------------------------------------------------
# 6. Evaluation
# ----------------------------------------------------------
y_pred_ner, y_pred_srl = model.predict(X_val, verbose=0)

mask_val = (X_val != vocab["PAD"])  # True for real tokens

true_ner = decode(y_val_ner, idx2ner, mask_val)
pred_ner = decode(y_pred_ner, idx2ner, mask_val)
true_srl = decode(y_val_srl, idx2srl, mask_val)
pred_srl = decode(y_pred_srl, idx2srl, mask_val)

print("\n📊 NER report")
print(classification_report(true_ner, pred_ner))

print("\n📊 SRL report")
print(classification_report(true_srl, pred_srl))

# # ----------------------------------------------------------
# # 7. Quick inference function
# # ----------------------------------------------------------
# def predict_sentence(sentence: str):
#     tokens = sentence.strip().split()
#     ids = [vocab.get(w.lower(), vocab["UNK"]) for w in tokens]
#     ids = pad_sequences([ids], maxlen=MAXLEN, padding="post",
#                         value=vocab["PAD"])
#     mask = (ids != vocab["PAD"])
#     p_ner, p_srl = model.predict(ids, verbose=0)
#     ner_tags = decode(p_ner, idx2ner, mask)[0]
#     srl_tags = decode(p_srl, idx2srl, mask)[0]
#     return list(zip(tokens, ner_tags, srl_tags))

# # ---- demo ------------------------------------------------
# if __name__ == "__main__":
#     print("\n🔍 Demo:")
#     for tok, ner, srl in predict_sentence(
#             "Keanekaragaman hayati Indonesia sangat dipengaruhi faktor iklim."):
#         print(f"{tok:15} {ner:10} {srl}")
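Note that lses.py builds vocab, ner2idx, and srl2idx in memory but never writes them to disk, while the inference script below loads word2idx.pkl, tag2idx_ner.pkl, and tag2idx_srl.pkl. A minimal sketch of the missing persistence step, assuming lses.py's tables are meant to back those pickle names (that mapping is an assumption, not part of this commit):

import pickle

# Sketch only: pair lses.py's lookup tables with the pickle file names
# that the inference script loads. Not part of this commit.
for path, table in [("word2idx.pkl", vocab),
                    ("tag2idx_ner.pkl", ner2idx),
                    ("tag2idx_srl.pkl", srl2idx)]:
    with open(path, "wb") as f:
        pickle.dump(table, f)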
[binary files changed (not shown); two file diffs suppressed because their lines are too long]
@@ -1,57 +1,52 @@
 import json
 import numpy as np
 import pickle
-from keras.models import load_model
-from keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.models import load_model  # type: ignore
+from tensorflow.keras.preprocessing.sequence import pad_sequences  # type: ignore

-model = load_model("multi_task_lstm_ner_srl_model_tf.keras")
+# -----------------------------
+# 1. Load artifacts
+# -----------------------------
+MODEL_PATH = "lstm_ner_srl_model.keras"  # ← new file name
+model = load_model(MODEL_PATH)

 with open("word2idx.pkl", "rb") as f:
     word2idx = pickle.load(f)

 with open("tag2idx_ner.pkl", "rb") as f:
     tag2idx_ner = pickle.load(f)

 with open("tag2idx_srl.pkl", "rb") as f:
     tag2idx_srl = pickle.load(f)

 idx2tag_ner = {i: t for t, i in tag2idx_ner.items()}
 idx2tag_srl = {i: t for t, i in tag2idx_srl.items()}

-max = 50
+PAD_WORD_ID = word2idx["PAD"]  # 0
+MAXLEN = model.input_shape[1]  # taken straight from the model

-def predict_sentence(sentence):
+# -----------------------------
+# 2. Prediction function
+# -----------------------------
+def predict_sentence(sentence: str) -> dict:
     tokens = sentence.strip().lower().split()
     print(tokens)
-    x = [word2idx.get(w.lower(), word2idx["UNK"]) for w in tokens]
-    x = pad_sequences([x], maxlen=50, padding="post", value=word2idx["PAD"])
-    preds = model.predict(x)
-    pred_labels_ner = np.argmax(preds[0], axis=-1)[0]
-    pred_labels_srl = np.argmax(preds[1], axis=-1)[0]
-    result = {
+    seq = [word2idx.get(tok, word2idx["UNK"]) for tok in tokens]
+    seq = pad_sequences([seq], maxlen=MAXLEN, padding="post", value=PAD_WORD_ID)
+
+    pred_ner_prob, pred_srl_prob = model.predict(seq, verbose=0)
+    pred_ner = pred_ner_prob.argmax(-1)[0][: len(tokens)]
+    pred_srl = pred_srl_prob.argmax(-1)[0][: len(tokens)]
+
+    return {
         "tokens": tokens,
-        "labels_ner": [
-            idx2tag_ner[int(label)] for label in pred_labels_ner[: len(tokens)]
-        ],
-        "labels_srl": [
-            idx2tag_srl[int(label)] for label in pred_labels_srl[: len(tokens)]
-        ],
+        "labels_ner": [idx2tag_ner[int(i)] for i in pred_ner],
+        "labels_srl": [idx2tag_srl[int(i)] for i in pred_srl],
     }
-
-    return result

+# -----------------------------
+# 3. Demo
+# -----------------------------
 if __name__ == "__main__":
-    try:
-        sentence = "sore ini aku pergi ke indonesia"
-        print(predict_sentence(sentence))
-    except KeyboardInterrupt:
-        print("\n\nDone.")
+    sample = "Suku Karo merayakan upacara pada juni"
+    result = predict_sentence(sample)
+    print(json.dumps(result, ensure_ascii=False, indent=2))
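One caveat on the new predict_sentence: pad_sequences defaults to truncating="pre", so a sentence longer than MAXLEN loses its leading tokens and the [: len(tokens)] slice no longer lines up with tokens. A hedged fix, reusing the variable names from the new version above:

# Sketch, not in the commit: truncate explicitly so tokens and
# predictions stay aligned for inputs longer than MAXLEN.
tokens = tokens[:MAXLEN]
seq = [word2idx.get(tok, word2idx["UNK"]) for tok in tokens]
seq = pad_sequences([seq], maxlen=MAXLEN, padding="post",
                    truncating="post", value=PAD_WORD_ID)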
107  NER_SRL/train.py (deleted)
@@ -1,107 +0,0 @@
import json, pickle
import numpy as np
from keras.models import Model
from keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from seqeval.metrics import classification_report

# ---------- 1. Load data ----------
with open("dataset/dataset_ner_srl.json", encoding="utf-8") as f:
    data = json.load(f)

sentences = [[tok.lower() for tok in item["tokens"]] for item in data]
labels_ner = [item["labels_ner"] for item in data]
labels_srl = [item["labels_srl"] for item in data]

for i, label_seq in enumerate(labels_ner):
    if "V" in label_seq:
        print(f"Label 'V' found at index {i}: {label_seq}")

# ---------- 2. Build vocab & label maps ----------
words = sorted({w for s in sentences for w in s})
ner_tags = sorted({t for seq in labels_ner for t in seq})
srl_tags = sorted({t for seq in labels_srl for t in seq})

word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["PAD"], word2idx["UNK"] = 0, 1

tag2idx_ner = {t: i for i, t in enumerate(ner_tags)}
tag2idx_srl = {t: i for i, t in enumerate(srl_tags)}
idx2tag_ner = {i: t for t, i in tag2idx_ner.items()}
idx2tag_srl = {i: t for t, i in tag2idx_srl.items()}

# ---------- 3. Encode tokens & labels ----------
X = [[word2idx.get(w, word2idx["UNK"]) for w in s] for s in sentences]
y_ner = [[tag2idx_ner[t] for t in seq] for seq in labels_ner]
y_srl = [[tag2idx_srl[t] for t in seq] for seq in labels_srl]

maxlen = max(len(seq) for seq in X)

X = pad_sequences(X, maxlen=maxlen, padding="post", value=word2idx["PAD"])
y_ner = pad_sequences(y_ner, maxlen=maxlen, padding="post", value=tag2idx_ner["O"])
y_srl = pad_sequences(y_srl, maxlen=maxlen, padding="post", value=tag2idx_srl["O"])

y_ner = [to_categorical(seq, num_classes=len(tag2idx_ner)) for seq in y_ner]
y_srl = [to_categorical(seq, num_classes=len(tag2idx_srl)) for seq in y_srl]

# cast to np.array to keep Keras happy
X = np.array(X)
y_ner = np.array(y_ner)
y_srl = np.array(y_srl)

# ---------- 4. Multi-task BiLSTM architecture ----------
input_layer = Input(shape=(maxlen,))
embed = Embedding(len(word2idx), 64)(input_layer)
bilstm = Bidirectional(LSTM(64, return_sequences=True))(embed)

ner_output = TimeDistributed(
    Dense(len(tag2idx_ner), activation="softmax"), name="ner_output"
)(bilstm)
srl_output = TimeDistributed(
    Dense(len(tag2idx_srl), activation="softmax"), name="srl_output"
)(bilstm)

model = Model(inputs=input_layer, outputs=[ner_output, srl_output])
model.compile(
    optimizer="adam",
    loss={
        "ner_output": "categorical_crossentropy",
        "srl_output": "categorical_crossentropy",
    },
    metrics={"ner_output": "accuracy", "srl_output": "accuracy"},
)
model.summary()

# ---------- 5. Training ----------
model.fit(
    X, {"ner_output": y_ner, "srl_output": y_srl}, batch_size=2, epochs=10, verbose=1
)

# ---------- 6. Save artifacts ----------
model.save("NER_SRL/multi_task_bilstm_model.keras")
with open("NER_SRL/word2idx.pkl", "wb") as f:
    pickle.dump(word2idx, f)
with open("NER_SRL/tag2idx_ner.pkl", "wb") as f:
    pickle.dump(tag2idx_ner, f)
with open("NER_SRL/tag2idx_srl.pkl", "wb") as f:
    pickle.dump(tag2idx_srl, f)

# ---------- 7. Evaluation ----------
y_pred_ner, y_pred_srl = model.predict(X, verbose=0)

def decode(pred, true, idx2tag):
    true_tags = [[idx2tag[np.argmax(tok)] for tok in seq] for seq in true]
    pred_tags = [[idx2tag[np.argmax(tok)] for tok in seq] for seq in pred]
    return true_tags, pred_tags

true_ner, pred_ner = decode(y_pred_ner, y_ner, idx2tag_ner)
true_srl, pred_srl = decode(y_pred_srl, y_srl, idx2tag_srl)

print("\n📊 [NER] Classification Report:")
print(classification_report(true_ner, pred_ner))

print("\n📊 [SRL] Classification Report:")
print(classification_report(true_srl, pred_srl))
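Unlike lses.py, train.py pads label sequences with the real "O" tag and its decode walks every padded position, so the seqeval reports above also score the model's predictions on padding. A sketch of the mask-based evaluation from lses.py applied here, assuming train.py's variable names:

# Sketch, not in the commit: score only real tokens by masking out
# padding positions, mirroring decode() in lses.py.
mask = (X != word2idx["PAD"])  # True where a real token sits

def decode_masked(seqs, idx2tag, mask):
    return [[idx2tag[int(np.argmax(tok))] for tok, keep in zip(seq, m) if keep]
            for seq, m in zip(seqs, mask)]

print(classification_report(decode_masked(y_ner, idx2tag_ner, mask),
                            decode_masked(y_pred_ner, idx2tag_ner, mask)))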