feat: add NER/SRL dataset and fix the LSTM NER/SRL code
parent 42816580aa
commit 1743cc1e18
@@ -16,7 +16,7 @@ from seqeval.metrics import classification_report
 import pickle


-with open("dataset/lstm_ner_dataset.json", "r", encoding="utf-8") as f:
+with open("dataset/dataset_ner_srl.json", "r", encoding="utf-8") as f:
     data = json.load(f)

@@ -26,7 +26,7 @@ total_b_per = 0
 total_i_per = 0

 for idx, block in enumerate(data, start=1):
-    for token in block["labels"]:
+    for token in block["labels_ner"]:
         if token == "B-LOC":
             total_bLoc += 1
         elif token == "O":
@@ -43,7 +43,7 @@ print("Total I-PER:", total_i_per)
 print("Total B-PER + I-PER:", total_b_per + total_i_per)

 sentences = [[token.lower() for token in item["tokens"]] for item in data]
-labels = [item["labels"] for item in data]
+labels = [item["labels_ner"] for item in data]


 words = list(set(word for sentence in sentences for word in sentence))
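For reference, each record in dataset/dataset_ner_srl.json presumably carries the three keys these scripts read: tokens, labels_ner and labels_srl. A sketch of the shape, with made-up values (the SRL tag names in particular are assumptions):

# Illustrative record shape; keys match what the code reads, values are invented.
example_record = {
    "tokens":     ["budi", "lahir", "di", "indonesia"],
    "labels_ner": ["B-PER", "O",    "O",  "B-LOC"],
    "labels_srl": ["ARG0",  "V",    "O",  "ARGM-LOC"],
}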
NER/tag2idx.pkl (binary file not shown)
@@ -35,7 +35,7 @@ def predict_sentence(sentence):

 if __name__ == "__main__":
     try:
-        sentence = "korea adalah tempat lahir jun"
+        sentence = "saat ini indonesia adalah negara yang sangat indah"
         predict_sentence(sentence)
     except KeyboardInterrupt:
         print("\n\nSelesai.")
NER/word2idx.pkl (binary file not shown)
@@ -0,0 +1,152 @@ (new file: ner_srl_multitask.py)
# ner_srl_multitask.py
# ----------------------------------------------------------
# Train a multi-task (Bi)LSTM that predicts NER + SRL tags
# ----------------------------------------------------------
import json

import numpy as np
from tensorflow.keras.layers import (Input, Embedding, LSTM, Bidirectional,
                                     TimeDistributed, Dense)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from seqeval.metrics import classification_report

# ----------------------------------------------------------
# 1. Load and prepare data
# ----------------------------------------------------------
with open("../dataset/dataset_ner_srl.json", "r", encoding="utf8") as f:
    DATA = json.load(f)

# --- token vocabulary -------------------------------------------------
vocab = {"PAD": 0, "UNK": 1}
for sample in DATA:
    for tok in sample["tokens"]:
        vocab.setdefault(tok.lower(), len(vocab))

# --- label maps -------------------------------------------------------
def build_label_map(key):
    tags = {"PAD": 0}  # keep 0 for padding
    for s in DATA:
        for t in s[key]:
            tags.setdefault(t, len(tags))
    return tags

ner2idx = build_label_map("labels_ner")
srl2idx = build_label_map("labels_srl")
idx2ner = {i: t for t, i in ner2idx.items()}
idx2srl = {i: t for t, i in srl2idx.items()}

# --- sequences --------------------------------------------------------
MAXLEN = max(len(x["tokens"]) for x in DATA)

X = [[vocab.get(tok.lower(), vocab["UNK"]) for tok in s["tokens"]]
     for s in DATA]
y_ner = [[ner2idx[t] for t in s["labels_ner"]] for s in DATA]
y_srl = [[srl2idx[t] for t in s["labels_srl"]] for s in DATA]

X = pad_sequences(X, maxlen=MAXLEN, padding="post", value=vocab["PAD"])
y_ner = pad_sequences(y_ner, maxlen=MAXLEN, padding="post", value=ner2idx["PAD"])
y_srl = pad_sequences(y_srl, maxlen=MAXLEN, padding="post", value=srl2idx["PAD"])

# --- one-hot for softmax ----------------------------------------------
y_ner = to_categorical(y_ner, num_classes=len(ner2idx))
y_srl = to_categorical(y_srl, num_classes=len(srl2idx))

# ----------------------------------------------------------
# 2. Train / validation split
# ----------------------------------------------------------
# *All* arrays must be passed to train_test_split in one call so they
# stay aligned. Order of return = train, test for each array.
X_tr, X_val, y_tr_ner, y_val_ner, y_tr_srl, y_val_srl = train_test_split(
    X, y_ner, y_srl, test_size=0.15, random_state=42
)
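# A sketch of an equivalent, alignment-safe alternative (not part of the
# original script): split index positions once, then gather each array, so
# X, y_ner and y_srl stay aligned by construction.
#   idx_tr, idx_val = train_test_split(
#       np.arange(len(X)), test_size=0.15, random_state=42)
#   X_tr, X_val = X[idx_tr], X[idx_val]
#   y_tr_ner, y_val_ner = y_ner[idx_tr], y_ner[idx_val]
#   y_tr_srl, y_val_srl = y_srl[idx_tr], y_srl[idx_val]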

# ----------------------------------------------------------
# 3. Model definition
# ----------------------------------------------------------
EMB_DIM = 128
LSTM_UNITS = 128

inp = Input(shape=(MAXLEN,))
emb = Embedding(len(vocab), EMB_DIM, mask_zero=True)(inp)
bilstm = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(emb)

ner_out = TimeDistributed(
    Dense(len(ner2idx), activation="softmax"), name="ner")(bilstm)
srl_out = TimeDistributed(
    Dense(len(srl2idx), activation="softmax"), name="srl")(bilstm)

model = Model(inp, [ner_out, srl_out])
model.compile(
    optimizer="adam",
    loss={"ner": "categorical_crossentropy",
          "srl": "categorical_crossentropy"},
    metrics={"ner": "accuracy",
             "srl": "accuracy"},
)
model.summary()

# ----------------------------------------------------------
# 4. Train
# ----------------------------------------------------------
history = model.fit(
    X_tr,
    {"ner": y_tr_ner, "srl": y_tr_srl},
    validation_data=(X_val, {"ner": y_val_ner, "srl": y_val_srl}),
    epochs=15,
    batch_size=32,
    verbose=2,
)

# ----------------------------------------------------------
# 5. Helper: decode with a mask (so lengths always match)
# ----------------------------------------------------------
def decode(pred, idx2tag, mask):
    """
    pred : [n, MAXLEN, n_tags] (one-hot or probabilities)
    mask : [n, MAXLEN]         (True for real tokens, False for PAD)
    """
    out = []
    for seq, m in zip(pred, mask):
        tags = [idx2tag[np.argmax(tok)] for tok, keep in zip(seq, m) if keep]
        out.append(tags)
    return out

# ----------------------------------------------------------
# 6. Evaluation
# ----------------------------------------------------------
y_pred_ner, y_pred_srl = model.predict(X_val, verbose=0)

mask_val = (X_val != vocab["PAD"])  # True for real tokens

true_ner = decode(y_val_ner, idx2ner, mask_val)
pred_ner = decode(y_pred_ner, idx2ner, mask_val)
true_srl = decode(y_val_srl, idx2srl, mask_val)
pred_srl = decode(y_pred_srl, idx2srl, mask_val)

print("\n📊 NER report")
print(classification_report(true_ner, pred_ner))

print("\n📊 SRL report")
print(classification_report(true_srl, pred_srl))

# # ----------------------------------------------------------
# # 7. Quick inference function
# # ----------------------------------------------------------
# def predict_sentence(sentence: str):
#     tokens = sentence.strip().split()
#     ids = [vocab.get(w.lower(), vocab["UNK"]) for w in tokens]
#     ids = pad_sequences([ids], maxlen=MAXLEN, padding="post",
#                         value=vocab["PAD"])
#     mask = (ids != vocab["PAD"])
#     p_ner, p_srl = model.predict(ids, verbose=0)
#     ner_tags = decode(p_ner, idx2ner, mask)[0]
#     srl_tags = decode(p_srl, idx2srl, mask)[0]
#     return list(zip(tokens, ner_tags, srl_tags))

# # ---- demo ------------------------------------------------
# if __name__ == "__main__":
#     print("\n🔍 Demo:")
#     for tok, ner, srl in predict_sentence(
#             "Keanekaragaman hayati Indonesia sangat dipengaruhi faktor iklim."):
#         print(f"{tok:15} {ner:10} {srl}")

(One file diff suppressed because one or more lines are too long; three binary files not shown.)

@@ -0,0 +1,52 @@ (new file)
import numpy as np
import pickle

from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

model = load_model("multi_task_bilstm_model.keras")

with open("word2idx.pkl", "rb") as f:
    word2idx = pickle.load(f)

with open("tag2idx_ner.pkl", "rb") as f:
    tag2idx_ner = pickle.load(f)

with open("tag2idx_srl.pkl", "rb") as f:
    tag2idx_srl = pickle.load(f)

idx2tag_ner = {i: t for t, i in tag2idx_ner.items()}
idx2tag_srl = {i: t for t, i in tag2idx_srl.items()}

MAXLEN = 50  # sequence length the model was trained with

def predict_sentence(sentence):
    tokens = sentence.strip().lower().split()
    print(tokens)
    x = [word2idx.get(w, word2idx["UNK"]) for w in tokens]
    x = pad_sequences([x], maxlen=MAXLEN, padding="post", value=word2idx["PAD"])

    preds = model.predict(x)
    pred_labels_ner = np.argmax(preds[0], axis=-1)[0]
    pred_labels_srl = np.argmax(preds[1], axis=-1)[0]

    print("Hasil prediksi NER:")  # "NER prediction results:"
    for token, label_idx in zip(tokens, pred_labels_ner[: len(tokens)]):
        print(f"{token}\t{idx2tag_ner[int(label_idx)]}")

    print("\nHasil prediksi SRL:")  # "SRL prediction results:"
    for token, label_idx in zip(tokens, pred_labels_srl[: len(tokens)]):
        print(f"{token}\t{idx2tag_srl[int(label_idx)]}")

if __name__ == "__main__":
    try:
        sentence = "aku lahir di indonesia"  # "I was born in indonesia"
        predict_sentence(sentence)
    except KeyboardInterrupt:
        print("\n\nSelesai.")  # "Done."
@@ -0,0 +1,107 @@ (new file)
import json, os, pickle
import numpy as np
from keras.models import Model
from keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from seqeval.metrics import classification_report

# ---------- 1. Load the data ----------
with open("dataset/dataset_ner_srl.json", encoding="utf-8") as f:
    data = json.load(f)

sentences = [[tok.lower() for tok in item["tokens"]] for item in data]
labels_ner = [item["labels_ner"] for item in data]
labels_srl = [item["labels_srl"] for item in data]

# Sanity check: "V" belongs to the SRL tag set, so finding it in an NER label
# sequence points at annotation leakage in the dataset.
for i, label_seq in enumerate(labels_ner):
    if "V" in label_seq:
        print(f"Label 'V' ditemukan di index {i}: {label_seq}")  # "found at index"

# ---------- 2. Build vocab & label maps ----------
words = sorted({w for s in sentences for w in s})
ner_tags = sorted({t for seq in labels_ner for t in seq})
srl_tags = sorted({t for seq in labels_srl for t in seq})

word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["PAD"], word2idx["UNK"] = 0, 1

tag2idx_ner = {t: i for i, t in enumerate(ner_tags)}
tag2idx_srl = {t: i for i, t in enumerate(srl_tags)}
idx2tag_ner = {i: t for t, i in tag2idx_ner.items()}
idx2tag_srl = {i: t for t, i in tag2idx_srl.items()}

# ---------- 3. Encode tokens & labels ----------
X = [[word2idx.get(w, word2idx["UNK"]) for w in s] for s in sentences]
y_ner = [[tag2idx_ner[t] for t in seq] for seq in labels_ner]
y_srl = [[tag2idx_srl[t] for t in seq] for seq in labels_srl]

maxlen = max(len(seq) for seq in X)

X = pad_sequences(X, maxlen=maxlen, padding="post", value=word2idx["PAD"])
# Padded label positions get the "O" tag, so they are also scored during evaluation.
y_ner = pad_sequences(y_ner, maxlen=maxlen, padding="post", value=tag2idx_ner["O"])
y_srl = pad_sequences(y_srl, maxlen=maxlen, padding="post", value=tag2idx_srl["O"])

y_ner = [to_categorical(seq, num_classes=len(tag2idx_ner)) for seq in y_ner]
y_srl = [to_categorical(seq, num_classes=len(tag2idx_srl)) for seq in y_srl]

# cast to np.array so Keras is happy
X = np.array(X)
y_ner = np.array(y_ner)
y_srl = np.array(y_srl)

# ---------- 4. Multi-task BiLSTM architecture ----------
input_layer = Input(shape=(maxlen,))
embed = Embedding(len(word2idx), 64)(input_layer)
bilstm = Bidirectional(LSTM(64, return_sequences=True))(embed)

# Two task heads share the same BiLSTM trunk (hard parameter sharing).
ner_output = TimeDistributed(
    Dense(len(tag2idx_ner), activation="softmax"), name="ner_output"
)(bilstm)
srl_output = TimeDistributed(
    Dense(len(tag2idx_srl), activation="softmax"), name="srl_output"
)(bilstm)

model = Model(inputs=input_layer, outputs=[ner_output, srl_output])
model.compile(
    optimizer="adam",
    loss={
        "ner_output": "categorical_crossentropy",
        "srl_output": "categorical_crossentropy",
    },
    metrics={"ner_output": "accuracy", "srl_output": "accuracy"},
)
model.summary()

# ---------- 5. Training ----------
model.fit(
    X, {"ner_output": y_ner, "srl_output": y_srl}, batch_size=2, epochs=10, verbose=1
)

# ---------- 6. Save artifacts ----------
os.makedirs("NER_SRL", exist_ok=True)  # model.save fails if the folder is missing
model.save("NER_SRL/multi_task_bilstm_model.keras")
with open("NER_SRL/word2idx.pkl", "wb") as f:
    pickle.dump(word2idx, f)
with open("NER_SRL/tag2idx_ner.pkl", "wb") as f:
    pickle.dump(tag2idx_ner, f)
with open("NER_SRL/tag2idx_srl.pkl", "wb") as f:
    pickle.dump(tag2idx_srl, f)

# ---------- 7. Evaluation ----------
# Note: this predicts on the training data itself; there is no held-out split.
y_pred_ner, y_pred_srl = model.predict(X, verbose=0)

def decode(true, pred, idx2tag):
    true_tags = [[idx2tag[np.argmax(tok)] for tok in seq] for seq in true]
    pred_tags = [[idx2tag[np.argmax(tok)] for tok in seq] for seq in pred]
    return true_tags, pred_tags

true_ner, pred_ner = decode(y_ner, y_pred_ner, idx2tag_ner)
true_srl, pred_srl = decode(y_srl, y_pred_srl, idx2tag_srl)

print("\n📊 [NER] Classification Report:")
print(classification_report(true_ner, pred_ner))

print("\n📊 [SRL] Classification Report:")
print(classification_report(true_srl, pred_srl))
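Because the label sequences above were padded with "O", the reports also score the padded tail of every sentence, which tends to inflate the numbers. A sketch of a masked variant in the spirit of the decode helper in ner_srl_multitask.py; decode_masked is illustrative, not part of the commit:

mask = (X != word2idx["PAD"])  # True at real-token positions

def decode_masked(seqs, idx2tag, mask):
    # Keep only positions where the mask is True, dropping PAD slots entirely.
    return [[idx2tag[int(np.argmax(tok))] for tok, keep in zip(seq, m) if keep]
            for seq, m in zip(seqs, mask)]

print(classification_report(decode_masked(y_ner, idx2tag_ner, mask),
                            decode_masked(y_pred_ner, idx2tag_ner, mask)))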

(One binary file not shown; one file diff suppressed because it is too large.)