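"""Train a BiLSTM NER tagger with Keras on a JSON token/label dataset,
save the model and vocabulary mappings, and print a seqeval report."""
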
import json
import os
import pickle
from collections import Counter

import numpy as np
from keras.models import Sequential
from keras.layers import (
    Embedding,
    LSTM,
    Dense,
    TimeDistributed,
    Bidirectional,
    InputLayer,
)
# pad_sequences moved to keras.utils in recent Keras; in older versions it
# lived at keras.preprocessing.sequence.pad_sequences.
from keras.utils import pad_sequences, to_categorical
from seqeval.metrics import classification_report
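
# Dataset: a JSON list of records, each with a "tokens" list and a parallel
# "labels_ner" list of BIO tags (e.g. B-PER, I-PER, B-LOC, O).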
with open("dataset/dataset_ner_srl.json", "r", encoding="utf-8") as f:
    data = json.load(f)
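
# Class-distribution sanity check: count how often each NER tag occurs.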
tag_counts = Counter(tag for block in data for tag in block["labels_ner"])

print("Total B-LOC:", tag_counts["B-LOC"])
print("Total O:", tag_counts["O"])
print("Total B-PER:", tag_counts["B-PER"])
print("Total I-PER:", tag_counts["I-PER"])
print("Total B-PER + I-PER:", tag_counts["B-PER"] + tag_counts["I-PER"])
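
# Lowercase tokens to shrink the vocabulary (at the cost of casing cues) and
# pull out the parallel label sequences.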
sentences = [[token.lower() for token in item["tokens"]] for item in data]
labels = [item["labels_ner"] for item in data]
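
# Vocabulary and tag inventory, sorted so index assignment is deterministic
# across runs (the mappings below are pickled for reuse at inference time).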
words = sorted(set(word for sentence in sentences for word in sentence))
tags = sorted(set(tag for label_seq in labels for tag in label_seq))
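
# Token and tag index mappings; ids 0 and 1 are reserved for padding and
# out-of-vocabulary tokens.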
word2idx = {word: idx + 2 for idx, word in enumerate(words)}
word2idx["PAD"] = 0
word2idx["UNK"] = 1

tag2idx = {tag: idx for idx, tag in enumerate(tags)}
idx2tag = {i: t for t, i in tag2idx.items()}
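
# Encode tokens/tags as integer ids and pad to the longest sentence. Padding
# labels with "O" keeps shapes consistent, but padded positions then look
# like real "O" tokens to the loss and the evaluation below.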
X = [[word2idx.get(w, word2idx["UNK"]) for w in s] for s in sentences]
y = [[tag2idx[t] for t in ts] for ts in labels]

maxlen = max(len(x) for x in X)
X = pad_sequences(X, maxlen=maxlen, padding="post", value=word2idx["PAD"])
y = pad_sequences(y, maxlen=maxlen, padding="post", value=tag2idx["O"])
y = to_categorical(y, num_classes=len(tag2idx))
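
# BiLSTM tagger: embed word ids, run a bidirectional LSTM over the sequence,
# and predict a softmax tag distribution at every time step.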
model = Sequential()
model.add(InputLayer(input_shape=(maxlen,)))
model.add(Embedding(input_dim=len(word2idx), output_dim=64))
model.add(Bidirectional(LSTM(units=64, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2idx), activation="softmax")))

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()
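
# Train on the full dataset; there is no train/validation split, so the
# report at the end measures fit to the training data, not generalization.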
model.fit(X, y, batch_size=2, epochs=10)

# Make sure the output directory exists before saving the model and mappings.
os.makedirs("NER", exist_ok=True)
model.save("NER/ner_bilstm_model.keras")

with open("NER/word2idx.pkl", "wb") as f:
    pickle.dump(word2idx, f)

with open("NER/tag2idx.pkl", "wb") as f:
    pickle.dump(tag2idx, f)
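
# Entity-level evaluation with seqeval on the (padded) training data; any
# spurious entity predicted inside the padding counts as a false positive.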
y_true = [[idx2tag[np.argmax(token)] for token in seq] for seq in y]
y_pred = model.predict(X)
y_pred = [[idx2tag[np.argmax(token)] for token in seq] for seq in y_pred]

print(classification_report(y_true, y_pred))