# TIF_E41211115_lstm-quiz-gen.../NER/lstm_ner_qc.py

import json
import pickle
import numpy as np
from keras.models import Sequential
from keras.layers import (
    Embedding,
    LSTM,
    Dense,
    TimeDistributed,
    Bidirectional,
    InputLayer,
)
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from seqeval.metrics import classification_report
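
# Load the token/label dataset used to train the NER tagger.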
with open("dataset/lstm_ner_dataset.json", "r", encoding="utf-8") as f:
data = json.load(f)
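
# Count how often each entity tag appears across the dataset.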
total_b_loc = 0
total_o = 0
total_b_per = 0
total_i_per = 0
for idx, block in enumerate(data, start=1):
    for token in block["labels"]:
        if token == "B-LOC":
            total_b_loc += 1
        elif token == "O":
            total_o += 1
        elif token == "B-PER":
            total_b_per += 1
        elif token == "I-PER":
            total_i_per += 1
print("Total B-LOC:", total_b_loc)
print("Total O:", total_o)
print("Total B-PER:", total_b_per)
print("Total I-PER:", total_i_per)
print("Total B-PER + I-PER:", total_b_per + total_i_per)
sentences = [[token.lower() for token in item["tokens"]] for item in data]
labels = [item["labels"] for item in data]
words = list(set(word for sentence in sentences for word in sentence))
tags = list(set(tag for label_seq in labels for tag in label_seq))
word2idx = {word: idx + 2 for idx, word in enumerate(words)}
word2idx["PAD"] = 0
word2idx["UNK"] = 1
tag2idx = {tag: idx for idx, tag in enumerate(tags)}
idx2tag = {i: t for t, i in tag2idx.items()}
X = [[word2idx.get(w, word2idx["UNK"]) for w in s] for s in sentences]
y = [[tag2idx[t] for t in ts] for ts in labels]
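
# Pad every sequence to the longest sentence; padded label positions use the "O" tag,
# and label sequences are one-hot encoded for the softmax output layer.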
maxlen = max(len(x) for x in X)
X = pad_sequences(X, maxlen=maxlen, padding="post", value=word2idx["PAD"])
y = pad_sequences(y, maxlen=maxlen, padding="post", value=tag2idx["O"])
y = [to_categorical(seq, num_classes=len(tag2idx)) for seq in y]
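
# BiLSTM tagger: embedding -> bidirectional LSTM -> per-token softmax over the tag set.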
model = Sequential()
model.add(InputLayer(input_shape=(maxlen,)))
model.add(Embedding(input_dim=len(word2idx), output_dim=64))
model.add(Bidirectional(LSTM(units=64, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2idx), activation="softmax")))
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()
model.fit(X, np.array(y), batch_size=2, epochs=10)
model.save("NER/ner_bilstm_model.keras")
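
# Persist the index mappings so inference code can reuse the exact same vocabulary.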
with open("NER/word2idx.pkl", "wb") as f:
pickle.dump(word2idx, f)
with open("NER/tag2idx.pkl", "wb") as f:
pickle.dump(tag2idx, f)
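
# Entity-level evaluation with seqeval; note that predictions are scored on the
# training data itself, since the script does not hold out a separate test split.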
y_true = [[idx2tag[np.argmax(token)] for token in seq] for seq in y]
y_pred = model.predict(X)
y_pred = [[idx2tag[np.argmax(token)] for token in seq] for seq in y_pred]
print(classification_report(y_true, y_pred))
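
# Minimal inference sketch (not part of the original script; the sample sentence is a
# made-up placeholder): reload the saved model and word index to tag a new sentence.
# from keras.models import load_model
# reloaded = load_model("NER/ner_bilstm_model.keras")
# with open("NER/word2idx.pkl", "rb") as fh:
#     w2i = pickle.load(fh)
# sample = ["tokens", "of", "a", "new", "sentence"]
# x = pad_sequences([[w2i.get(w, w2i["UNK"]) for w in sample]],
#                   maxlen=maxlen, padding="post", value=w2i["PAD"])
# probs = reloaded.predict(x)[0]
# print([idx2tag[np.argmax(tok)] for tok in probs[: len(sample)]])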