import json
import pickle

import numpy as np
from keras.models import Sequential
from keras.layers import (
    Embedding,
    LSTM,
    Dense,
    TimeDistributed,
    Bidirectional,
    InputLayer,
)
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from seqeval.metrics import classification_report

# Load the NER/SRL dataset: a list of blocks, each with "tokens" and "labels_ner".
with open("dataset/dataset_ner_srl.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Count label frequencies to get a feel for class balance.
total_b_loc = 0
total_o = 0
total_b_per = 0
total_i_per = 0
for block in data:
    for label in block["labels_ner"]:
        if label == "B-LOC":
            total_b_loc += 1
        elif label == "O":
            total_o += 1
        elif label == "B-PER":
            total_b_per += 1
        elif label == "I-PER":
            total_i_per += 1

print("Total B-LOC:", total_b_loc)
print("Total O:", total_o)
print("Total B-PER:", total_b_per)
print("Total I-PER:", total_i_per)
print("Total B-PER + I-PER:", total_b_per + total_i_per)

# Lowercase the tokens and collect the parallel label sequences.
sentences = [[token.lower() for token in item["tokens"]] for item in data]
labels = [item["labels_ner"] for item in data]

# Build the vocabulary and tag set.
words = list(set(word for sentence in sentences for word in sentence))
tags = list(set(tag for label_seq in labels for tag in label_seq))

# Reserve index 0 for padding and index 1 for out-of-vocabulary tokens.
word2idx = {word: idx + 2 for idx, word in enumerate(words)}
word2idx["PAD"] = 0
word2idx["UNK"] = 1
tag2idx = {tag: idx for idx, tag in enumerate(tags)}
idx2tag = {i: t for t, i in tag2idx.items()}

# Encode tokens and labels as integer sequences.
X = [[word2idx.get(w, word2idx["UNK"]) for w in s] for s in sentences]
y = [[tag2idx[t] for t in ts] for ts in labels]

# Pad every sequence to the length of the longest sentence. Note that label
# padding uses "O", so padded positions are treated as real "O" tokens during
# training and evaluation.
maxlen = max(len(x) for x in X)
X = pad_sequences(X, maxlen=maxlen, padding="post", value=word2idx["PAD"])
y = pad_sequences(y, maxlen=maxlen, padding="post", value=tag2idx["O"])
y = [to_categorical(seq, num_classes=len(tag2idx)) for seq in y]

# BiLSTM sequence tagger: embedding -> bidirectional LSTM -> per-token softmax.
model = Sequential()
model.add(InputLayer(input_shape=(maxlen,)))
model.add(Embedding(input_dim=len(word2idx), output_dim=64))
model.add(Bidirectional(LSTM(units=64, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2idx), activation="softmax")))

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()
model.fit(X, np.array(y), batch_size=2, epochs=10)

# Persist the trained model and the lookup tables needed at inference time.
model.save("NER/ner_bilstm_model.keras")
with open("NER/word2idx.pkl", "wb") as f:
    pickle.dump(word2idx, f)
with open("NER/tag2idx.pkl", "wb") as f:
    pickle.dump(tag2idx, f)

# Evaluate with seqeval. Note that this report is computed on the training data
# itself and includes the "O"-padded positions.
y_true = [[idx2tag[np.argmax(token)] for token in seq] for seq in y]
y_pred = model.predict(X)
y_pred = [[idx2tag[np.argmax(token)] for token in seq] for seq in y_pred]
print(classification_report(y_true, y_pred))
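
# ---------------------------------------------------------------------------
# Optional inference sketch (an assumption, not part of the original script):
# a minimal example of reloading the saved model and vocabularies to tag a new
# sentence with the same preprocessing (lowercasing, UNK lookup, "post"
# padding). The sample tokens below are hypothetical.
# ---------------------------------------------------------------------------
from keras.models import load_model

loaded_model = load_model("NER/ner_bilstm_model.keras")
with open("NER/word2idx.pkl", "rb") as f:
    loaded_word2idx = pickle.load(f)
with open("NER/tag2idx.pkl", "rb") as f:
    loaded_tag2idx = pickle.load(f)
loaded_idx2tag = {i: t for t, i in loaded_tag2idx.items()}

new_tokens = ["budi", "pergi", "ke", "jakarta"]  # hypothetical input sentence
encoded = [loaded_word2idx.get(t.lower(), loaded_word2idx["UNK"]) for t in new_tokens]
encoded = pad_sequences(
    [encoded], maxlen=maxlen, padding="post", value=loaded_word2idx["PAD"]
)
pred = loaded_model.predict(encoded)[0]
# Print (token, predicted tag) pairs; zip stops at the last real token,
# so the padded positions are skipped.
print([(tok, loaded_idx2tag[int(np.argmax(p))]) for tok, p in zip(new_tokens, pred)])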