fix: adjust file management and the dataset
This commit is contained in:
parent 647505a8e2
commit 20ef6aeaed
.gitignore
@@ -1,2 +1,7 @@
 myenv
 .keras
+# Ignore any directory or file whose name contains 'keras'
+*keras*
+**/*keras*
+# Ignore all files with the .pkl extension
+*.pkl
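As a quick sanity check on the new ignore rules, here is a minimal sketch (not part of the commit) that previews the patterns with Python's fnmatch. Note that fnmatch does not treat '/' specially the way gitignore does, so `git check-ignore -v <path>` remains the authoritative test; the paths below are illustrative.

# Rough preview of which paths the new ignore patterns would catch.
from fnmatch import fnmatch

patterns = ["*keras*", "**/*keras*", "*.pkl"]
paths = ["NER/tag2idx.pkl", ".keras/keras.json", "model.keras", "train.py"]

for p in paths:
    ignored = any(fnmatch(p, pat) for pat in patterns)
    print(f"{p:20} ignored={ignored}")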
BIN NER/tag2idx.pkl (binary file not shown)
BIN NER/word2idx.pkl (binary file not shown)
File diff suppressed because it is too large
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 4,
    "id": "9bf2159a",
    "metadata": {},
    "outputs": [
@@ -10,18 +10,18 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "2025-04-29 19:13:19.968628: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
-     "2025-04-29 19:13:19.975821: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
-     "2025-04-29 19:13:20.061422: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
-     "2025-04-29 19:13:20.109564: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
+     "2025-05-02 15:16:40.916818: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
+     "2025-05-02 15:16:40.923426: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
+     "2025-05-02 15:16:40.983217: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
+     "2025-05-02 15:16:41.024477: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
      "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
-     "E0000 00:00:1745928800.155971 272184 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
-     "E0000 00:00:1745928800.168166 272184 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
-     "W0000 00:00:1745928800.263286 272184 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
-     "W0000 00:00:1745928800.263312 272184 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
-     "W0000 00:00:1745928800.263313 272184 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
-     "W0000 00:00:1745928800.263314 272184 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
-     "2025-04-29 19:13:20.274608: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
+     "E0000 00:00:1746173801.069646 9825 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
+     "E0000 00:00:1746173801.081087 9825 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
+     "W0000 00:00:1746173801.169376 9825 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
+     "W0000 00:00:1746173801.169393 9825 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
+     "W0000 00:00:1746173801.169395 9825 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
+     "W0000 00:00:1746173801.169396 9825 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
+     "2025-05-02 15:16:41.179508: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
     ]
    }
@@ -51,7 +51,7 @@
    },
    {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 5,
    "id": "50118278",
    "metadata": {},
    "outputs": [
@@ -60,9 +60,9 @@
     "output_type": "stream",
     "text": [
      "\n",
-     " Jumlah data valid: 287 / 287\n",
+     " Jumlah data valid: 321 / 321\n",
      " Jumlah data tidak valid: 0\n",
-     "Counter({'ftb': 202, 'tof': 45, 'none': 40})\n"
+     "Counter({'ftb': 235, 'tof': 45, 'none': 41})\n"
     ]
    }
   ],
@@ -450,7 +450,7 @@
    },
    {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "id": "06fd86c7",
    "metadata": {},
    "outputs": [
@@ -501,9 +501,7 @@
    "print(\"\\n=== Akurasi Detail ===\")\n",
    "print(f\"Question Accuracy (Token-level): {acc_q:.4f}\")\n",
    "print(f\"Answer Accuracy (Token-level) : {acc_a:.4f}\")\n",
-   "print(f\"Type Accuracy (Class-level) : {np.mean(y_true_type == y_pred_type):.2f}\")\n",
-   "# print(\"\\n=== Classification Report (TYPE) ===\")\n",
-   "# print(report_type)"
+   "print(f\"Type Accuracy (Class-level) : {np.mean(y_true_type == y_pred_type):.2f}\")"
    ]
   },
   {
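The surviving print line relies on a standard NumPy idiom: comparing two label arrays elementwise yields a boolean array whose mean is the class-level accuracy. A self-contained illustration with made-up values:

import numpy as np

# (y_true == y_pred) is a boolean array; its mean is the fraction of
# matching entries, i.e. class-level accuracy. Values here are invented.
y_true_type = np.array(["ftb", "tof", "none", "ftb"])
y_pred_type = np.array(["ftb", "tof", "ftb", "ftb"])
print(f"Type Accuracy (Class-level) : {np.mean(y_true_type == y_pred_type):.2f}")  # 0.75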
File diff suppressed because it is too large (3 files)
Binary files not shown (3 files)
New image added: 36 KiB
New image added: 61 KiB
@@ -0,0 +1,152 @@
# ner_srl_multitask.py
# ----------------------------------------------------------
# Train a multi-task (Bi)LSTM that predicts NER + SRL tags
# ----------------------------------------------------------
import json, numpy as np, tensorflow as tf
from tensorflow.keras.layers import (Input, Embedding, LSTM, Bidirectional,
                                     TimeDistributed, Dense)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from seqeval.metrics import classification_report

# ----------------------------------------------------------
# 1. Load and prepare data
# ----------------------------------------------------------
DATA = json.load(open("../dataset/dataset_ner_srl.json", "r", encoding="utf8"))

# --- token vocabulary -------------------------------------------------
vocab = {"PAD": 0, "UNK": 1}
for sample in DATA:
    for tok in sample["tokens"]:
        vocab.setdefault(tok.lower(), len(vocab))

# --- label maps -------------------------------------------------------
def build_label_map(key):
    tags = {"PAD": 0}  # keep 0 for padding
    for s in DATA:
        for t in s[key]:
            tags.setdefault(t, len(tags))
    return tags

ner2idx = build_label_map("labels_ner")
srl2idx = build_label_map("labels_srl")
idx2ner = {i: t for t, i in ner2idx.items()}
idx2srl = {i: t for t, i in srl2idx.items()}

# --- sequences --------------------------------------------------------
MAXLEN = max(len(x["tokens"]) for x in DATA)

X = [[vocab.get(tok.lower(), vocab["UNK"]) for tok in s["tokens"]]
     for s in DATA]
y_ner = [[ner2idx[t] for t in s["labels_ner"]]
         for s in DATA]
y_srl = [[srl2idx[t] for t in s["labels_srl"]]
         for s in DATA]

X = pad_sequences(X, maxlen=MAXLEN, padding="post", value=vocab["PAD"])
y_ner = pad_sequences(y_ner, maxlen=MAXLEN, padding="post", value=ner2idx["PAD"])
y_srl = pad_sequences(y_srl, maxlen=MAXLEN, padding="post", value=srl2idx["PAD"])

# --- one-hot for softmax ---------------------------------------------
y_ner = to_categorical(y_ner, num_classes=len(ner2idx))
y_srl = to_categorical(y_srl, num_classes=len(srl2idx))

# ----------------------------------------------------------
# 2. Train / validation split
# ----------------------------------------------------------
# *All* arrays must be passed to train_test_split in one call so they
# stay aligned. The return order is train, test for each array.
X_tr, X_val, y_tr_ner, y_val_ner, y_tr_srl, y_val_srl = train_test_split(
    X, y_ner, y_srl, test_size=0.15, random_state=42
)

# ----------------------------------------------------------
# 3. Model definition
# ----------------------------------------------------------
EMB_DIM = 128
LSTM_UNITS = 128

inp = Input(shape=(MAXLEN,))
emb = Embedding(len(vocab), EMB_DIM, mask_zero=True)(inp)
bilstm = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(emb)

ner_out = TimeDistributed(
    Dense(len(ner2idx), activation="softmax"), name="ner")(bilstm)
srl_out = TimeDistributed(
    Dense(len(srl2idx), activation="softmax"), name="srl")(bilstm)

model = Model(inp, [ner_out, srl_out])
model.compile(
    optimizer="adam",
    loss={"ner": "categorical_crossentropy",
          "srl": "categorical_crossentropy"},
    metrics={"ner": "accuracy",
             "srl": "accuracy"}
)
model.summary()

# ----------------------------------------------------------
# 4. Train
# ----------------------------------------------------------
history = model.fit(
    X_tr,
    {"ner": y_tr_ner, "srl": y_tr_srl},
    validation_data=(X_val, {"ner": y_val_ner, "srl": y_val_srl}),
    epochs=15,
    batch_size=32,
    verbose=2,
)

# ----------------------------------------------------------
# 5. Helper: decode with a mask (so lengths always match)
# ----------------------------------------------------------
def decode(pred, idx2tag, mask):
    """
    pred : [n, MAXLEN, n_tags] (one-hot or probabilities)
    mask : [n, MAXLEN]         (True for real tokens, False for PAD)
    """
    out = []
    for seq, m in zip(pred, mask):
        tags = [idx2tag[np.argmax(tok)] for tok, keep in zip(seq, m) if keep]
        out.append(tags)
    return out

# ----------------------------------------------------------
# 6. Evaluation
# ----------------------------------------------------------
y_pred_ner, y_pred_srl = model.predict(X_val, verbose=0)

mask_val = (X_val != vocab["PAD"])  # True for real tokens

true_ner = decode(y_val_ner, idx2ner, mask_val)
pred_ner = decode(y_pred_ner, idx2ner, mask_val)
true_srl = decode(y_val_srl, idx2srl, mask_val)
pred_srl = decode(y_pred_srl, idx2srl, mask_val)

print("\n📊 NER report")
print(classification_report(true_ner, pred_ner))

print("\n📊 SRL report")
print(classification_report(true_srl, pred_srl))

# # ----------------------------------------------------------
# # 7. Quick inference function
# # ----------------------------------------------------------
# def predict_sentence(sentence: str):
#     tokens = sentence.strip().split()
#     ids = [vocab.get(w.lower(), vocab["UNK"]) for w in tokens]
#     ids = pad_sequences([ids], maxlen=MAXLEN, padding="post",
#                         value=vocab["PAD"])
#     mask = (ids != vocab["PAD"])
#     p_ner, p_srl = model.predict(ids, verbose=0)
#     ner_tags = decode(p_ner, idx2ner, mask)[0]
#     srl_tags = decode(p_srl, idx2srl, mask)[0]
#     return list(zip(tokens, ner_tags, srl_tags))

# # ---- demo ------------------------------------------------
# if __name__ == "__main__":
#     print("\n🔍 Demo:")
#     for tok, ner, srl in predict_sentence(
#             "Keanekaragaman hayati Indonesia sangat dipengaruhi faktor iklim."):
#         print(f"{tok:15} {ner:10} {srl}")
File diff suppressed because one or more lines are too long (2 files)
@@ -0,0 +1,107 @@
import json, pickle
import numpy as np
from keras.models import Model
from keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from seqeval.metrics import classification_report

# ---------- 1. Load the data ----------
with open("dataset/dataset_ner_srl.json", encoding="utf-8") as f:
    data = json.load(f)

sentences = [[tok.lower() for tok in item["tokens"]] for item in data]
labels_ner = [item["labels_ner"] for item in data]
labels_srl = [item["labels_srl"] for item in data]

for i, label_seq in enumerate(labels_ner):
    if "V" in label_seq:
        print(f"Label 'V' found at index {i}: {label_seq}")

# ---------- 2. Build the vocab & label maps ----------
words = sorted({w for s in sentences for w in s})
ner_tags = sorted({t for seq in labels_ner for t in seq})
srl_tags = sorted({t for seq in labels_srl for t in seq})

word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["PAD"], word2idx["UNK"] = 0, 1

tag2idx_ner = {t: i for i, t in enumerate(ner_tags)}
tag2idx_srl = {t: i for i, t in enumerate(srl_tags)}
idx2tag_ner = {i: t for t, i in tag2idx_ner.items()}
idx2tag_srl = {i: t for t, i in tag2idx_srl.items()}

# ---------- 3. Encode tokens & labels ----------
X = [[word2idx.get(w, word2idx["UNK"]) for w in s] for s in sentences]
y_ner = [[tag2idx_ner[t] for t in seq] for seq in labels_ner]
y_srl = [[tag2idx_srl[t] for t in seq] for seq in labels_srl]

maxlen = max(len(seq) for seq in X)

X = pad_sequences(X, maxlen=maxlen, padding="post", value=word2idx["PAD"])
y_ner = pad_sequences(y_ner, maxlen=maxlen, padding="post", value=tag2idx_ner["O"])
y_srl = pad_sequences(y_srl, maxlen=maxlen, padding="post", value=tag2idx_srl["O"])

y_ner = [to_categorical(seq, num_classes=len(tag2idx_ner)) for seq in y_ner]
y_srl = [to_categorical(seq, num_classes=len(tag2idx_srl)) for seq in y_srl]

# cast to np.array to keep Keras happy
X = np.array(X)
y_ner = np.array(y_ner)
y_srl = np.array(y_srl)

# ---------- 4. Multi-task BiLSTM architecture ----------
input_layer = Input(shape=(maxlen,))
embed = Embedding(len(word2idx), 64)(input_layer)
bilstm = Bidirectional(LSTM(64, return_sequences=True))(embed)

ner_output = TimeDistributed(
    Dense(len(tag2idx_ner), activation="softmax"), name="ner_output"
)(bilstm)
srl_output = TimeDistributed(
    Dense(len(tag2idx_srl), activation="softmax"), name="srl_output"
)(bilstm)

model = Model(inputs=input_layer, outputs=[ner_output, srl_output])
model.compile(
    optimizer="adam",
    loss={
        "ner_output": "categorical_crossentropy",
        "srl_output": "categorical_crossentropy",
    },
    metrics={"ner_output": "accuracy", "srl_output": "accuracy"},
)
model.summary()

# ---------- 5. Training ----------
model.fit(
    X, {"ner_output": y_ner, "srl_output": y_srl}, batch_size=2, epochs=10, verbose=1
)

# ---------- 6. Save the artifacts ----------
model.save("NER_SRL/multi_task_bilstm_model.keras")
with open("NER_SRL/word2idx.pkl", "wb") as f:
    pickle.dump(word2idx, f)
with open("NER_SRL/tag2idx_ner.pkl", "wb") as f:
    pickle.dump(tag2idx_ner, f)
with open("NER_SRL/tag2idx_srl.pkl", "wb") as f:
    pickle.dump(tag2idx_srl, f)

# ---------- 7. Evaluation ----------
y_pred_ner, y_pred_srl = model.predict(X, verbose=0)


def decode(pred, true, idx2tag):
    true_tags = [[idx2tag[np.argmax(tok)] for tok in seq] for seq in true]
    pred_tags = [[idx2tag[np.argmax(tok)] for tok in seq] for seq in pred]
    return true_tags, pred_tags


true_ner, pred_ner = decode(y_pred_ner, y_ner, idx2tag_ner)
true_srl, pred_srl = decode(y_pred_srl, y_srl, idx2tag_srl)

print("\n📊 [NER] Classification Report:")
print(classification_report(true_ner, pred_ner))

print("\n📊 [SRL] Classification Report:")
print(classification_report(true_srl, pred_srl))
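For completeness, a minimal inference sketch built on the artifacts this script saves. The model path, pickle names, and the need to reuse the training-time maxlen are taken from the code above; tag_sentence itself is a hypothetical helper, not part of the commit:

import pickle
import numpy as np
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

# Load the model and lookup tables saved by the training script.
model = load_model("NER_SRL/multi_task_bilstm_model.keras")
with open("NER_SRL/word2idx.pkl", "rb") as f:
    word2idx = pickle.load(f)
with open("NER_SRL/tag2idx_ner.pkl", "rb") as f:
    tag2idx_ner = pickle.load(f)
with open("NER_SRL/tag2idx_srl.pkl", "rb") as f:
    tag2idx_srl = pickle.load(f)

idx2tag_ner = {i: t for t, i in tag2idx_ner.items()}
idx2tag_srl = {i: t for t, i in tag2idx_srl.items()}

def tag_sentence(sentence, maxlen):
    # maxlen must equal the padding length used at training time.
    tokens = sentence.lower().split()
    ids = pad_sequences(
        [[word2idx.get(w, word2idx["UNK"]) for w in tokens]],
        maxlen=maxlen, padding="post", value=word2idx["PAD"],
    )
    p_ner, p_srl = model.predict(ids, verbose=0)
    # Argmax over the tag dimension, trimmed back to the real token count.
    ner = [idx2tag_ner[int(np.argmax(v))] for v in p_ner[0][: len(tokens)]]
    srl = [idx2tag_srl[int(np.argmax(v))] for v in p_srl[0][: len(tokens)]]
    return list(zip(tokens, ner, srl))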
BIN tokenizer.pkl (binary file not shown)