TIF_E41211115_lstm-quiz-gen.../old/QC/qg_v2_train.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0a2880d7",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025-04-23 14:22:17.809700: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
      "2025-04-23 14:22:17.810231: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
      "2025-04-23 14:22:17.812492: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
      "2025-04-23 14:22:17.818482: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
      "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
      "E0000 00:00:1745392937.829027   39341 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
      "E0000 00:00:1745392937.832239   39341 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
      "W0000 00:00:1745392937.840149   39341 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
      "W0000 00:00:1745392937.840163   39341 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
      "W0000 00:00:1745392937.840164   39341 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
      "W0000 00:00:1745392937.840165   39341 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
      "2025-04-23 14:22:17.843058: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import numpy as np\n",
    "from pathlib import Path\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from tensorflow.keras.preprocessing.text import Tokenizer\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "from tensorflow.keras.models import Model\n",
    "from tensorflow.keras.layers import Input, Embedding, LSTM, Concatenate, TimeDistributed, Dense\n",
    "from tensorflow.keras.callbacks import EarlyStopping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "bd82907a",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"dataset_qc.json\", encoding=\"utf-8\") as f:\n",
    "    raw_data = json.load(f)\n",
    "\n",
    "tokens = [[t.lower().strip() for t in item[\"tokens\"]] for item in raw_data]\n",
    "ner_tags = [item[\"ner\"] for item in raw_data]\n",
    "srl_tags = [item[\"srl\"] for item in raw_data]\n",
    "questions = [item[\"question\"].lower().strip() for item in raw_data]\n",
    "answers = [item[\"answer\"].lower().strip() for item in raw_data]\n",
    "types = [item[\"type\"] for item in raw_data]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "946713ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "token_tokenizer = Tokenizer(lower=False, oov_token=\"<OOV>\")\n",
    "token_tokenizer.fit_on_texts(tokens)\n",
    "token_sequences = token_tokenizer.texts_to_sequences(tokens)\n",
    "\n",
    "ner_encoder = LabelEncoder()\n",
    "srl_encoder = LabelEncoder()\n",
    "\n",
    "flat_ner = [tag for seq in ner_tags for tag in seq]\n",
    "flat_srl = [tag for seq in srl_tags for tag in seq]\n",
    "\n",
    "ner_encoder.fit(flat_ner)\n",
    "srl_encoder.fit(flat_srl)\n",
    "\n",
    "ner_sequences = [ner_encoder.transform(seq).tolist() for seq in ner_tags]\n",
    "srl_sequences = [srl_encoder.transform(seq).tolist() for seq in srl_tags]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "aff6e7aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "MAX_LEN = max(len(seq) for seq in token_sequences)\n",
    "\n",
    "token_padded = pad_sequences(token_sequences, maxlen=MAX_LEN, padding='post')\n",
    "ner_padded = pad_sequences(ner_sequences, maxlen=MAX_LEN, padding='post')\n",
    "srl_padded = pad_sequences(srl_sequences, maxlen=MAX_LEN, padding='post')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "ea2ab113",
   "metadata": {},
   "outputs": [],
   "source": [
    "qa_tokenizer = Tokenizer(oov_token=\"<OOV>\")\n",
    "qa_tokenizer.fit_on_texts(questions + answers)\n",
    "\n",
    "question_sequences = qa_tokenizer.texts_to_sequences(questions)\n",
    "answer_sequences = qa_tokenizer.texts_to_sequences(answers)\n",
    "\n",
    "question_padded = pad_sequences(question_sequences, maxlen=MAX_LEN, padding='post')\n",
    "answer_padded = pad_sequences(answer_sequences, maxlen=MAX_LEN, padding='post')\n",
    "\n",
    "\n",
    "type_encoder = LabelEncoder()\n",
    "type_labels = type_encoder.fit_transform(types)  # bentuk 1D array\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "e2becb56",
   "metadata": {},
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "'Tokenizer' object has no attribute 'shape'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[6], line 10\u001b[0m\n\u001b[1;32m      5\u001b[0m y_answer \u001b[38;5;241m=\u001b[39m answer_padded\n\u001b[1;32m      6\u001b[0m y_type \u001b[38;5;241m=\u001b[39m type_labels\n\u001b[0;32m---> 10\u001b[0m MAX_LEN \u001b[38;5;241m=\u001b[39m \u001b[43mX_token\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshape\u001b[49m[\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m     12\u001b[0m \u001b[38;5;66;03m# ======================\u001b[39;00m\n\u001b[1;32m     13\u001b[0m \u001b[38;5;66;03m# 2. Parameter\u001b[39;00m\n\u001b[1;32m     14\u001b[0m \u001b[38;5;66;03m# ======================\u001b[39;00m\n\u001b[1;32m     15\u001b[0m VOCAB_TOKEN \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mmax(X_token) \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m\n",
      "\u001b[0;31mAttributeError\u001b[0m: 'Tokenizer' object has no attribute 'shape'"
     ]
    }
   ],
   "source": [
    "X_token = token_tokenizer\n",
    "X_ner = ner_encoder\n",
    "X_srl = srl_encoder\n",
    "y_question = qa_tokenizer\n",
    "y_answer = answer_padded\n",
    "y_type = type_labels\n",
    "\n",
    "\n",
    "\n",
    "MAX_LEN = X_token.shape[1]\n",
    "\n",
    "# ======================\n",
    "# 2. Parameter\n",
    "# ======================\n",
    "VOCAB_TOKEN = np.max(X_token) + 1\n",
    "VOCAB_NER = np.max(X_ner) + 1\n",
    "VOCAB_SRL = np.max(X_srl) + 1\n",
    "VOCAB_QA = max(np.max(y_question), np.max(y_answer)) + 1\n",
    "NUM_TYPES = len(np.unique(y_type))\n",
    "\n",
    "EMB_TOKEN = 128\n",
    "EMB_TAG = 16\n",
    "LSTM_UNITS = 256"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "162a155a",
   "metadata": {},
   "outputs": [],
   "source": [
    "input_token = Input(shape=(MAX_LEN,), name=\"token_input\")\n",
    "input_ner = Input(shape=(MAX_LEN,), name=\"ner_input\")\n",
    "input_srl = Input(shape=(MAX_LEN,), name=\"srl_input\")\n",
    "\n",
    "# ======================\n",
    "# 4. Embedding\n",
    "# ======================\n",
    "embed_token = Embedding(input_dim=VOCAB_TOKEN, output_dim=EMB_TOKEN)(input_token)\n",
    "embed_ner = Embedding(input_dim=VOCAB_NER, output_dim=EMB_TAG)(input_ner)\n",
    "embed_srl = Embedding(input_dim=VOCAB_SRL, output_dim=EMB_TAG)(input_srl)\n",
    "\n",
    "# Gabung semua embedding\n",
    "merged = Concatenate()([embed_token, embed_ner, embed_srl])\n",
    "\n",
    "# ======================\n",
    "# 5. LSTM\n",
    "# ======================\n",
    "lstm_out = LSTM(LSTM_UNITS, return_sequences=True)(merged)\n",
    "\n",
    "# Output: Question\n",
    "question_out = TimeDistributed(Dense(VOCAB_QA, activation='softmax'), name=\"question_output\")(lstm_out)\n",
    "\n",
    "# Output: Answer\n",
    "answer_out = TimeDistributed(Dense(VOCAB_QA, activation='softmax'), name=\"answer_output\")(lstm_out)\n",
    "\n",
    "# Output: Type (klasifikasi)\n",
    "type_repr = LSTM(LSTM_UNITS)(merged)  # pakai output dari awal sebelum LSTM pertama\n",
    "type_out = Dense(NUM_TYPES, activation='softmax', name=\"type_output\")(type_repr)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7cccf561",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = Model(inputs=[input_token, input_ner, input_srl],\n",
    "              outputs=[question_out, answer_out, type_out])\n",
    "\n",
    "model.compile(\n",
    "    optimizer='adam',\n",
    "    loss={\n",
    "        \"question_output\": \"sparse_categorical_crossentropy\",\n",
    "        \"answer_output\": \"sparse_categorical_crossentropy\",\n",
    "        \"type_output\": \"sparse_categorical_crossentropy\",\n",
    "    },\n",
    "    metrics={\n",
    "        \"question_output\": \"accuracy\",\n",
    "        \"answer_output\": \"accuracy\",\n",
    "        \"type_output\": \"accuracy\",\n",
    "    }\n",
    ")\n",
    "\n",
    "# ======================\n",
    "# 7. Training\n",
    "# ======================\n",
    "y_question = np.expand_dims(y_question, -1)  # untuk sparse categorical loss\n",
    "y_answer = np.expand_dims(y_answer, -1)\n",
    "\n",
    "earlystop = EarlyStopping(patience=4, restore_best_weights=True)\n",
    "\n",
    "model.fit(\n",
    "    [X_token, X_ner, X_srl],\n",
    "    [y_question, y_answer, y_type],\n",
    "    batch_size=32,\n",
    "    epochs=30,\n",
    "    validation_split=0.1,\n",
    "    callbacks=[earlystop]\n",
    ")\n",
    "\n",
    "# ======================\n",
    "# 8. Simpan Model\n",
    "# ======================\n",
    "model.save(\"model_lstm_qg.h5\")\n",
    "print(\"✅ Training selesai. Model disimpan.\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "myenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}