TIF_E41211115_lstm-quiz-gen.../old/QC/model_tr.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "94d3889b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025-05-10 14:49:40.993078: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
      "2025-05-10 14:49:40.996369: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
      "2025-05-10 14:49:41.002001: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
      "2025-05-10 14:49:41.015917: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
      "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
      "E0000 00:00:1746863381.035097  166971 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
      "E0000 00:00:1746863381.038978  166971 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
      "W0000 00:00:1746863381.049265  166971 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
      "W0000 00:00:1746863381.049288  166971 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
      "W0000 00:00:1746863381.049289  166971 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
      "W0000 00:00:1746863381.049290  166971 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
      "2025-05-10 14:49:41.052642: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
     ]
    }
   ],
   "source": [
    "# -------------------------------------------------\n",
    "# 0.  Import & Konfigurasi\n",
    "# -------------------------------------------------\n",
    "import json, pickle\n",
    "import numpy as np\n",
    "from pathlib import Path\n",
    "from collections import Counter\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "import tensorflow as tf\n",
    "from tensorflow.keras.preprocessing.text import Tokenizer\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "from tensorflow.keras.utils import to_categorical\n",
    "from tensorflow.keras.layers import (\n",
    "    Input, Embedding, LSTM, Bidirectional, Dense, Concatenate,\n",
    "    TimeDistributed\n",
    ")\n",
    "from tensorflow.keras.models import Model\n",
    "from tensorflow.keras.callbacks import EarlyStopping\n",
    "\n",
    "PAD_TOKEN   = \"<PAD>\"\n",
    "UNK_TOKEN   = \"UNK\"\n",
    "START_TOKEN = \"<START>\"\n",
    "END_TOKEN   = \"<END>\"\n",
    "MAXLEN_SRC  = 100      # Panjang paragraf maksimal\n",
    "MAXLEN_TGT  = 40       # Panjang pertanyaan/jawaban maksimal\n",
    "BATCH       = 32\n",
    "EPOCHS      = 30"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b528b34e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Valid 325 / 325 (invalid index: [])\n"
     ]
    }
   ],
   "source": [
    "raw = json.loads(Path(\"normalize_dataset.json\").read_text(encoding=\"utf-8\"))\n",
    "\n",
    "# Keys every record must provide; all of them except \"type\" must hold lists.\n",
    "req = {\"tokens\", \"ner\", \"srl\", \"question\", \"answer\", \"type\"}\n",
    "list_fields = req - {\"type\"}\n",
    "\n",
    "valid, bad = [], []\n",
    "for idx, record in enumerate(raw):\n",
    "    well_formed = (\n",
    "        isinstance(record, dict)\n",
    "        and req.issubset(record.keys())\n",
    "        and all(isinstance(record[name], list) for name in list_fields)\n",
    "        and isinstance(record[\"type\"], str)\n",
    "    )\n",
    "    if well_formed:\n",
    "        valid.append(record)\n",
    "    else:\n",
    "        bad.append(idx)  # keep the position of the rejected record\n",
    "\n",
    "print(f\"Valid {len(valid)} / {len(raw)} (invalid index: {bad[:10]})\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b18e4617",
   "metadata": {},
   "outputs": [],
   "source": [
    "# For each example, derive the START-prefixed (*_in) and END-suffixed (*_out)\n",
    "# token lists of the question and the answer.\n",
    "for sample in valid:\n",
    "    question, answer = sample[\"question\"], sample[\"answer\"]\n",
    "    sample.update({\n",
    "        \"question_in\" : [START_TOKEN] + question,\n",
    "        \"question_out\": question + [END_TOKEN],\n",
    "        \"answer_in\"   : [START_TOKEN] + answer,\n",
    "        \"answer_out\"  : answer + [END_TOKEN],\n",
    "    })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "faa30b82",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Word-level fields: unknown words map to UNK_TOKEN.\n",
    "tok_token, tok_q, tok_a = (\n",
    "    Tokenizer(oov_token=UNK_TOKEN, filters=\"\") for _ in range(3)\n",
    ")\n",
    "# Label-like fields: casing is preserved and no OOV token is configured.\n",
    "tok_ner, tok_srl, tok_type = (\n",
    "    Tokenizer(lower=False, filters=\"\") for _ in range(3)\n",
    ")\n",
    "\n",
    "# Fit each tokenizer on its own field of the validated examples.\n",
    "for tok, field in (\n",
    "    (tok_token, \"tokens\"),\n",
    "    (tok_ner, \"ner\"),\n",
    "    (tok_srl, \"srl\"),\n",
    "    (tok_type, \"type\"),\n",
    "):\n",
    "    tok.fit_on_texts([ex[field] for ex in valid])\n",
    "\n",
    "# The question/answer vocabularies must cover both the *_in and *_out variants.\n",
    "tok_q.fit_on_texts([ex[\"question_in\"] + ex[\"question_out\"] for ex in valid])\n",
    "tok_a.fit_on_texts([ex[\"answer_in\"] + ex[\"answer_out\"] for ex in valid])\n",
    "\n",
    "# +1 for padding on every sequence vocabulary.\n",
    "vocab_token, vocab_ner, vocab_srl, vocab_q, vocab_a = (\n",
    "    len(t.word_index) + 1\n",
    "    for t in (tok_token, tok_ner, tok_srl, tok_q, tok_a)\n",
    ")\n",
    "# The type vocabulary is used as a class count, so no padding slot is added.\n",
    "vocab_type = len(tok_type.word_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "c83ce734",
   "metadata": {},
   "outputs": [],
   "source": [
    "def seqs(field, tok, maxlen):\n",
    "    \"\"\"Encode one field of every example in `valid` and pad it.\n",
    "\n",
    "    Args:\n",
    "        field: key to read from each example.\n",
    "        tok: fitted tokenizer used for `texts_to_sequences`.\n",
    "        maxlen: length passed to `pad_sequences`.\n",
    "\n",
    "    Returns:\n",
    "        The result of `pad_sequences` with `padding=\"post\"`.\n",
    "    \"\"\"\n",
    "    field_values = [ex[field] for ex in valid]\n",
    "    encoded = tok.texts_to_sequences(field_values)\n",
    "    return pad_sequences(encoded, maxlen=maxlen, padding=\"post\")\n",
    "\n",
    "# Encoder inputs, padded to the source length.\n",
    "X_tok, X_ner, X_srl = (\n",
    "    seqs(name, tok, MAXLEN_SRC)\n",
    "    for name, tok in ((\"tokens\", tok_token), (\"ner\", tok_ner), (\"srl\", tok_srl))\n",
    ")\n",
    "\n",
    "# Decoder inputs, padded to the target length.\n",
    "Q_in = seqs(\"question_in\", tok_q, MAXLEN_TGT)\n",
    "A_in = seqs(\"answer_in\", tok_a, MAXLEN_TGT)\n",
    "\n",
    "# Decoder targets get a trailing axis -> (batch, seq, 1) to suit sparse_cce.\n",
    "Q_out = seqs(\"question_out\", tok_q, MAXLEN_TGT)[..., np.newaxis]\n",
    "A_out = seqs(\"answer_out\", tok_a, MAXLEN_TGT)[..., np.newaxis]\n",
    "\n",
    "# One-hot type labels: the first id of each encoded type, shifted down by one.\n",
    "type_ids = np.array(\n",
    "    [ids[0] - 1 for ids in tok_type.texts_to_sequences([ex[\"type\"] for ex in valid])]\n",
    ")\n",
    "y_type = to_categorical(type_ids, num_classes=vocab_type)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "ad3fe7f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split every array with the same 80/20 partition; the call is given a fixed\n",
    "# random_state and its results are unpacked as (train, test) per input array.\n",
    "_split_inputs = (X_tok, X_ner, X_srl, Q_in, Q_out, A_in, A_out, y_type)\n",
    "\n",
    "(\n",
    "    X_tok_tr, X_tok_te,\n",
    "    X_ner_tr, X_ner_te,\n",
    "    X_srl_tr, X_srl_te,\n",
    "    Q_in_tr, Q_in_te,\n",
    "    Q_out_tr, Q_out_te,\n",
    "    A_in_tr, A_in_te,\n",
    "    A_out_tr, A_out_te,\n",
    "    y_type_tr, y_type_te,\n",
    ") = train_test_split(*_split_inputs, test_size=0.2, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "f20abfb5",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025-05-10 14:49:43.127764: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "too many values to unpack (expected 3)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[7], line 10\u001b[0m\n\u001b[1;32m      7\u001b[0m emb_srl \u001b[38;5;241m=\u001b[39m Embedding(vocab_srl,   \u001b[38;5;241m16\u001b[39m,  mask_zero\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)(enc_srl)\n\u001b[1;32m      9\u001b[0m enc_cat \u001b[38;5;241m=\u001b[39m Concatenate()([emb_tok, emb_ner, emb_srl])\n\u001b[0;32m---> 10\u001b[0m enc_out, state_h, state_c \u001b[38;5;241m=\u001b[39m Bidirectional(\n\u001b[1;32m     11\u001b[0m     LSTM(\u001b[38;5;241m256\u001b[39m, return_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, return_sequences\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m     12\u001b[0m )(enc_cat)\n\u001b[1;32m     14\u001b[0m \u001b[38;5;66;03m# ---------- Klasifikasi tipe ----------\u001b[39;00m\n\u001b[1;32m     15\u001b[0m type_out \u001b[38;5;241m=\u001b[39m Dense(vocab_type, activation\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msoftmax\u001b[39m\u001b[38;5;124m\"\u001b[39m, name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtype_output\u001b[39m\u001b[38;5;124m\"\u001b[39m)(enc_out)\n",
      "\u001b[0;31mValueError\u001b[0m: too many values to unpack (expected 3)"
     ]
    }
   ],
   "source": [
    "ENC_UNITS = 256            # LSTM units per encoder direction\n",
    "DEC_UNITS = 2 * ENC_UNITS  # decoders start from the concatenated fwd+bwd state\n",
    "\n",
    "# ---------- Encoder ----------\n",
    "enc_tok = Input(shape=(None,), name=\"enc_tok\")\n",
    "enc_ner = Input(shape=(None,), name=\"enc_ner\")\n",
    "enc_srl = Input(shape=(None,), name=\"enc_srl\")\n",
    "\n",
    "emb_tok = Embedding(vocab_token, 128, mask_zero=True)(enc_tok)\n",
    "emb_ner = Embedding(vocab_ner,   16,  mask_zero=True)(enc_ner)\n",
    "emb_srl = Embedding(vocab_srl,   16,  mask_zero=True)(enc_srl)\n",
    "\n",
    "enc_cat = Concatenate()([emb_tok, emb_ner, emb_srl])\n",
    "\n",
    "# With return_state=True a Bidirectional LSTM returns FIVE tensors:\n",
    "# output, forward h, forward c, backward h, backward c.\n",
    "# Unpacking them into three names raised\n",
    "# \"ValueError: too many values to unpack (expected 3)\".\n",
    "enc_out, fwd_h, fwd_c, bwd_h, bwd_c = Bidirectional(\n",
    "    LSTM(ENC_UNITS, return_state=True, return_sequences=False)\n",
    ")(enc_cat)\n",
    "\n",
    "# Merge both directions into one hidden/cell state of size DEC_UNITS.\n",
    "state_h = Concatenate()([fwd_h, bwd_h])\n",
    "state_c = Concatenate()([fwd_c, bwd_c])\n",
    "\n",
    "# ---------- Type classification ----------\n",
    "type_out = Dense(vocab_type, activation=\"softmax\", name=\"type_output\")(enc_out)\n",
    "\n",
    "# ---------- Decoder QUESTION ----------\n",
    "dec_q_in = Input(shape=(None,), name=\"dec_q_in\")\n",
    "dec_q_emb = Embedding(vocab_q, 128, mask_zero=True)(dec_q_in)\n",
    "# The decoder width must equal the size of the concatenated encoder state.\n",
    "dec_q_lstm = LSTM(DEC_UNITS, return_sequences=True)\n",
    "dec_q_out = dec_q_lstm(dec_q_emb, initial_state=[state_h, state_c])\n",
    "q_out = TimeDistributed(Dense(vocab_q, activation=\"softmax\"), name=\"question_output\")(dec_q_out)\n",
    "\n",
    "# ---------- Decoder ANSWER ----------\n",
    "dec_a_in = Input(shape=(None,), name=\"dec_a_in\")\n",
    "dec_a_emb = Embedding(vocab_a, 128, mask_zero=True)(dec_a_in)\n",
    "dec_a_lstm = LSTM(DEC_UNITS, return_sequences=True)\n",
    "dec_a_out = dec_a_lstm(dec_a_emb, initial_state=[state_h, state_c])\n",
    "a_out = TimeDistributed(Dense(vocab_a, activation=\"softmax\"), name=\"answer_output\")(dec_a_out)\n",
    "\n",
    "# ---------- Build & compile ----------\n",
    "model = Model(\n",
    "    inputs=[enc_tok, enc_ner, enc_srl, dec_q_in, dec_a_in],\n",
    "    outputs=[q_out, a_out, type_out]\n",
    ")\n",
    "\n",
    "model.compile(\n",
    "    optimizer=\"adam\",\n",
    "    loss={\n",
    "        \"question_output\": \"sparse_categorical_crossentropy\",\n",
    "        \"answer_output\"  : \"sparse_categorical_crossentropy\",\n",
    "        \"type_output\"    : \"categorical_crossentropy\"\n",
    "    },\n",
    "    # The type head is weighted lower than the two generation heads.\n",
    "    loss_weights={\n",
    "        \"question_output\": 1.0,\n",
    "        \"answer_output\"  : 1.0,\n",
    "        \"type_output\"    : 0.3\n",
    "    },\n",
    "    metrics={\n",
    "        \"question_output\": \"accuracy\",\n",
    "        \"answer_output\"  : \"accuracy\",\n",
    "        \"type_output\"    : \"accuracy\"\n",
    "    }\n",
    ")\n",
    "\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c348406e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Early stopping with a patience of 3 epochs, restoring the best weights.\n",
    "early = EarlyStopping(patience=3, restore_best_weights=True)\n",
    "\n",
    "train_inputs = [X_tok_tr, X_ner_tr, X_srl_tr, Q_in_tr, A_in_tr]\n",
    "train_targets = {\n",
    "    \"question_output\": Q_out_tr,\n",
    "    \"answer_output\"  : A_out_tr,\n",
    "    \"type_output\"    : y_type_tr,\n",
    "}\n",
    "\n",
    "model.fit(\n",
    "    train_inputs,\n",
    "    train_targets,\n",
    "    batch_size=BATCH,\n",
    "    epochs=EPOCHS,\n",
    "    validation_split=0.1,\n",
    "    callbacks=[early],\n",
    ")\n",
    "\n",
    "# -------------------------------------------------\n",
    "# 8.  Save model & tokenizers\n",
    "# -------------------------------------------------\n",
    "model.save(\"qg_multitask.keras\")\n",
    "\n",
    "fitted_tokenizers = {\n",
    "    \"token\": tok_token,\n",
    "    \"ner\"  : tok_ner,\n",
    "    \"srl\"  : tok_srl,\n",
    "    \"q\"    : tok_q,\n",
    "    \"a\"    : tok_a,\n",
    "    \"type\" : tok_type,\n",
    "}\n",
    "with open(\"tokenizers.pkl\", \"wb\") as fh:\n",
    "    pickle.dump(fitted_tokenizers, fh)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "myenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}