{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "94d3889b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2025-05-10 14:49:40.993078: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", "2025-05-10 14:49:40.996369: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n", "2025-05-10 14:49:41.002001: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n", "2025-05-10 14:49:41.015917: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", "E0000 00:00:1746863381.035097 166971 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", "E0000 00:00:1746863381.038978 166971 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", "W0000 00:00:1746863381.049265 166971 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n", "W0000 00:00:1746863381.049288 166971 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n", "W0000 00:00:1746863381.049289 166971 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n", "W0000 00:00:1746863381.049290 166971 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n", "2025-05-10 14:49:41.052642: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", "To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" ] } ], "source": [ "# -------------------------------------------------\n", "# 0. Import & Konfigurasi\n", "# -------------------------------------------------\n", "import json, pickle\n", "import numpy as np\n", "from pathlib import Path\n", "from collections import Counter\n", "from sklearn.model_selection import train_test_split\n", "\n", "import tensorflow as tf\n", "from tensorflow.keras.preprocessing.text import Tokenizer\n", "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", "from tensorflow.keras.utils import to_categorical\n", "from tensorflow.keras.layers import (\n", " Input, Embedding, LSTM, Bidirectional, Dense, Concatenate,\n", " TimeDistributed\n", ")\n", "from tensorflow.keras.models import Model\n", "from tensorflow.keras.callbacks import EarlyStopping\n", "\n", "PAD_TOKEN = \"\"\n", "UNK_TOKEN = \"UNK\"\n", "START_TOKEN = \"\"\n", "END_TOKEN = \"\"\n", "MAXLEN_SRC = 100 # Panjang paragraf maksimal\n", "MAXLEN_TGT = 40 # Panjang pertanyaan/jawaban maksimal\n", "BATCH = 32\n", "EPOCHS = 30" ] }, { "cell_type": "code", "execution_count": 2, "id": "b528b34e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Valid 325 / 325 (invalid index: [])\n" ] } ], "source": [ "raw = json.loads(Path(\"normalize_dataset.json\").read_text(encoding=\"utf-8\"))\n", "\n", "req = {\"tokens\",\"ner\",\"srl\",\"question\",\"answer\",\"type\"}\n", "valid, bad = [], []\n", "for i,item in enumerate(raw):\n", " if (isinstance(item,dict) and not (req-item.keys())\n", " and all(isinstance(item[k],list) for k in req-{\"type\"})\n", " and isinstance(item[\"type\"],str)):\n", " valid.append(item)\n", " else:\n", " bad.append(i)\n", "\n", "print(f\"Valid {len(valid)} / {len(raw)} (invalid index: {bad[:10]})\")" ] }, { "cell_type": "code", "execution_count": 3, "id": "b18e4617", "metadata": {}, "outputs": [], "source": [ "for ex in valid:\n", " ex[\"question_in\"] = [START_TOKEN] + ex[\"question\"]\n", " ex[\"question_out\"] = ex[\"question\"] + [END_TOKEN]\n", "\n", " ex[\"answer_in\"] = [START_TOKEN] + ex[\"answer\"]\n", " ex[\"answer_out\"] = ex[\"answer\"] + [END_TOKEN]" ] }, { "cell_type": "code", "execution_count": 4, "id": "faa30b82", "metadata": {}, "outputs": [], "source": [ "tok_token = Tokenizer(oov_token=UNK_TOKEN, filters=\"\")\n", "tok_ner = Tokenizer(lower=False, filters=\"\")\n", "tok_srl = Tokenizer(lower=False, filters=\"\")\n", "tok_q = Tokenizer(oov_token=UNK_TOKEN, filters=\"\")\n", "tok_a = Tokenizer(oov_token=UNK_TOKEN, filters=\"\")\n", "tok_type = Tokenizer(lower=False, filters=\"\")\n", "\n", "tok_token.fit_on_texts([ex[\"tokens\"] for ex in valid])\n", "tok_ner.fit_on_texts([ex[\"ner\"] for ex in valid])\n", "tok_srl.fit_on_texts([ex[\"srl\"] for ex in valid])\n", "tok_q.fit_on_texts([ex[\"question_in\"]+ex[\"question_out\"] for ex in valid])\n", "tok_a.fit_on_texts([ex[\"answer_in\"]+ex[\"answer_out\"] for ex in valid])\n", "tok_type.fit_on_texts([ex[\"type\"] for ex in valid])\n", "\n", "# +1 utk padding\n", "vocab_token = len(tok_token.word_index)+1\n", "vocab_ner = len(tok_ner.word_index)+1\n", "vocab_srl = len(tok_srl.word_index)+1\n", "vocab_q = len(tok_q.word_index)+1\n", "vocab_a = len(tok_a.word_index)+1\n", "vocab_type = len(tok_type.word_index)" ] }, { "cell_type": "code", "execution_count": 5, "id": "c83ce734", "metadata": {}, "outputs": [], "source": [ "def seqs(field, tok, maxlen):\n", " return pad_sequences(\n", " tok.texts_to_sequences([ex[field] for ex in valid]),\n", " maxlen=maxlen, padding=\"post\"\n", " )\n", "\n", "X_tok = seqs(\"tokens\", tok_token, MAXLEN_SRC)\n", "X_ner = seqs(\"ner\", tok_ner, MAXLEN_SRC)\n", "X_srl = seqs(\"srl\", tok_srl, MAXLEN_SRC)\n", "\n", "Q_in = seqs(\"question_in\", tok_q, MAXLEN_TGT)\n", "Q_out = seqs(\"question_out\", tok_q, MAXLEN_TGT)\n", "A_in = seqs(\"answer_in\", tok_a, MAXLEN_TGT)\n", "A_out = seqs(\"answer_out\", tok_a, MAXLEN_TGT)\n", "\n", "y_type = to_categorical(\n", " np.array([seq[0]-1 for seq in tok_type.texts_to_sequences([ex[\"type\"] for ex in valid])]),\n", " num_classes=vocab_type\n", ")\n", "\n", "# Expand dims → (batch, seq, 1) agar cocok dgn sparse_cce\n", "Q_out = np.expand_dims(Q_out, -1)\n", "A_out = np.expand_dims(A_out, -1)" ] }, { "cell_type": "code", "execution_count": 6, "id": "ad3fe7f2", "metadata": {}, "outputs": [], "source": [ "(X_tok_tr, X_tok_te,\n", " X_ner_tr, X_ner_te,\n", " X_srl_tr, X_srl_te,\n", " Q_in_tr, Q_in_te,\n", " Q_out_tr, Q_out_te,\n", " A_in_tr, A_in_te,\n", " A_out_tr, A_out_te,\n", " y_type_tr,y_type_te) = train_test_split(\n", " X_tok, X_ner, X_srl, Q_in, Q_out, A_in, A_out, y_type,\n", " test_size=0.2, random_state=42\n", " )\n", " " ] }, { "cell_type": "code", "execution_count": 7, "id": "f20abfb5", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2025-05-10 14:49:43.127764: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)\n" ] }, { "ename": "ValueError", "evalue": "too many values to unpack (expected 3)", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[7], line 10\u001b[0m\n\u001b[1;32m 7\u001b[0m emb_srl \u001b[38;5;241m=\u001b[39m Embedding(vocab_srl, \u001b[38;5;241m16\u001b[39m, mask_zero\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)(enc_srl)\n\u001b[1;32m 9\u001b[0m enc_cat \u001b[38;5;241m=\u001b[39m Concatenate()([emb_tok, emb_ner, emb_srl])\n\u001b[0;32m---> 10\u001b[0m enc_out, state_h, state_c \u001b[38;5;241m=\u001b[39m Bidirectional(\n\u001b[1;32m 11\u001b[0m LSTM(\u001b[38;5;241m256\u001b[39m, return_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, return_sequences\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 12\u001b[0m )(enc_cat)\n\u001b[1;32m 14\u001b[0m \u001b[38;5;66;03m# ---------- Klasifikasi tipe ----------\u001b[39;00m\n\u001b[1;32m 15\u001b[0m type_out \u001b[38;5;241m=\u001b[39m Dense(vocab_type, activation\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msoftmax\u001b[39m\u001b[38;5;124m\"\u001b[39m, name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtype_output\u001b[39m\u001b[38;5;124m\"\u001b[39m)(enc_out)\n", "\u001b[0;31mValueError\u001b[0m: too many values to unpack (expected 3)" ] } ], "source": [ "enc_tok = Input(shape=(None,), name=\"enc_tok\")\n", "enc_ner = Input(shape=(None,), name=\"enc_ner\")\n", "enc_srl = Input(shape=(None,), name=\"enc_srl\")\n", "\n", "emb_tok = Embedding(vocab_token, 128, mask_zero=True)(enc_tok)\n", "emb_ner = Embedding(vocab_ner, 16, mask_zero=True)(enc_ner)\n", "emb_srl = Embedding(vocab_srl, 16, mask_zero=True)(enc_srl)\n", "\n", "enc_cat = Concatenate()([emb_tok, emb_ner, emb_srl])\n", "enc_out, state_h, state_c = Bidirectional(\n", " LSTM(256, return_state=True, return_sequences=False)\n", ")(enc_cat)\n", "\n", "# ---------- Klasifikasi tipe ----------\n", "type_out = Dense(vocab_type, activation=\"softmax\", name=\"type_output\")(enc_out)\n", "\n", "# ---------- Decoder QUESTION ----------\n", "dec_q_in = Input(shape=(None,), name=\"dec_q_in\")\n", "dec_q_emb = Embedding(vocab_q, 128, mask_zero=True)(dec_q_in)\n", "dec_q_lstm = LSTM(256, return_sequences=True)\n", "dec_q_out = dec_q_lstm(dec_q_emb, initial_state=[state_h, state_c])\n", "q_out = TimeDistributed(Dense(vocab_q, activation=\"softmax\"), name=\"question_output\")(dec_q_out)\n", "\n", "# ---------- Decoder ANSWER ----------\n", "dec_a_in = Input(shape=(None,), name=\"dec_a_in\")\n", "dec_a_emb = Embedding(vocab_a, 128, mask_zero=True)(dec_a_in)\n", "dec_a_lstm = LSTM(256, return_sequences=True)\n", "dec_a_out = dec_a_lstm(dec_a_emb, initial_state=[state_h, state_c])\n", "a_out = TimeDistributed(Dense(vocab_a, activation=\"softmax\"), name=\"answer_output\")(dec_a_out)\n", "\n", "# ---------- Build & compile ----------\n", "model = Model(\n", " inputs=[enc_tok, enc_ner, enc_srl, dec_q_in, dec_a_in],\n", " outputs=[q_out, a_out, type_out]\n", ")\n", "\n", "model.compile(\n", " optimizer=\"adam\",\n", " loss={\n", " \"question_output\": \"sparse_categorical_crossentropy\",\n", " \"answer_output\" : \"sparse_categorical_crossentropy\",\n", " \"type_output\" : \"categorical_crossentropy\"\n", " },\n", " loss_weights={\n", " \"question_output\": 1.0,\n", " \"answer_output\" : 1.0,\n", " \"type_output\" : 0.3\n", " },\n", " metrics={\n", " \"question_output\": \"accuracy\",\n", " \"answer_output\" : \"accuracy\",\n", " \"type_output\" : \"accuracy\"\n", " }\n", ")\n", "\n", "model.summary()" ] }, { "cell_type": "code", "execution_count": null, "id": "c348406e", "metadata": {}, "outputs": [], "source": [ "early = EarlyStopping(patience=3, restore_best_weights=True)\n", "\n", "model.fit(\n", " [X_tok_tr, X_ner_tr, X_srl_tr, Q_in_tr, A_in_tr],\n", " {\"question_output\": Q_out_tr,\n", " \"answer_output\" : A_out_tr,\n", " \"type_output\" : y_type_tr},\n", " batch_size=BATCH,\n", " epochs=EPOCHS,\n", " validation_split=0.1,\n", " callbacks=[early]\n", ")\n", "\n", "# -------------------------------------------------\n", "# 8. Simpan model & tokenizer\n", "# -------------------------------------------------\n", "model.save(\"qg_multitask.keras\")\n", "with open(\"tokenizers.pkl\", \"wb\") as f:\n", " pickle.dump({\n", " \"token\": tok_token,\n", " \"ner\" : tok_ner,\n", " \"srl\" : tok_srl,\n", " \"q\" : tok_q,\n", " \"a\" : tok_a,\n", " \"type\" : tok_type\n", " }, f)" ] } ], "metadata": { "kernelspec": { "display_name": "myenv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" } }, "nbformat": 4, "nbformat_minor": 5 }