{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "0a2880d7", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2025-04-23 14:22:17.809700: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", "2025-04-23 14:22:17.810231: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n", "2025-04-23 14:22:17.812492: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n", "2025-04-23 14:22:17.818482: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", "E0000 00:00:1745392937.829027 39341 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", "E0000 00:00:1745392937.832239 39341 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", "W0000 00:00:1745392937.840149 39341 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n", "W0000 00:00:1745392937.840163 39341 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n", "W0000 00:00:1745392937.840164 39341 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n", "W0000 00:00:1745392937.840165 39341 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n", "2025-04-23 14:22:17.843058: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", "To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" ] } ], "source": [ "import json\n", "import numpy as np\n", "from pathlib import Path\n", "from sklearn.preprocessing import LabelEncoder\n", "from tensorflow.keras.preprocessing.text import Tokenizer\n", "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", "from tensorflow.keras.models import Model\n", "from tensorflow.keras.layers import Input, Embedding, LSTM, Concatenate, TimeDistributed, Dense\n", "from tensorflow.keras.callbacks import EarlyStopping" ] }, { "cell_type": "code", "execution_count": 2, "id": "bd82907a", "metadata": {}, "outputs": [], "source": [ "with open(\"dataset_qc.json\", encoding=\"utf-8\") as f:\n", " raw_data = json.load(f)\n", "\n", "tokens = [[t.lower().strip() for t in item[\"tokens\"]] for item in raw_data]\n", "ner_tags = [item[\"ner\"] for item in raw_data]\n", "srl_tags = [item[\"srl\"] for item in raw_data]\n", "questions = [item[\"question\"].lower().strip() for item in raw_data]\n", "answers = [item[\"answer\"].lower().strip() for item in raw_data]\n", "types = [item[\"type\"] for item in raw_data]" ] }, { "cell_type": "code", "execution_count": 3, "id": "946713ee", "metadata": {}, "outputs": [], "source": [ "token_tokenizer = Tokenizer(lower=False, oov_token=\"\")\n", "token_tokenizer.fit_on_texts(tokens)\n", "token_sequences = token_tokenizer.texts_to_sequences(tokens)\n", "\n", "ner_encoder = LabelEncoder()\n", "srl_encoder = LabelEncoder()\n", "\n", "flat_ner = [tag for seq in ner_tags for tag in seq]\n", "flat_srl = [tag for seq in srl_tags for tag in seq]\n", "\n", "ner_encoder.fit(flat_ner)\n", "srl_encoder.fit(flat_srl)\n", "\n", "ner_sequences = [ner_encoder.transform(seq).tolist() for seq in ner_tags]\n", "srl_sequences = [srl_encoder.transform(seq).tolist() for seq in srl_tags]" ] }, { "cell_type": "code", "execution_count": 4, "id": "aff6e7aa", "metadata": {}, "outputs": [], "source": [ "MAX_LEN = max(len(seq) for seq in token_sequences)\n", "\n", "token_padded = pad_sequences(token_sequences, maxlen=MAX_LEN, padding='post')\n", "ner_padded = pad_sequences(ner_sequences, maxlen=MAX_LEN, padding='post')\n", "srl_padded = pad_sequences(srl_sequences, maxlen=MAX_LEN, padding='post')" ] }, { "cell_type": "code", "execution_count": 5, "id": "ea2ab113", "metadata": {}, "outputs": [], "source": [ "qa_tokenizer = Tokenizer(oov_token=\"\")\n", "qa_tokenizer.fit_on_texts(questions + answers)\n", "\n", "question_sequences = qa_tokenizer.texts_to_sequences(questions)\n", "answer_sequences = qa_tokenizer.texts_to_sequences(answers)\n", "\n", "question_padded = pad_sequences(question_sequences, maxlen=MAX_LEN, padding='post')\n", "answer_padded = pad_sequences(answer_sequences, maxlen=MAX_LEN, padding='post')\n", "\n", "\n", "type_encoder = LabelEncoder()\n", "type_labels = type_encoder.fit_transform(types) # bentuk 1D array\n" ] }, { "cell_type": "code", "execution_count": 6, "id": "e2becb56", "metadata": {}, "outputs": [ { "ename": "AttributeError", "evalue": "'Tokenizer' object has no attribute 'shape'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[6], line 10\u001b[0m\n\u001b[1;32m 5\u001b[0m y_answer \u001b[38;5;241m=\u001b[39m answer_padded\n\u001b[1;32m 6\u001b[0m y_type \u001b[38;5;241m=\u001b[39m type_labels\n\u001b[0;32m---> 10\u001b[0m MAX_LEN \u001b[38;5;241m=\u001b[39m \u001b[43mX_token\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshape\u001b[49m[\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# ======================\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# 2. Parameter\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;66;03m# ======================\u001b[39;00m\n\u001b[1;32m 15\u001b[0m VOCAB_TOKEN \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mmax(X_token) \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m\n", "\u001b[0;31mAttributeError\u001b[0m: 'Tokenizer' object has no attribute 'shape'" ] } ], "source": [ "X_token = token_tokenizer\n", "X_ner = ner_encoder\n", "X_srl = srl_encoder\n", "y_question = qa_tokenizer\n", "y_answer = answer_padded\n", "y_type = type_labels\n", "\n", "\n", "\n", "MAX_LEN = X_token.shape[1]\n", "\n", "# ======================\n", "# 2. Parameter\n", "# ======================\n", "VOCAB_TOKEN = np.max(X_token) + 1\n", "VOCAB_NER = np.max(X_ner) + 1\n", "VOCAB_SRL = np.max(X_srl) + 1\n", "VOCAB_QA = max(np.max(y_question), np.max(y_answer)) + 1\n", "NUM_TYPES = len(np.unique(y_type))\n", "\n", "EMB_TOKEN = 128\n", "EMB_TAG = 16\n", "LSTM_UNITS = 256" ] }, { "cell_type": "code", "execution_count": null, "id": "162a155a", "metadata": {}, "outputs": [], "source": [ "input_token = Input(shape=(MAX_LEN,), name=\"token_input\")\n", "input_ner = Input(shape=(MAX_LEN,), name=\"ner_input\")\n", "input_srl = Input(shape=(MAX_LEN,), name=\"srl_input\")\n", "\n", "# ======================\n", "# 4. Embedding\n", "# ======================\n", "embed_token = Embedding(input_dim=VOCAB_TOKEN, output_dim=EMB_TOKEN)(input_token)\n", "embed_ner = Embedding(input_dim=VOCAB_NER, output_dim=EMB_TAG)(input_ner)\n", "embed_srl = Embedding(input_dim=VOCAB_SRL, output_dim=EMB_TAG)(input_srl)\n", "\n", "# Gabung semua embedding\n", "merged = Concatenate()([embed_token, embed_ner, embed_srl])\n", "\n", "# ======================\n", "# 5. LSTM\n", "# ======================\n", "lstm_out = LSTM(LSTM_UNITS, return_sequences=True)(merged)\n", "\n", "# Output: Question\n", "question_out = TimeDistributed(Dense(VOCAB_QA, activation='softmax'), name=\"question_output\")(lstm_out)\n", "\n", "# Output: Answer\n", "answer_out = TimeDistributed(Dense(VOCAB_QA, activation='softmax'), name=\"answer_output\")(lstm_out)\n", "\n", "# Output: Type (klasifikasi)\n", "type_repr = LSTM(LSTM_UNITS)(merged) # pakai output dari awal sebelum LSTM pertama\n", "type_out = Dense(NUM_TYPES, activation='softmax', name=\"type_output\")(type_repr)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "7cccf561", "metadata": {}, "outputs": [], "source": [ "model = Model(inputs=[input_token, input_ner, input_srl],\n", " outputs=[question_out, answer_out, type_out])\n", "\n", "model.compile(\n", " optimizer='adam',\n", " loss={\n", " \"question_output\": \"sparse_categorical_crossentropy\",\n", " \"answer_output\": \"sparse_categorical_crossentropy\",\n", " \"type_output\": \"sparse_categorical_crossentropy\",\n", " },\n", " metrics={\n", " \"question_output\": \"accuracy\",\n", " \"answer_output\": \"accuracy\",\n", " \"type_output\": \"accuracy\",\n", " }\n", ")\n", "\n", "# ======================\n", "# 7. Training\n", "# ======================\n", "y_question = np.expand_dims(y_question, -1) # untuk sparse categorical loss\n", "y_answer = np.expand_dims(y_answer, -1)\n", "\n", "earlystop = EarlyStopping(patience=4, restore_best_weights=True)\n", "\n", "model.fit(\n", " [X_token, X_ner, X_srl],\n", " [y_question, y_answer, y_type],\n", " batch_size=32,\n", " epochs=30,\n", " validation_split=0.1,\n", " callbacks=[earlystop]\n", ")\n", "\n", "# ======================\n", "# 8. Simpan Model\n", "# ======================\n", "model.save(\"model_lstm_qg.h5\")\n", "print(\"✅ Training selesai. Model disimpan.\")" ] } ], "metadata": { "kernelspec": { "display_name": "myenv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" } }, "nbformat": 4, "nbformat_minor": 5 }