270 lines
11 KiB
Plaintext
270 lines
11 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "0a2880d7",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"2025-04-23 14:22:17.809700: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
|
|
"2025-04-23 14:22:17.810231: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
|
|
"2025-04-23 14:22:17.812492: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
|
|
"2025-04-23 14:22:17.818482: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
|
|
"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
|
|
"E0000 00:00:1745392937.829027 39341 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
|
|
"E0000 00:00:1745392937.832239 39341 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
|
|
"W0000 00:00:1745392937.840149 39341 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
|
|
"W0000 00:00:1745392937.840163 39341 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
|
|
"W0000 00:00:1745392937.840164 39341 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
|
|
"W0000 00:00:1745392937.840165 39341 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
|
|
"2025-04-23 14:22:17.843058: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
|
|
"To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import json\n",
|
|
"import numpy as np\n",
|
|
"from pathlib import Path\n",
|
|
"from sklearn.preprocessing import LabelEncoder\n",
|
|
"from tensorflow.keras.preprocessing.text import Tokenizer\n",
|
|
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
|
|
"from tensorflow.keras.models import Model\n",
|
|
"from tensorflow.keras.layers import Input, Embedding, LSTM, Concatenate, TimeDistributed, Dense\n",
|
|
"from tensorflow.keras.callbacks import EarlyStopping"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "bd82907a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def _normalise(text):\n",
"    \"\"\"Return `text` lower-cased with surrounding whitespace removed.\"\"\"\n",
"    return text.lower().strip()\n",
"\n",
"\n",
"# Parse the JSON dataset; read_text opens the file as UTF-8 and closes it again.\n",
"raw_data = json.loads(Path(\"dataset_qc.json\").read_text(encoding=\"utf-8\"))\n",
"\n",
"# One parallel list per field, kept in dataset order.\n",
"tokens = [[_normalise(word) for word in record[\"tokens\"]] for record in raw_data]\n",
"ner_tags = [record[\"ner\"] for record in raw_data]\n",
"srl_tags = [record[\"srl\"] for record in raw_data]\n",
"questions = [_normalise(record[\"question\"]) for record in raw_data]\n",
"answers = [_normalise(record[\"answer\"]) for record in raw_data]\n",
"types = [record[\"type\"] for record in raw_data]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "946713ee",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Word-level vocabulary over the pre-split tokens; lower=False because the tokens were already lower-cased on load.\n",
"token_tokenizer = Tokenizer(oov_token=\"<OOV>\", lower=False)\n",
"token_tokenizer.fit_on_texts(tokens)\n",
"token_sequences = token_tokenizer.texts_to_sequences(tokens)\n",
"\n",
"# NER tags: fit one encoder on every tag occurrence, then encode sentence by sentence.\n",
"flat_ner = [label for sentence in ner_tags for label in sentence]\n",
"ner_encoder = LabelEncoder().fit(flat_ner)\n",
"ner_sequences = [ner_encoder.transform(sentence).tolist() for sentence in ner_tags]\n",
"\n",
"# NOTE(review): LabelEncoder ids start at 0, which is also the value pad_sequences pads with by default,\n",
"# so the first tag class and padding share an id once the sequences are padded - confirm this is acceptable.\n",
"\n",
"# SRL tags: same procedure with an independent encoder.\n",
"flat_srl = [label for sentence in srl_tags for label in sentence]\n",
"srl_encoder = LabelEncoder().fit(flat_srl)\n",
"srl_sequences = [srl_encoder.transform(sentence).tolist() for sentence in srl_tags]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "aff6e7aa",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Every feature is padded to the length of the longest token sequence.\n",
"MAX_LEN = max(map(len, token_sequences))\n",
"\n",
"# padding='post' appends the fill value after the real ids; all three features share one length.\n",
"token_padded, ner_padded, srl_padded = (\n",
"    pad_sequences(sequences, maxlen=MAX_LEN, padding='post')\n",
"    for sequences in (token_sequences, ner_sequences, srl_sequences)\n",
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "ea2ab113",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# One shared vocabulary for questions and answers.\n",
"qa_tokenizer = Tokenizer(oov_token=\"<OOV>\")\n",
"qa_tokenizer.fit_on_texts([*questions, *answers])\n",
"\n",
"# NOTE(review): targets are padded to MAX_LEN (the longest *token* sequence). A question or answer with\n",
"# more than MAX_LEN words is cut by pad_sequences (default truncating='pre' drops leading words) - confirm.\n",
"\n",
"# Questions: text -> integer ids -> fixed-length rows.\n",
"question_sequences = qa_tokenizer.texts_to_sequences(questions)\n",
"question_padded = pad_sequences(question_sequences, padding='post', maxlen=MAX_LEN)\n",
"\n",
"# Answers: same pipeline.\n",
"answer_sequences = qa_tokenizer.texts_to_sequences(answers)\n",
"answer_padded = pad_sequences(answer_sequences, padding='post', maxlen=MAX_LEN)\n",
"\n",
"# Question type as one integer class id per example (1-D array).\n",
"type_encoder = LabelEncoder().fit(types)\n",
"type_labels = type_encoder.transform(types)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "e2becb56",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"ename": "AttributeError",
|
|
"evalue": "'Tokenizer' object has no attribute 'shape'",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
|
|
"Cell \u001b[0;32mIn[6], line 10\u001b[0m\n\u001b[1;32m 5\u001b[0m y_answer \u001b[38;5;241m=\u001b[39m answer_padded\n\u001b[1;32m 6\u001b[0m y_type \u001b[38;5;241m=\u001b[39m type_labels\n\u001b[0;32m---> 10\u001b[0m MAX_LEN \u001b[38;5;241m=\u001b[39m \u001b[43mX_token\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshape\u001b[49m[\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# ======================\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# 2. Parameter\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;66;03m# ======================\u001b[39;00m\n\u001b[1;32m 15\u001b[0m VOCAB_TOKEN \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mmax(X_token) \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m\n",
|
|
"\u001b[0;31mAttributeError\u001b[0m: 'Tokenizer' object has no attribute 'shape'"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# ======================\n",
"# 1. Features and targets\n",
"# ======================\n",
"# Bind the padded integer arrays, not the fitted Tokenizer / LabelEncoder objects:\n",
"# those objects have no `.shape`, cannot be reduced with np.max and cannot be fed to Keras.\n",
"X_token = token_padded\n",
"X_ner = ner_padded\n",
"X_srl = srl_padded\n",
"y_question = question_padded\n",
"y_answer = answer_padded\n",
"y_type = type_labels\n",
"\n",
"# Sequence length shared by every padded array (second axis of the token matrix).\n",
"MAX_LEN = X_token.shape[1]\n",
"\n",
"# ======================\n",
"# 2. Parameters\n",
"# ======================\n",
"# Vocabulary size = highest id present + 1, because id 0 is a valid index too.\n",
"# Cast to built-in int so the layer configs hold plain Python numbers rather than NumPy scalars.\n",
"VOCAB_TOKEN = int(np.max(X_token)) + 1\n",
"VOCAB_NER = int(np.max(X_ner)) + 1\n",
"VOCAB_SRL = int(np.max(X_srl)) + 1\n",
"VOCAB_QA = int(max(np.max(y_question), np.max(y_answer))) + 1\n",
"NUM_TYPES = len(np.unique(y_type))\n",
"\n",
"EMB_TOKEN = 128  # embedding width for word tokens\n",
"EMB_TAG = 16  # embedding width for NER / SRL tag ids\n",
"LSTM_UNITS = 256"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "162a155a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# ======================\n",
"# 3. Inputs\n",
"# ======================\n",
"# One integer-id sequence of length MAX_LEN per feature.\n",
"input_token = Input(name=\"token_input\", shape=(MAX_LEN,))\n",
"input_ner = Input(name=\"ner_input\", shape=(MAX_LEN,))\n",
"input_srl = Input(name=\"srl_input\", shape=(MAX_LEN,))\n",
"\n",
"# ======================\n",
"# 4. Embedding\n",
"# ======================\n",
"# Each feature gets its own embedding table; both tag features use the narrower EMB_TAG width.\n",
"token_embedding = Embedding(input_dim=VOCAB_TOKEN, output_dim=EMB_TOKEN)\n",
"embed_token = token_embedding(input_token)\n",
"ner_embedding = Embedding(input_dim=VOCAB_NER, output_dim=EMB_TAG)\n",
"embed_ner = ner_embedding(input_ner)\n",
"srl_embedding = Embedding(input_dim=VOCAB_SRL, output_dim=EMB_TAG)\n",
"embed_srl = srl_embedding(input_srl)\n",
"\n",
"# Merge all embeddings along the feature axis.\n",
"merged = Concatenate(axis=-1)([embed_token, embed_ner, embed_srl])\n",
"\n",
"# ======================\n",
"# 5. LSTM\n",
"# ======================\n",
"# return_sequences=True keeps one hidden state per time step for the per-token heads below.\n",
"sequence_lstm = LSTM(LSTM_UNITS, return_sequences=True)\n",
"lstm_out = sequence_lstm(merged)\n",
"\n",
"# Output: question - a softmax over the QA vocabulary at every time step.\n",
"question_head = Dense(VOCAB_QA, activation='softmax')\n",
"question_out = TimeDistributed(question_head, name=\"question_output\")(lstm_out)\n",
"\n",
"# Output: answer - same shape as the question head, with separate weights.\n",
"answer_head = Dense(VOCAB_QA, activation='softmax')\n",
"answer_out = TimeDistributed(answer_head, name=\"answer_output\")(lstm_out)\n",
"\n",
"# Output: type (classification). A second LSTM reads the merged embeddings directly,\n",
"# i.e. the representation from before the first LSTM, and emits only its final state.\n",
"type_lstm = LSTM(LSTM_UNITS)\n",
"type_repr = type_lstm(merged)\n",
"type_out = Dense(NUM_TYPES, activation='softmax', name=\"type_output\")(type_repr)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "7cccf561",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# ======================\n",
"# 6. Model\n",
"# ======================\n",
"model = Model(\n",
"    inputs=[input_token, input_ner, input_srl],\n",
"    outputs=[question_out, answer_out, type_out],\n",
")\n",
"\n",
"# Every head predicts integer class ids, so all three share the sparse loss and plain accuracy.\n",
"OUTPUT_NAMES = (\"question_output\", \"answer_output\", \"type_output\")\n",
"model.compile(\n",
"    optimizer='adam',\n",
"    loss={name: \"sparse_categorical_crossentropy\" for name in OUTPUT_NAMES},\n",
"    metrics={name: \"accuracy\" for name in OUTPUT_NAMES},\n",
")\n",
"\n",
"# ======================\n",
"# 7. Training\n",
"# ======================\n",
"# Add a trailing axis for the sparse categorical loss. The reshaped targets get their own names so that\n",
"# re-running this cell does not stack yet another axis onto y_question / y_answer.\n",
"y_question_sparse = np.expand_dims(y_question, -1)\n",
"y_answer_sparse = np.expand_dims(y_answer, -1)\n",
"\n",
"# Stop after 4 epochs without improvement of the monitored quantity (Keras default: val_loss)\n",
"# and restore the weights of the best epoch.\n",
"earlystop = EarlyStopping(patience=4, restore_best_weights=True)\n",
"\n",
"model.fit(\n",
"    [X_token, X_ner, X_srl],\n",
"    [y_question_sparse, y_answer_sparse, y_type],\n",
"    batch_size=32,\n",
"    epochs=30,\n",
"    validation_split=0.1,\n",
"    callbacks=[earlystop],\n",
")\n",
"\n",
"# ======================\n",
"# 8. Save the model\n",
"# ======================\n",
"model.save(\"model_lstm_qg.h5\")\n",
"print(\"✅ Training selesai. Model disimpan.\")"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "myenv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.16"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|