TIF_E41211115_lstm-quiz-gen.../old/QC/qg_v2_train.ipynb

270 lines
11 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "0a2880d7",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-04-23 14:22:17.809700: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
"2025-04-23 14:22:17.810231: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
"2025-04-23 14:22:17.812492: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
"2025-04-23 14:22:17.818482: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
"E0000 00:00:1745392937.829027 39341 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"E0000 00:00:1745392937.832239 39341 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"W0000 00:00:1745392937.840149 39341 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1745392937.840163 39341 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1745392937.840164 39341 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1745392937.840165 39341 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"2025-04-23 14:22:17.843058: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
]
}
],
"source": [
"import json\n",
"import numpy as np\n",
"from pathlib import Path\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from tensorflow.keras.preprocessing.text import Tokenizer\n",
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
"from tensorflow.keras.models import Model\n",
"from tensorflow.keras.layers import Input, Embedding, LSTM, Concatenate, TimeDistributed, Dense\n",
"from tensorflow.keras.callbacks import EarlyStopping"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "bd82907a",
"metadata": {},
"outputs": [],
"source": [
"with open(\"dataset_qc.json\", encoding=\"utf-8\") as f:\n",
" raw_data = json.load(f)\n",
"\n",
"tokens = [[t.lower().strip() for t in item[\"tokens\"]] for item in raw_data]\n",
"ner_tags = [item[\"ner\"] for item in raw_data]\n",
"srl_tags = [item[\"srl\"] for item in raw_data]\n",
"questions = [item[\"question\"].lower().strip() for item in raw_data]\n",
"answers = [item[\"answer\"].lower().strip() for item in raw_data]\n",
"types = [item[\"type\"] for item in raw_data]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "946713ee",
"metadata": {},
"outputs": [],
"source": [
"token_tokenizer = Tokenizer(lower=False, oov_token=\"<OOV>\")\n",
"token_tokenizer.fit_on_texts(tokens)\n",
"token_sequences = token_tokenizer.texts_to_sequences(tokens)\n",
"\n",
"ner_encoder = LabelEncoder()\n",
"srl_encoder = LabelEncoder()\n",
"\n",
"flat_ner = [tag for seq in ner_tags for tag in seq]\n",
"flat_srl = [tag for seq in srl_tags for tag in seq]\n",
"\n",
"ner_encoder.fit(flat_ner)\n",
"srl_encoder.fit(flat_srl)\n",
"\n",
"ner_sequences = [ner_encoder.transform(seq).tolist() for seq in ner_tags]\n",
"srl_sequences = [srl_encoder.transform(seq).tolist() for seq in srl_tags]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "aff6e7aa",
"metadata": {},
"outputs": [],
"source": [
"MAX_LEN = max(len(seq) for seq in token_sequences)\n",
"\n",
"token_padded = pad_sequences(token_sequences, maxlen=MAX_LEN, padding='post')\n",
"ner_padded = pad_sequences(ner_sequences, maxlen=MAX_LEN, padding='post')\n",
"srl_padded = pad_sequences(srl_sequences, maxlen=MAX_LEN, padding='post')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ea2ab113",
"metadata": {},
"outputs": [],
"source": [
"qa_tokenizer = Tokenizer(oov_token=\"<OOV>\")\n",
"qa_tokenizer.fit_on_texts(questions + answers)\n",
"\n",
"question_sequences = qa_tokenizer.texts_to_sequences(questions)\n",
"answer_sequences = qa_tokenizer.texts_to_sequences(answers)\n",
"\n",
"question_padded = pad_sequences(question_sequences, maxlen=MAX_LEN, padding='post')\n",
"answer_padded = pad_sequences(answer_sequences, maxlen=MAX_LEN, padding='post')\n",
"\n",
"\n",
"type_encoder = LabelEncoder()\n",
"type_labels = type_encoder.fit_transform(types) # bentuk 1D array\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e2becb56",
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "'Tokenizer' object has no attribute 'shape'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[6], line 10\u001b[0m\n\u001b[1;32m 5\u001b[0m y_answer \u001b[38;5;241m=\u001b[39m answer_padded\n\u001b[1;32m 6\u001b[0m y_type \u001b[38;5;241m=\u001b[39m type_labels\n\u001b[0;32m---> 10\u001b[0m MAX_LEN \u001b[38;5;241m=\u001b[39m \u001b[43mX_token\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshape\u001b[49m[\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# ======================\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# 2. Parameter\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;66;03m# ======================\u001b[39;00m\n\u001b[1;32m 15\u001b[0m VOCAB_TOKEN \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mmax(X_token) \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m\n",
"\u001b[0;31mAttributeError\u001b[0m: 'Tokenizer' object has no attribute 'shape'"
]
}
],
"source": [
"X_token = token_tokenizer\n",
"X_ner = ner_encoder\n",
"X_srl = srl_encoder\n",
"y_question = qa_tokenizer\n",
"y_answer = answer_padded\n",
"y_type = type_labels\n",
"\n",
"\n",
"\n",
"MAX_LEN = X_token.shape[1]\n",
"\n",
"# ======================\n",
"# 2. Parameter\n",
"# ======================\n",
"VOCAB_TOKEN = np.max(X_token) + 1\n",
"VOCAB_NER = np.max(X_ner) + 1\n",
"VOCAB_SRL = np.max(X_srl) + 1\n",
"VOCAB_QA = max(np.max(y_question), np.max(y_answer)) + 1\n",
"NUM_TYPES = len(np.unique(y_type))\n",
"\n",
"EMB_TOKEN = 128\n",
"EMB_TAG = 16\n",
"LSTM_UNITS = 256"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "162a155a",
"metadata": {},
"outputs": [],
"source": [
"input_token = Input(shape=(MAX_LEN,), name=\"token_input\")\n",
"input_ner = Input(shape=(MAX_LEN,), name=\"ner_input\")\n",
"input_srl = Input(shape=(MAX_LEN,), name=\"srl_input\")\n",
"\n",
"# ======================\n",
"# 4. Embedding\n",
"# ======================\n",
"embed_token = Embedding(input_dim=VOCAB_TOKEN, output_dim=EMB_TOKEN)(input_token)\n",
"embed_ner = Embedding(input_dim=VOCAB_NER, output_dim=EMB_TAG)(input_ner)\n",
"embed_srl = Embedding(input_dim=VOCAB_SRL, output_dim=EMB_TAG)(input_srl)\n",
"\n",
"# Gabung semua embedding\n",
"merged = Concatenate()([embed_token, embed_ner, embed_srl])\n",
"\n",
"# ======================\n",
"# 5. LSTM\n",
"# ======================\n",
"lstm_out = LSTM(LSTM_UNITS, return_sequences=True)(merged)\n",
"\n",
"# Output: Question\n",
"question_out = TimeDistributed(Dense(VOCAB_QA, activation='softmax'), name=\"question_output\")(lstm_out)\n",
"\n",
"# Output: Answer\n",
"answer_out = TimeDistributed(Dense(VOCAB_QA, activation='softmax'), name=\"answer_output\")(lstm_out)\n",
"\n",
"# Output: Type (klasifikasi)\n",
"type_repr = LSTM(LSTM_UNITS)(merged) # pakai output dari awal sebelum LSTM pertama\n",
"type_out = Dense(NUM_TYPES, activation='softmax', name=\"type_output\")(type_repr)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7cccf561",
"metadata": {},
"outputs": [],
"source": [
"model = Model(inputs=[input_token, input_ner, input_srl],\n",
" outputs=[question_out, answer_out, type_out])\n",
"\n",
"model.compile(\n",
" optimizer='adam',\n",
" loss={\n",
" \"question_output\": \"sparse_categorical_crossentropy\",\n",
" \"answer_output\": \"sparse_categorical_crossentropy\",\n",
" \"type_output\": \"sparse_categorical_crossentropy\",\n",
" },\n",
" metrics={\n",
" \"question_output\": \"accuracy\",\n",
" \"answer_output\": \"accuracy\",\n",
" \"type_output\": \"accuracy\",\n",
" }\n",
")\n",
"\n",
"# ======================\n",
"# 7. Training\n",
"# ======================\n",
"y_question = np.expand_dims(y_question, -1) # untuk sparse categorical loss\n",
"y_answer = np.expand_dims(y_answer, -1)\n",
"\n",
"earlystop = EarlyStopping(patience=4, restore_best_weights=True)\n",
"\n",
"model.fit(\n",
" [X_token, X_ner, X_srl],\n",
" [y_question, y_answer, y_type],\n",
" batch_size=32,\n",
" epochs=30,\n",
" validation_split=0.1,\n",
" callbacks=[earlystop]\n",
")\n",
"\n",
"# ======================\n",
"# 8. Simpan Model\n",
"# ======================\n",
"model.save(\"model_lstm_qg.h5\")\n",
"print(\"✅ Training selesai. Model disimpan.\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "myenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}