330 lines
14 KiB
Plaintext
330 lines
14 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "94d3889b",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"2025-05-10 14:49:40.993078: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
|
|
"2025-05-10 14:49:40.996369: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
|
|
"2025-05-10 14:49:41.002001: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
|
|
"2025-05-10 14:49:41.015917: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
|
|
"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
|
|
"E0000 00:00:1746863381.035097 166971 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
|
|
"E0000 00:00:1746863381.038978 166971 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
|
|
"W0000 00:00:1746863381.049265 166971 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
|
|
"W0000 00:00:1746863381.049288 166971 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
|
|
"W0000 00:00:1746863381.049289 166971 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
|
|
"W0000 00:00:1746863381.049290 166971 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
|
|
"2025-05-10 14:49:41.052642: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
|
|
"To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# -------------------------------------------------\n",
"# 0. Imports & configuration\n",
|
|
"# -------------------------------------------------\n",
|
|
"import json, pickle\n",
|
|
"import numpy as np\n",
|
|
"from pathlib import Path\n",
|
|
"from collections import Counter\n",
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"\n",
|
|
"import tensorflow as tf\n",
|
|
"from tensorflow.keras.preprocessing.text import Tokenizer\n",
|
|
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
|
|
"from tensorflow.keras.utils import to_categorical\n",
|
|
"from tensorflow.keras.layers import (\n",
|
|
" Input, Embedding, LSTM, Bidirectional, Dense, Concatenate,\n",
|
|
" TimeDistributed\n",
|
|
")\n",
|
|
"from tensorflow.keras.models import Model\n",
|
|
"from tensorflow.keras.callbacks import EarlyStopping\n",
|
|
"\n",
"# Special vocabulary symbols.\n",
"PAD_TOKEN = \"<PAD>\"\n",
"UNK_TOKEN = \"UNK\"          # used as the Tokenizers' oov_token\n",
"START_TOKEN = \"<START>\"    # prepended to decoder input sequences\n",
"END_TOKEN = \"<END>\"        # appended to decoder target sequences\n",
"\n",
"MAXLEN_SRC = 100  # maximum paragraph length\n",
"MAXLEN_TGT = 40   # maximum question/answer length\n",
"BATCH = 32\n",
"EPOCHS = 30\n",
"\n",
"# Seed Python's random module, NumPy and TensorFlow in one call so that\n",
"# weight initialisation and batch shuffling repeat on Restart & Run All.\n",
"SEED = 42\n",
"tf.keras.utils.set_random_seed(SEED)"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "b528b34e",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Valid 325 / 325 (invalid index: [])\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
"raw = json.loads(Path(\"normalize_dataset.json\").read_text(encoding=\"utf-8\"))\n",
"\n",
"# Keys every example must provide.\n",
"req = {\"tokens\", \"ner\", \"srl\", \"question\", \"answer\", \"type\"}\n",
"\n",
"\n",
"def is_valid_example(item):\n",
"    \"\"\"Return True when `item` has every required field in a usable form.\n",
"\n",
"    Checks, in order: `item` is a dict containing every key in `req`; every\n",
"    field except \"type\" is a list; \"type\" is a non-blank string; and the\n",
"    \"tokens\", \"ner\" and \"srl\" lists have the same length.\n",
"    \"\"\"\n",
"    if not isinstance(item, dict) or req - item.keys():\n",
"        return False\n",
"    if not all(isinstance(item[k], list) for k in req - {\"type\"}):\n",
"        return False\n",
"    # A blank type tokenizes to an empty sequence, which has no class index.\n",
"    if not isinstance(item[\"type\"], str) or not item[\"type\"].strip():\n",
"        return False\n",
"    # tokens/ner/srl are embedded and concatenated position by position, so a\n",
"    # length mismatch would silently pair tokens with the wrong tags.\n",
"    return len(item[\"tokens\"]) == len(item[\"ner\"]) == len(item[\"srl\"])\n",
"\n",
"\n",
"# `valid` keeps the usable examples, `bad` the positions of rejected ones.\n",
"valid, bad = [], []\n",
"for i, item in enumerate(raw):\n",
"    if is_valid_example(item):\n",
"        valid.append(item)\n",
"    else:\n",
"        bad.append(i)\n",
"\n",
"print(f\"Valid {len(valid)} / {len(raw)} (invalid index: {bad[:10]})\")"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "b18e4617",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Build the decoder sequences for both generation tasks: the input starts\n",
"# with START_TOKEN and the target ends with END_TOKEN.\n",
"for example in valid:\n",
"    for field in (\"question\", \"answer\"):\n",
"        words = example[field]\n",
"        example[f\"{field}_in\"] = [START_TOKEN, *words]\n",
"        example[f\"{field}_out\"] = [*words, END_TOKEN]"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "faa30b82",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"def fit_tokenizer(texts, **tokenizer_kwargs):\n",
"    \"\"\"Create a Tokenizer with filters=\"\" and fit it on `texts`.\"\"\"\n",
"    tokenizer = Tokenizer(filters=\"\", **tokenizer_kwargs)\n",
"    tokenizer.fit_on_texts(texts)\n",
"    return tokenizer\n",
"\n",
"\n",
"# Word vocabularies: Tokenizer's default lower-casing plus an OOV entry.\n",
"tok_token = fit_tokenizer([ex[\"tokens\"] for ex in valid], oov_token=UNK_TOKEN)\n",
"tok_q = fit_tokenizer(\n",
"    [ex[\"question_in\"] + ex[\"question_out\"] for ex in valid], oov_token=UNK_TOKEN\n",
")\n",
"tok_a = fit_tokenizer(\n",
"    [ex[\"answer_in\"] + ex[\"answer_out\"] for ex in valid], oov_token=UNK_TOKEN\n",
")\n",
"\n",
"# Tag and label vocabularies: case preserved, no OOV entry.\n",
"tok_ner = fit_tokenizer([ex[\"ner\"] for ex in valid], lower=False)\n",
"tok_srl = fit_tokenizer([ex[\"srl\"] for ex in valid], lower=False)\n",
"tok_type = fit_tokenizer([ex[\"type\"] for ex in valid], lower=False)\n",
"\n",
"# Tokenizer never assigns index 0, which is left for padding, hence the +1.\n",
"vocab_token = len(tok_token.word_index) + 1\n",
"vocab_ner = len(tok_ner.word_index) + 1\n",
"vocab_srl = len(tok_srl.word_index) + 1\n",
"vocab_q = len(tok_q.word_index) + 1\n",
"vocab_a = len(tok_a.word_index) + 1\n",
"# No +1 here: the type ids are shifted to start at 0 before one-hot encoding.\n",
"vocab_type = len(tok_type.word_index)"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "c83ce734",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"def seqs(field, tok, maxlen, truncating=\"pre\"):\n",
"    \"\"\"Encode `field` of every example in `valid` as a padded id matrix.\n",
"\n",
"    Args:\n",
"        field: key of the example dict to encode.\n",
"        tok: fitted Tokenizer used to map the items to ids.\n",
"        maxlen: number of columns in the result.\n",
"        truncating: which end to cut when a sequence is longer than `maxlen`;\n",
"            \"pre\" (the pad_sequences default) drops the beginning, \"post\"\n",
"            drops the end.\n",
"\n",
"    Returns:\n",
"        The array produced by pad_sequences, zero-padded at the end.\n",
"    \"\"\"\n",
"    return pad_sequences(\n",
"        tok.texts_to_sequences([ex[field] for ex in valid]),\n",
"        maxlen=maxlen, padding=\"post\", truncating=truncating\n",
"    )\n",
"\n",
"\n",
"# Encoder inputs.\n",
"X_tok = seqs(\"tokens\", tok_token, MAXLEN_SRC)\n",
"X_ner = seqs(\"ner\", tok_ner, MAXLEN_SRC)\n",
"X_srl = seqs(\"srl\", tok_srl, MAXLEN_SRC)\n",
"\n",
"# Decoder inputs/targets. Cut over-long sequences at the end so the decoder\n",
"# input always keeps its leading START_TOKEN; input and target are cut the\n",
"# same way, so they stay shifted by exactly one position.\n",
"Q_in = seqs(\"question_in\", tok_q, MAXLEN_TGT, truncating=\"post\")\n",
"Q_out = seqs(\"question_out\", tok_q, MAXLEN_TGT, truncating=\"post\")\n",
"A_in = seqs(\"answer_in\", tok_a, MAXLEN_TGT, truncating=\"post\")\n",
"A_out = seqs(\"answer_out\", tok_a, MAXLEN_TGT, truncating=\"post\")\n",
"\n",
"# One-hot question type. Tokenizer ids start at 1, so shift them to start at 0.\n",
"y_type = to_categorical(\n",
"    np.array([seq[0]-1 for seq in tok_type.texts_to_sequences([ex[\"type\"] for ex in valid])]),\n",
"    num_classes=vocab_type\n",
")\n",
"\n",
"# Add a trailing axis -> (batch, seq, 1) for sparse categorical cross-entropy.\n",
"Q_out = np.expand_dims(Q_out, -1)\n",
"A_out = np.expand_dims(A_out, -1)"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "ad3fe7f2",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Split every array with the same shuffled 80/20 partition. train_test_split\n",
"# returns a flat list [a_train, a_test, b_train, b_test, ...] in input order,\n",
"# so even positions hold the training parts and odd positions the test parts.\n",
"arrays = (X_tok, X_ner, X_srl, Q_in, Q_out, A_in, A_out, y_type)\n",
"split = train_test_split(*arrays, test_size=0.2, random_state=42)\n",
"\n",
"(X_tok_tr, X_ner_tr, X_srl_tr,\n",
" Q_in_tr, Q_out_tr,\n",
" A_in_tr, A_out_tr,\n",
" y_type_tr) = split[0::2]\n",
"\n",
"(X_tok_te, X_ner_te, X_srl_te,\n",
" Q_in_te, Q_out_te,\n",
" A_in_te, A_out_te,\n",
" y_type_te) = split[1::2]"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "f20abfb5",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"2025-05-10 14:49:43.127764: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)\n"
|
|
]
|
|
},
|
|
{
|
|
"ename": "ValueError",
|
|
"evalue": "too many values to unpack (expected 3)",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
|
|
"Cell \u001b[0;32mIn[7], line 10\u001b[0m\n\u001b[1;32m 7\u001b[0m emb_srl \u001b[38;5;241m=\u001b[39m Embedding(vocab_srl, \u001b[38;5;241m16\u001b[39m, mask_zero\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)(enc_srl)\n\u001b[1;32m 9\u001b[0m enc_cat \u001b[38;5;241m=\u001b[39m Concatenate()([emb_tok, emb_ner, emb_srl])\n\u001b[0;32m---> 10\u001b[0m enc_out, state_h, state_c \u001b[38;5;241m=\u001b[39m Bidirectional(\n\u001b[1;32m 11\u001b[0m LSTM(\u001b[38;5;241m256\u001b[39m, return_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, return_sequences\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 12\u001b[0m )(enc_cat)\n\u001b[1;32m 14\u001b[0m \u001b[38;5;66;03m# ---------- Klasifikasi tipe ----------\u001b[39;00m\n\u001b[1;32m 15\u001b[0m type_out \u001b[38;5;241m=\u001b[39m Dense(vocab_type, activation\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msoftmax\u001b[39m\u001b[38;5;124m\"\u001b[39m, name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtype_output\u001b[39m\u001b[38;5;124m\"\u001b[39m)(enc_out)\n",
|
|
"\u001b[0;31mValueError\u001b[0m: too many values to unpack (expected 3)"
|
|
]
|
|
}
|
|
],
|
|
"source": [
"ENC_UNITS = 256            # LSTM units per direction in the encoder\n",
"DEC_UNITS = 2 * ENC_UNITS  # decoders start from the concatenated fwd+bwd states\n",
"\n",
"# ---------- Encoder ----------\n",
"# One id sequence each for tokens, NER tags and SRL tags.\n",
"enc_tok = Input(shape=(None,), name=\"enc_tok\")\n",
"enc_ner = Input(shape=(None,), name=\"enc_ner\")\n",
"enc_srl = Input(shape=(None,), name=\"enc_srl\")\n",
"\n",
"emb_tok = Embedding(vocab_token, 128, mask_zero=True)(enc_tok)\n",
"emb_ner = Embedding(vocab_ner, 16, mask_zero=True)(enc_ner)\n",
"emb_srl = Embedding(vocab_srl, 16, mask_zero=True)(enc_srl)\n",
"\n",
"enc_cat = Concatenate()([emb_tok, emb_ner, emb_srl])\n",
"\n",
"# Bidirectional(LSTM(return_state=True)) returns five tensors: the merged\n",
"# output, then (h, c) of the forward LSTM, then (h, c) of the backward LSTM.\n",
"# Unpacking into three names raises \"too many values to unpack\".\n",
"enc_out, fwd_h, fwd_c, bwd_h, bwd_c = Bidirectional(\n",
"    LSTM(ENC_UNITS, return_state=True, return_sequences=False)\n",
")(enc_cat)\n",
"state_h = Concatenate(name=\"enc_state_h\")([fwd_h, bwd_h])\n",
"state_c = Concatenate(name=\"enc_state_c\")([fwd_c, bwd_c])\n",
"\n",
"# ---------- Type classification ----------\n",
"type_out = Dense(vocab_type, activation=\"softmax\", name=\"type_output\")(enc_out)\n",
"\n",
"# ---------- Decoder QUESTION ----------\n",
"# The decoder width must equal the width of the states it is initialised with.\n",
"dec_q_in = Input(shape=(None,), name=\"dec_q_in\")\n",
"dec_q_emb = Embedding(vocab_q, 128, mask_zero=True)(dec_q_in)\n",
"dec_q_lstm = LSTM(DEC_UNITS, return_sequences=True)\n",
"dec_q_out = dec_q_lstm(dec_q_emb, initial_state=[state_h, state_c])\n",
"q_out = TimeDistributed(Dense(vocab_q, activation=\"softmax\"), name=\"question_output\")(dec_q_out)\n",
"\n",
"# ---------- Decoder ANSWER ----------\n",
"dec_a_in = Input(shape=(None,), name=\"dec_a_in\")\n",
"dec_a_emb = Embedding(vocab_a, 128, mask_zero=True)(dec_a_in)\n",
"dec_a_lstm = LSTM(DEC_UNITS, return_sequences=True)\n",
"dec_a_out = dec_a_lstm(dec_a_emb, initial_state=[state_h, state_c])\n",
"a_out = TimeDistributed(Dense(vocab_a, activation=\"softmax\"), name=\"answer_output\")(dec_a_out)\n",
"\n",
"# ---------- Build & compile ----------\n",
"model = Model(\n",
"    inputs=[enc_tok, enc_ner, enc_srl, dec_q_in, dec_a_in],\n",
"    outputs=[q_out, a_out, type_out]\n",
")\n",
"\n",
"# Sequence outputs use integer targets (sparse loss); the type head uses\n",
"# one-hot targets and is down-weighted in the total loss.\n",
"model.compile(\n",
"    optimizer=\"adam\",\n",
"    loss={\n",
"        \"question_output\": \"sparse_categorical_crossentropy\",\n",
"        \"answer_output\"  : \"sparse_categorical_crossentropy\",\n",
"        \"type_output\"    : \"categorical_crossentropy\"\n",
"    },\n",
"    loss_weights={\n",
"        \"question_output\": 1.0,\n",
"        \"answer_output\"  : 1.0,\n",
"        \"type_output\"    : 0.3\n",
"    },\n",
"    metrics={\n",
"        \"question_output\": \"accuracy\",\n",
"        \"answer_output\"  : \"accuracy\",\n",
"        \"type_output\"    : \"accuracy\"\n",
"    }\n",
")\n",
"\n",
"model.summary()"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "c348406e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Stop after 3 epochs without improvement of the monitored quantity\n",
"# (EarlyStopping's default monitor) and restore the best weights seen.\n",
"early = EarlyStopping(patience=3, restore_best_weights=True)\n",
"\n",
"# Input order matches the `inputs` list given to Model; targets are keyed by\n",
"# output-layer name.\n",
"train_inputs = [X_tok_tr, X_ner_tr, X_srl_tr, Q_in_tr, A_in_tr]\n",
"train_targets = dict(\n",
"    question_output=Q_out_tr,\n",
"    answer_output=A_out_tr,\n",
"    type_output=y_type_tr,\n",
")\n",
"\n",
"model.fit(\n",
"    train_inputs,\n",
"    train_targets,\n",
"    batch_size=BATCH,\n",
"    epochs=EPOCHS,\n",
"    validation_split=0.1,\n",
"    callbacks=[early]\n",
")\n",
"\n",
"# -------------------------------------------------\n",
"# 8. Save model & tokenizers\n",
"# -------------------------------------------------\n",
"model.save(\"qg_multitask.keras\")\n",
"\n",
"# The fitted tokenizers are pickled next to the model.\n",
"tokenizers = {\n",
"    \"token\": tok_token,\n",
"    \"ner\"  : tok_ner,\n",
"    \"srl\"  : tok_srl,\n",
"    \"q\"    : tok_q,\n",
"    \"a\"    : tok_a,\n",
"    \"type\" : tok_type\n",
"}\n",
"with open(\"tokenizers.pkl\", \"wb\") as handle:\n",
"    pickle.dump(tokenizers, handle)"
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "myenv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.16"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|