TIF_E41211115_lstm-quiz-gen.../old/QC/model_tr.ipynb

330 lines
14 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "94d3889b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-05-10 14:49:40.993078: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
"2025-05-10 14:49:40.996369: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
"2025-05-10 14:49:41.002001: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
"2025-05-10 14:49:41.015917: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
"E0000 00:00:1746863381.035097 166971 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"E0000 00:00:1746863381.038978 166971 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"W0000 00:00:1746863381.049265 166971 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1746863381.049288 166971 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1746863381.049289 166971 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1746863381.049290 166971 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"2025-05-10 14:49:41.052642: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
]
}
],
"source": [
"# -------------------------------------------------\n",
"# 0. Import & Konfigurasi\n",
"# -------------------------------------------------\n",
"import json, pickle\n",
"import numpy as np\n",
"from pathlib import Path\n",
"from collections import Counter\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"import tensorflow as tf\n",
"from tensorflow.keras.preprocessing.text import Tokenizer\n",
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
"from tensorflow.keras.utils import to_categorical\n",
"from tensorflow.keras.layers import (\n",
" Input, Embedding, LSTM, Bidirectional, Dense, Concatenate,\n",
" TimeDistributed\n",
")\n",
"from tensorflow.keras.models import Model\n",
"from tensorflow.keras.callbacks import EarlyStopping\n",
"\n",
"PAD_TOKEN = \"<PAD>\"\n",
"UNK_TOKEN = \"UNK\"\n",
"START_TOKEN = \"<START>\"\n",
"END_TOKEN = \"<END>\"\n",
"MAXLEN_SRC = 100 # Panjang paragraf maksimal\n",
"MAXLEN_TGT = 40 # Panjang pertanyaan/jawaban maksimal\n",
"BATCH = 32\n",
"EPOCHS = 30"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b528b34e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Valid 325 / 325 (invalid index: [])\n"
]
}
],
"source": [
"raw = json.loads(Path(\"normalize_dataset.json\").read_text(encoding=\"utf-8\"))\n",
"\n",
"req = {\"tokens\",\"ner\",\"srl\",\"question\",\"answer\",\"type\"}\n",
"valid, bad = [], []\n",
"for i,item in enumerate(raw):\n",
" if (isinstance(item,dict) and not (req-item.keys())\n",
" and all(isinstance(item[k],list) for k in req-{\"type\"})\n",
" and isinstance(item[\"type\"],str)):\n",
" valid.append(item)\n",
" else:\n",
" bad.append(i)\n",
"\n",
"print(f\"Valid {len(valid)} / {len(raw)} (invalid index: {bad[:10]})\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b18e4617",
"metadata": {},
"outputs": [],
"source": [
"for ex in valid:\n",
" ex[\"question_in\"] = [START_TOKEN] + ex[\"question\"]\n",
" ex[\"question_out\"] = ex[\"question\"] + [END_TOKEN]\n",
"\n",
" ex[\"answer_in\"] = [START_TOKEN] + ex[\"answer\"]\n",
" ex[\"answer_out\"] = ex[\"answer\"] + [END_TOKEN]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "faa30b82",
"metadata": {},
"outputs": [],
"source": [
"tok_token = Tokenizer(oov_token=UNK_TOKEN, filters=\"\")\n",
"tok_ner = Tokenizer(lower=False, filters=\"\")\n",
"tok_srl = Tokenizer(lower=False, filters=\"\")\n",
"tok_q = Tokenizer(oov_token=UNK_TOKEN, filters=\"\")\n",
"tok_a = Tokenizer(oov_token=UNK_TOKEN, filters=\"\")\n",
"tok_type = Tokenizer(lower=False, filters=\"\")\n",
"\n",
"tok_token.fit_on_texts([ex[\"tokens\"] for ex in valid])\n",
"tok_ner.fit_on_texts([ex[\"ner\"] for ex in valid])\n",
"tok_srl.fit_on_texts([ex[\"srl\"] for ex in valid])\n",
"tok_q.fit_on_texts([ex[\"question_in\"]+ex[\"question_out\"] for ex in valid])\n",
"tok_a.fit_on_texts([ex[\"answer_in\"]+ex[\"answer_out\"] for ex in valid])\n",
"tok_type.fit_on_texts([ex[\"type\"] for ex in valid])\n",
"\n",
"# +1 utk padding\n",
"vocab_token = len(tok_token.word_index)+1\n",
"vocab_ner = len(tok_ner.word_index)+1\n",
"vocab_srl = len(tok_srl.word_index)+1\n",
"vocab_q = len(tok_q.word_index)+1\n",
"vocab_a = len(tok_a.word_index)+1\n",
"vocab_type = len(tok_type.word_index)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "c83ce734",
"metadata": {},
"outputs": [],
"source": [
"def seqs(field, tok, maxlen):\n",
" return pad_sequences(\n",
" tok.texts_to_sequences([ex[field] for ex in valid]),\n",
" maxlen=maxlen, padding=\"post\"\n",
" )\n",
"\n",
"X_tok = seqs(\"tokens\", tok_token, MAXLEN_SRC)\n",
"X_ner = seqs(\"ner\", tok_ner, MAXLEN_SRC)\n",
"X_srl = seqs(\"srl\", tok_srl, MAXLEN_SRC)\n",
"\n",
"Q_in = seqs(\"question_in\", tok_q, MAXLEN_TGT)\n",
"Q_out = seqs(\"question_out\", tok_q, MAXLEN_TGT)\n",
"A_in = seqs(\"answer_in\", tok_a, MAXLEN_TGT)\n",
"A_out = seqs(\"answer_out\", tok_a, MAXLEN_TGT)\n",
"\n",
"y_type = to_categorical(\n",
" np.array([seq[0]-1 for seq in tok_type.texts_to_sequences([ex[\"type\"] for ex in valid])]),\n",
" num_classes=vocab_type\n",
")\n",
"\n",
"# Expand dims → (batch, seq, 1) agar cocok dgn sparse_cce\n",
"Q_out = np.expand_dims(Q_out, -1)\n",
"A_out = np.expand_dims(A_out, -1)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "ad3fe7f2",
"metadata": {},
"outputs": [],
"source": [
"(X_tok_tr, X_tok_te,\n",
" X_ner_tr, X_ner_te,\n",
" X_srl_tr, X_srl_te,\n",
" Q_in_tr, Q_in_te,\n",
" Q_out_tr, Q_out_te,\n",
" A_in_tr, A_in_te,\n",
" A_out_tr, A_out_te,\n",
" y_type_tr,y_type_te) = train_test_split(\n",
" X_tok, X_ner, X_srl, Q_in, Q_out, A_in, A_out, y_type,\n",
" test_size=0.2, random_state=42\n",
" )\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "f20abfb5",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-05-10 14:49:43.127764: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)\n"
]
},
{
"ename": "ValueError",
"evalue": "too many values to unpack (expected 3)",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[7], line 10\u001b[0m\n\u001b[1;32m 7\u001b[0m emb_srl \u001b[38;5;241m=\u001b[39m Embedding(vocab_srl, \u001b[38;5;241m16\u001b[39m, mask_zero\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)(enc_srl)\n\u001b[1;32m 9\u001b[0m enc_cat \u001b[38;5;241m=\u001b[39m Concatenate()([emb_tok, emb_ner, emb_srl])\n\u001b[0;32m---> 10\u001b[0m enc_out, state_h, state_c \u001b[38;5;241m=\u001b[39m Bidirectional(\n\u001b[1;32m 11\u001b[0m LSTM(\u001b[38;5;241m256\u001b[39m, return_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, return_sequences\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 12\u001b[0m )(enc_cat)\n\u001b[1;32m 14\u001b[0m \u001b[38;5;66;03m# ---------- Klasifikasi tipe ----------\u001b[39;00m\n\u001b[1;32m 15\u001b[0m type_out \u001b[38;5;241m=\u001b[39m Dense(vocab_type, activation\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msoftmax\u001b[39m\u001b[38;5;124m\"\u001b[39m, name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtype_output\u001b[39m\u001b[38;5;124m\"\u001b[39m)(enc_out)\n",
"\u001b[0;31mValueError\u001b[0m: too many values to unpack (expected 3)"
]
}
],
"source": [
"enc_tok = Input(shape=(None,), name=\"enc_tok\")\n",
"enc_ner = Input(shape=(None,), name=\"enc_ner\")\n",
"enc_srl = Input(shape=(None,), name=\"enc_srl\")\n",
"\n",
"emb_tok = Embedding(vocab_token, 128, mask_zero=True)(enc_tok)\n",
"emb_ner = Embedding(vocab_ner, 16, mask_zero=True)(enc_ner)\n",
"emb_srl = Embedding(vocab_srl, 16, mask_zero=True)(enc_srl)\n",
"\n",
"enc_cat = Concatenate()([emb_tok, emb_ner, emb_srl])\n",
"enc_out, state_h, state_c = Bidirectional(\n",
" LSTM(256, return_state=True, return_sequences=False)\n",
")(enc_cat)\n",
"\n",
"# ---------- Klasifikasi tipe ----------\n",
"type_out = Dense(vocab_type, activation=\"softmax\", name=\"type_output\")(enc_out)\n",
"\n",
"# ---------- Decoder QUESTION ----------\n",
"dec_q_in = Input(shape=(None,), name=\"dec_q_in\")\n",
"dec_q_emb = Embedding(vocab_q, 128, mask_zero=True)(dec_q_in)\n",
"dec_q_lstm = LSTM(256, return_sequences=True)\n",
"dec_q_out = dec_q_lstm(dec_q_emb, initial_state=[state_h, state_c])\n",
"q_out = TimeDistributed(Dense(vocab_q, activation=\"softmax\"), name=\"question_output\")(dec_q_out)\n",
"\n",
"# ---------- Decoder ANSWER ----------\n",
"dec_a_in = Input(shape=(None,), name=\"dec_a_in\")\n",
"dec_a_emb = Embedding(vocab_a, 128, mask_zero=True)(dec_a_in)\n",
"dec_a_lstm = LSTM(256, return_sequences=True)\n",
"dec_a_out = dec_a_lstm(dec_a_emb, initial_state=[state_h, state_c])\n",
"a_out = TimeDistributed(Dense(vocab_a, activation=\"softmax\"), name=\"answer_output\")(dec_a_out)\n",
"\n",
"# ---------- Build & compile ----------\n",
"model = Model(\n",
" inputs=[enc_tok, enc_ner, enc_srl, dec_q_in, dec_a_in],\n",
" outputs=[q_out, a_out, type_out]\n",
")\n",
"\n",
"model.compile(\n",
" optimizer=\"adam\",\n",
" loss={\n",
" \"question_output\": \"sparse_categorical_crossentropy\",\n",
" \"answer_output\" : \"sparse_categorical_crossentropy\",\n",
" \"type_output\" : \"categorical_crossentropy\"\n",
" },\n",
" loss_weights={\n",
" \"question_output\": 1.0,\n",
" \"answer_output\" : 1.0,\n",
" \"type_output\" : 0.3\n",
" },\n",
" metrics={\n",
" \"question_output\": \"accuracy\",\n",
" \"answer_output\" : \"accuracy\",\n",
" \"type_output\" : \"accuracy\"\n",
" }\n",
")\n",
"\n",
"model.summary()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c348406e",
"metadata": {},
"outputs": [],
"source": [
"early = EarlyStopping(patience=3, restore_best_weights=True)\n",
"\n",
"model.fit(\n",
" [X_tok_tr, X_ner_tr, X_srl_tr, Q_in_tr, A_in_tr],\n",
" {\"question_output\": Q_out_tr,\n",
" \"answer_output\" : A_out_tr,\n",
" \"type_output\" : y_type_tr},\n",
" batch_size=BATCH,\n",
" epochs=EPOCHS,\n",
" validation_split=0.1,\n",
" callbacks=[early]\n",
")\n",
"\n",
"# -------------------------------------------------\n",
"# 8. Simpan model & tokenizer\n",
"# -------------------------------------------------\n",
"model.save(\"qg_multitask.keras\")\n",
"with open(\"tokenizers.pkl\", \"wb\") as f:\n",
" pickle.dump({\n",
" \"token\": tok_token,\n",
" \"ner\" : tok_ner,\n",
" \"srl\" : tok_srl,\n",
" \"q\" : tok_q,\n",
" \"a\" : tok_a,\n",
" \"type\" : tok_type\n",
" }, f)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "myenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}