feat: lstm format for generate quiz

This commit is contained in:
akhdanre 2025-02-05 02:45:10 +07:00
commit fb6ff8512b
9 changed files with 656 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
.venv

View File

@ -0,0 +1 @@
,akeon,fedora,05.02.2025 02:37,file:///home/akeon/.config/libreoffice/4;

447
lstm.ipynb Normal file
View File

@ -0,0 +1,447 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-02-05 01:57:25.675154: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)\n"
]
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">Model: \"functional\"</span>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1mModel: \"functional\"\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
"┃<span style=\"font-weight: bold\"> Layer (type) </span>┃<span style=\"font-weight: bold\"> Output Shape </span>┃<span style=\"font-weight: bold\"> Param # </span>┃<span style=\"font-weight: bold\"> Connected to </span>┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
"│ encoder_inputs │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ - │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">InputLayer</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ decoder_inputs │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ - │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">InputLayer</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ embedding │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">128</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">1,280</span> │ encoder_inputs[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>… │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ not_equal │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ encoder_inputs[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>… │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">NotEqual</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ decoder_embedding │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">128</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">1,024</span> │ decoder_inputs[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>… │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ encoder_lstm (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">LSTM</span>) │ [(<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>), │ <span style=\"color: #00af00; text-decoration-color: #00af00\">394,240</span> │ embedding[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>], │\n",
"│ │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>), │ │ not_equal[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
"│ │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>)] │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ decoder_lstm (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">LSTM</span>) │ [(<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, │ <span style=\"color: #00af00; text-decoration-color: #00af00\">394,240</span> │ decoder_embeddin… │\n",
"│ │ <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>), (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, │ │ encoder_lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">…</span> │\n",
"│ │ <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>), (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, │ │ encoder_lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">…</span> │\n",
"│ │ <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>)] │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ decoder_dense │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">8</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">2,056</span> │ decoder_lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">…</span> │\n",
"│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>) │ │ │ │\n",
"└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n",
"</pre>\n"
],
"text/plain": [
"┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
"┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mConnected to \u001b[0m\u001b[1m \u001b[0m┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
"│ encoder_inputs │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
"│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ decoder_inputs │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
"│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ embedding │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m1,280\u001b[0m │ encoder_inputs[\u001b[38;5;34m0\u001b[0m… │\n",
"│ (\u001b[38;5;33mEmbedding\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ not_equal │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ encoder_inputs[\u001b[38;5;34m0\u001b[0m… │\n",
"│ (\u001b[38;5;33mNotEqual\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ decoder_embedding │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m1,024\u001b[0m │ decoder_inputs[\u001b[38;5;34m0\u001b[0m… │\n",
"│ (\u001b[38;5;33mEmbedding\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ encoder_lstm (\u001b[38;5;33mLSTM\u001b[0m) │ [(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m), │ \u001b[38;5;34m394,240\u001b[0m │ embedding[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m], │\n",
"│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m), │ │ not_equal[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m)] │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ decoder_lstm (\u001b[38;5;33mLSTM\u001b[0m) │ [(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, │ \u001b[38;5;34m394,240\u001b[0m │ decoder_embeddin… │\n",
"│ │ \u001b[38;5;34m256\u001b[0m), (\u001b[38;5;45mNone\u001b[0m, │ │ encoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n",
"│ │ \u001b[38;5;34m256\u001b[0m), (\u001b[38;5;45mNone\u001b[0m, │ │ encoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n",
"│ │ \u001b[38;5;34m256\u001b[0m)] │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ decoder_dense │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m8\u001b[0m) │ \u001b[38;5;34m2,056\u001b[0m │ decoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n",
"│ (\u001b[38;5;33mDense\u001b[0m) │ │ │ │\n",
"└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Total params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">792,840</span> (3.02 MB)\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1m Total params: \u001b[0m\u001b[38;5;34m792,840\u001b[0m (3.02 MB)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">792,840</span> (3.02 MB)\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m792,840\u001b[0m (3.02 MB)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Non-trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> (0.00 B)\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"None\n",
"Epoch 1/10\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-02-05 01:57:27.530017: E tensorflow/core/util/util.cc:131] oneDNN supports DT_BOOL only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.\n",
"2025-02-05 01:57:27.593630: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence\n",
"\t [[{{node IteratorGetNext}}]]\n",
"/mnt/disc1/code/lstm-quiz/.venv/lib64/python3.10/site-packages/keras/src/trainers/epoch_iterator.py:151: UserWarning: Your input ran out of data; interrupting training. Make sure that your dataset or generator can generate at least `steps_per_epoch * epochs` batches. You may need to use the `.repeat()` function when building your dataset.\n",
" self._interrupted_warning()\n"
]
},
{
"ename": "ValueError",
"evalue": "math domain error",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[6], line 118\u001b[0m\n\u001b[1;32m 113\u001b[0m target_val \u001b[38;5;241m=\u001b[39m decoder_target_data[split_index:]\n\u001b[1;32m 115\u001b[0m \u001b[38;5;66;03m# ==========================\u001b[39;00m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;66;03m# 6) Fit the Model\u001b[39;00m\n\u001b[1;32m 117\u001b[0m \u001b[38;5;66;03m# ==========================\u001b[39;00m\n\u001b[0;32m--> 118\u001b[0m history \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 119\u001b[0m \u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43mencoder_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecoder_train\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 120\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_train\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 121\u001b[0m \u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m32\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 122\u001b[0m \u001b[43m \u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 123\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalidation_data\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43mencoder_val\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecoder_val\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtarget_val\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 124\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m 126\u001b[0m \u001b[38;5;66;03m# The accuracy reported is \"sparse_categorical_accuracy\" at the token level.\u001b[39;00m\n\u001b[1;32m 127\u001b[0m \n\u001b[1;32m 128\u001b[0m \u001b[38;5;66;03m# ==========================\u001b[39;00m\n\u001b[1;32m 129\u001b[0m \u001b[38;5;66;03m# 7) Evaluate the Model\u001b[39;00m\n\u001b[1;32m 
130\u001b[0m \u001b[38;5;66;03m# ==========================\u001b[39;00m\n\u001b[1;32m 131\u001b[0m \u001b[38;5;66;03m# If you want a quick evaluation on the validation set:\u001b[39;00m\n\u001b[1;32m 132\u001b[0m val_loss, val_accuracy \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mevaluate([encoder_val, decoder_val], target_val)\n",
"File \u001b[0;32m/mnt/disc1/code/lstm-quiz/.venv/lib64/python3.10/site-packages/keras/src/utils/traceback_utils.py:122\u001b[0m, in \u001b[0;36mfilter_traceback.<locals>.error_handler\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 119\u001b[0m filtered_tb \u001b[38;5;241m=\u001b[39m _process_traceback_frames(e\u001b[38;5;241m.\u001b[39m__traceback__)\n\u001b[1;32m 120\u001b[0m \u001b[38;5;66;03m# To get the full stack trace, call:\u001b[39;00m\n\u001b[1;32m 121\u001b[0m \u001b[38;5;66;03m# `keras.config.disable_traceback_filtering()`\u001b[39;00m\n\u001b[0;32m--> 122\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\u001b[38;5;241m.\u001b[39mwith_traceback(filtered_tb) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 123\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m filtered_tb\n",
"File \u001b[0;32m/mnt/disc1/code/lstm-quiz/.venv/lib64/python3.10/site-packages/keras/src/utils/progbar.py:119\u001b[0m, in \u001b[0;36mProgbar.update\u001b[0;34m(self, current, values, finalize)\u001b[0m\n\u001b[1;32m 116\u001b[0m message \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtarget \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 119\u001b[0m numdigits \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mint\u001b[39m(\u001b[43mmath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlog10\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget\u001b[49m\u001b[43m)\u001b[49m) \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 120\u001b[0m bar \u001b[38;5;241m=\u001b[39m (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(numdigits) \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124md/\u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m%\u001b[39m (current, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtarget)\n\u001b[1;32m 121\u001b[0m bar \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\x1b\u001b[39;00m\u001b[38;5;124m[1m\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mbar\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\x1b\u001b[39;00m\u001b[38;5;124m[0m \u001b[39m\u001b[38;5;124m\"\u001b[39m\n",
"\u001b[0;31mValueError\u001b[0m: math domain error"
]
}
],
"source": [
"# ==========================\n",
"# 1) Install/Import Dependencies\n",
"# ==========================\n",
"# If you are in a brand new environment, uncomment the following line:\n",
"# %pip install tensorflow pandas\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import tensorflow as tf\n",
"from tensorflow.keras.preprocessing.text import Tokenizer\n",
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
"from tensorflow.keras.layers import Input, LSTM, Embedding, Dense\n",
"from tensorflow.keras.models import Model\n",
"\n",
"# ==========================\n",
"# 2) Load Dataset (CSV)\n",
"# ==========================\n",
"# Adjust the file path to your CSV file\n",
"df = pd.read_csv(\"quiz_questions.csv\")\n",
"\n",
"# Extract the paragraphs and questions\n",
"paragraphs = df['paragraph'].astype(str).tolist()\n",
"questions = df['question'].astype(str).tolist()\n",
"\n",
"# (Optional) For demonstration, let's ignore question_type, answer, distractors in this example\n",
"# but you can incorporate them as extra signals if you wish.\n",
"\n",
"# ==========================\n",
"# 3) Tokenize Text\n",
"# ==========================\n",
"# Create two tokenizers: one for paragraphs, one for questions\n",
"num_words = 10000 # Maximum vocabulary size\n",
"\n",
"tokenizer_paragraph = Tokenizer(num_words=num_words, oov_token=\"<OOV>\")\n",
"tokenizer_paragraph.fit_on_texts(paragraphs)\n",
"paragraph_sequences = tokenizer_paragraph.texts_to_sequences(paragraphs)\n",
"\n",
"tokenizer_question = Tokenizer(num_words=num_words, oov_token=\"<OOV>\")\n",
"tokenizer_question.fit_on_texts(questions)\n",
"question_sequences = tokenizer_question.texts_to_sequences(questions)\n",
"\n",
"# Get max lengths (for padding)\n",
"max_paragraph_len = max(len(seq) for seq in paragraph_sequences)\n",
"max_question_len = max(len(seq) for seq in question_sequences)\n",
"\n",
"# Pad sequences\n",
"encoder_input_data = pad_sequences(paragraph_sequences, maxlen=max_paragraph_len, padding='post')\n",
"# For decoder data, we usually do teacher forcing:\n",
"# We'll keep one version as input, one version shifted as the target\n",
"decoder_input_data_full = pad_sequences(question_sequences, maxlen=max_question_len, padding='post')\n",
"\n",
"# We create decoder_target_data by shifting to the left by 1 token\n",
"decoder_target_data = np.copy(decoder_input_data_full[:, 1:])\n",
"decoder_input_data = np.copy(decoder_input_data_full[:, :-1])\n",
"\n",
"# Expand target dimension for sparse_categorical_crossentropy\n",
"decoder_target_data = np.expand_dims(decoder_target_data, -1)\n",
"\n",
"# Calculate vocab sizes\n",
"vocab_size_paragraph = min(len(tokenizer_paragraph.word_index) + 1, num_words)\n",
"vocab_size_question = min(len(tokenizer_question.word_index) + 1, num_words)\n",
"\n",
"# ==========================\n",
"# 4) Build Seq2Seq Model\n",
"# ==========================\n",
"embedding_dim = 128\n",
"latent_dim = 256 # LSTM hidden dimension\n",
"\n",
"# ----- Encoder -----\n",
"encoder_inputs = Input(shape=(None,), name=\"encoder_inputs\")\n",
"encoder_embedding = Embedding(input_dim=vocab_size_paragraph,\n",
" output_dim=embedding_dim,\n",
" mask_zero=True)(encoder_inputs)\n",
"\n",
"encoder_lstm = LSTM(latent_dim, return_state=True, name=\"encoder_lstm\")\n",
"_, state_h, state_c = encoder_lstm(encoder_embedding)\n",
"\n",
"encoder_states = [state_h, state_c]\n",
"\n",
"# ----- Decoder -----\n",
"decoder_inputs = Input(shape=(None,), name=\"decoder_inputs\")\n",
"decoder_embedding = Embedding(input_dim=vocab_size_question,\n",
" output_dim=embedding_dim,\n",
" mask_zero=True,\n",
" name=\"decoder_embedding\")(decoder_inputs)\n",
"\n",
"decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name=\"decoder_lstm\")\n",
"decoder_outputs, _, _ = decoder_lstm(decoder_embedding,\n",
" initial_state=encoder_states)\n",
"decoder_dense = Dense(vocab_size_question, activation='softmax', name=\"decoder_dense\")\n",
"decoder_outputs = decoder_dense(decoder_outputs)\n",
"\n",
"# Combine into a training model\n",
"model = Model([encoder_inputs, decoder_inputs], decoder_outputs)\n",
"model.compile(optimizer='adam',\n",
" loss='sparse_categorical_crossentropy',\n",
" metrics=['sparse_categorical_accuracy'])\n",
"\n",
"print(model.summary())\n",
"\n",
"# ==========================\n",
"# 5) Train/Test Split (Optional)\n",
"# ==========================\n",
"# For simplicity, let's do a quick train/validation split\n",
"# Adjust split size or do a separate test set for production usage.\n",
"split_index = int(0.8 * len(encoder_input_data))\n",
"encoder_train = encoder_input_data[:split_index]\n",
"decoder_train = decoder_input_data[:split_index]\n",
"target_train = decoder_target_data[:split_index]\n",
"\n",
"encoder_val = encoder_input_data[split_index:]\n",
"decoder_val = decoder_input_data[split_index:]\n",
"target_val = decoder_target_data[split_index:]\n",
"\n",
"# ==========================\n",
"# 6) Fit the Model\n",
"# ==========================\n",
"history = model.fit(\n",
" [encoder_train, decoder_train],\n",
" target_train,\n",
" batch_size=32,\n",
" epochs=10,\n",
" validation_data=([encoder_val, decoder_val], target_val)\n",
")\n",
"\n",
"# The accuracy reported is \"sparse_categorical_accuracy\" at the token level.\n",
"\n",
"# ==========================\n",
"# 7) Evaluate the Model\n",
"# ==========================\n",
"# If you want a quick evaluation on the validation set:\n",
"val_loss, val_accuracy = model.evaluate([encoder_val, decoder_val], target_val)\n",
"print(f\"Validation Loss: {val_loss:.4f}\")\n",
"print(f\"Validation Accuracy (token-level): {val_accuracy:.4f}\")\n",
"\n",
"# ==========================\n",
"# 8) Build Inference Models\n",
"# ==========================\n",
"# Encoder model for inference\n",
"encoder_model_inf = Model(encoder_inputs, encoder_states)\n",
"\n",
"# Decoder model for inference\n",
"# The two extra inputs carry the LSTM hidden/cell state from one decoding step\n",
"# to the next.\n",
"decoder_state_input_h = Input(shape=(latent_dim,), name=\"inference_state_h\")\n",
"decoder_state_input_c = Input(shape=(latent_dim,), name=\"inference_state_c\")\n",
"decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]\n",
"\n",
"# `decoder_embedding` is already the embedded `decoder_inputs` tensor (the\n",
"# Embedding layer was applied to `decoder_inputs` when it was created), so it\n",
"# is reused as-is; calling it like a layer would raise a TypeError.\n",
"dec_emb_inf = decoder_embedding\n",
"decoder_inf_outputs, state_h_inf, state_c_inf = decoder_lstm(\n",
"    dec_emb_inf, initial_state=decoder_states_inputs\n",
")\n",
"decoder_inf_states = [state_h_inf, state_c_inf]\n",
"decoder_inf_outputs = decoder_dense(decoder_inf_outputs)\n",
"\n",
"# Inputs: token ids + previous states. Outputs: token probabilities + new states.\n",
"decoder_model_inf = Model(\n",
"    [decoder_inputs] + decoder_states_inputs,\n",
"    [decoder_inf_outputs] + decoder_inf_states\n",
")\n",
"\n",
"# Create index-to-word mapping for the question tokenizer\n",
"index_to_word_question = {idx: word for word, idx in tokenizer_question.word_index.items()}\n",
"# If you used an OOV token, might want to handle that as well.\n",
"\n",
"def generate_question(paragraph_text, max_length=50, start_token=None, end_token=None):\n",
"    \"\"\"\n",
"    Generate a question from a paragraph using the trained seq2seq model.\n",
"    Token-level decoding with greedy search.\n",
"\n",
"    Args:\n",
"        paragraph_text: Raw paragraph string to encode.\n",
"        max_length: Maximum number of tokens to generate.\n",
"        start_token: Optional word from the question vocabulary used to seed\n",
"            the decoder; when omitted the decoder starts from index 0 (padding).\n",
"        end_token: Optional word that stops decoding when it is predicted.\n",
"\n",
"    Returns:\n",
"        The generated words joined by single spaces.\n",
"\n",
"    Raises:\n",
"        KeyError: If start_token is given but is not in the question vocabulary.\n",
"    \"\"\"\n",
"    # 1) Encode the paragraph into the initial decoder states\n",
"    seq = tokenizer_paragraph.texts_to_sequences([paragraph_text])\n",
"    seq = pad_sequences(seq, maxlen=max_paragraph_len, padding='post')\n",
"    states_value = encoder_model_inf.predict(seq, verbose=0)\n",
"\n",
"    # 2) Seed the decoder with the start token when one is supplied\n",
"    target_seq = np.zeros((1, 1), dtype='int32')\n",
"    if start_token is not None:\n",
"        target_seq[0, 0] = tokenizer_question.word_index[start_token]\n",
"\n",
"    decoded_words = []\n",
"\n",
"    for _ in range(max_length):\n",
"        output_tokens, h, c = decoder_model_inf.predict([target_seq, *states_value], verbose=0)\n",
"\n",
"        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))\n",
"\n",
"        # Index 0 is the padding value and maps to no word: treat it as end of sequence\n",
"        if sampled_token_index == 0:\n",
"            break\n",
"\n",
"        sampled_word = index_to_word_question.get(sampled_token_index, '<UNK>')\n",
"\n",
"        # Stop if we encounter the caller-supplied end token\n",
"        if end_token and (sampled_word == end_token):\n",
"            break\n",
"\n",
"        decoded_words.append(sampled_word)\n",
"\n",
"        # Feed the sampled token and the updated states into the next step\n",
"        target_seq = np.zeros((1, 1), dtype='int32')\n",
"        target_seq[0, 0] = sampled_token_index\n",
"\n",
"        states_value = [h, c]\n",
"\n",
"    return ' '.join(decoded_words)\n",
"\n",
"# ==========================\n",
"# 9) Test Inference on a Paragraph\n",
"# ==========================\n",
"test_paragraph = \"Albert Einstein was a theoretical physicist born in Germany...\"\n",
"generated = generate_question(test_paragraph)\n",
"print(\"Generated question:\", generated)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"yups 0\n",
"yups 1\n",
"yups 2\n",
"yups 3\n",
"yups 4\n",
"yups 5\n",
"yups 6\n",
"yups 7\n",
"yups 8\n",
"yups 9\n",
"yups 10\n",
"yups 11\n",
"yups 12\n",
"yups 13\n",
"yups 14\n",
"yups 15\n",
"yups 16\n",
"yups 17\n",
"yups 18\n",
"yups 19\n"
]
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

Binary file not shown.

123
main.py Normal file
View File

@ -0,0 +1,123 @@
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
LSTM,
Embedding,
Dense,
SpatialDropout1D,
TimeDistributed,
)
from tensorflow.keras.optimizers import Adam
# -----------------------
# 1. Load dataset
# -----------------------
# Read the quiz CSV and discard every row whose 'paragraph' or 'question'
# cell is missing.
df = pd.read_csv("quiz_questions.csv").dropna(subset=["paragraph", "question"])
# -----------------------
# 2. Preprocessing text
# -----------------------
def preprocess_text(text: object) -> str:
    """Return *text* normalised for tokenization.

    The only normalisation applied is lower-casing.

    Args:
        text: Raw text. Non-string values are converted with ``str()`` first
            instead of raising ``AttributeError``.

    Returns:
        The lower-cased string.
    """
    # Simple example preprocessing: lower-case only.
    return str(text).lower()
# Normalise both text columns; astype(str) guarantees preprocess_text
# always receives a string.
for _column in ("paragraph", "question"):
    df[_column] = df[_column].astype(str).apply(preprocess_text)
# -----------------------
# 3. Tokenization
# -----------------------
# Fit a single tokenizer on paragraphs and questions together so that the
# vocabulary covers the words of both.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([*df["paragraph"], *df["question"]])
vocab_size = 1 + len(tokenizer.word_index)  # +1 because word indices start at 1
# Convert the texts into integer sequences
X_sequences = tokenizer.texts_to_sequences(df["paragraph"])
y_sequences = tokenizer.texts_to_sequences(df["question"])
# Longest sequence on each side; the larger one becomes the common padded length
max_len_paragraph = max(map(len, X_sequences))
max_len_question = max(map(len, y_sequences))
max_length = max(max_len_paragraph, max_len_question)
# Post-pad inputs and targets so that every sequence has length max_length
X_padded, y_padded = (
    pad_sequences(sequences, maxlen=max_length, padding="post")
    for sequences in (X_sequences, y_sequences)
)
# Persist the fitted tokenizer to disk
with open("tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)
print("Tokenizer disimpan ke tokenizer.pkl")
# -----------------------
# 4. Prepare X, y
# -----------------------
# For sequence-to-sequence training with "sparse_categorical_crossentropy",
# y should ideally have shape (num_samples, max_length, 1).
X = np.array(X_padded)
y = np.array(y_padded)[..., np.newaxis]
print("Shape X:", X.shape)
print("Shape y:", y.shape)  # (num_samples, max_length, 1)
# -----------------------
# 5. Split data
# -----------------------
# Hold out 20% of the samples; the fixed random_state keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Train size:", X_train.shape, y_train.shape)
print("Test size: ", X_test.shape, y_test.shape)
# -----------------------
# 6. Build Model LSTM
# -----------------------
# Two stacked LSTMs, each with return_sequences=True, so the final output
# is still a sequence: (batch_size, max_length, hidden_dim).
model = Sequential(
    [
        Embedding(input_dim=vocab_size, output_dim=128),
        SpatialDropout1D(0.2),
        LSTM(128, return_sequences=True),
        LSTM(128, return_sequences=True),
        # TimeDistributed applies the Dense softmax at every timestep
        TimeDistributed(Dense(vocab_size, activation="softmax")),
    ]
)
# -----------------------
# 7. Compile
# -----------------------
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()
# -----------------------
# 8. Train Model
# -----------------------
epochs = 10
# The held-out test split is passed as the validation set during training.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=epochs,
    validation_data=(X_test, y_test),
)
# -----------------------
# 9. Save Model
# -----------------------
model.save("lstm_question_generator.keras")
print("Training selesai dan model telah disimpan.")

11
quiz_questions.csv Normal file
View File

@ -0,0 +1,11 @@
paragraph,question_type,question,answer,options
Albert Einstein adalah ilmuwan yang menemukan teori relativitas.,fill_in_the_blank,Siapakah ilmuwan yang menemukan teori relativitas?,Albert Einstein,-
Isaac Newton menemukan hukum gravitasi setelah melihat apel jatuh dari pohon.,fill_in_the_blank,Siapakah ilmuwan yang menemukan hukum gravitasi?,Isaac Newton,-
Bumi mengorbit mengelilingi Matahari dalam waktu sekitar 365 hari.,multiple_choice,Berapa lama Bumi mengorbit Matahari?,365 hari,360 hari|365 hari|366 hari|370 hari
Proklamasi Kemerdekaan Indonesia terjadi pada tanggal 17 Agustus 1945.,fill_in_the_blank,Kapan Proklamasi Kemerdekaan Indonesia terjadi?,17 Agustus 1945,-
Hewan yang dapat hidup di dua alam disebut amfibi.,true_false,Hewan yang dapat hidup di dua alam disebut reptil.,False,True|False
Gunung tertinggi di dunia adalah Gunung Everest.,fill_in_the_blank,Gunung apakah yang tertinggi di dunia?,Gunung Everest,-
Lapisan terluar dari atmosfer Bumi disebut eksosfer.,multiple_choice,Apa nama lapisan terluar atmosfer Bumi?,Eksosfer,Troposfer|Stratosfer|Mesosfer|Eksosfer
Ibu Kota Jepang adalah Tokyo.,true_false,Ibu Kota Jepang adalah Kyoto.,False,True|False
Fotosintesis adalah proses yang dilakukan tumbuhan untuk membuat makanan sendiri.,fill_in_the_blank,Apa nama proses yang dilakukan tumbuhan untuk membuat makanannya sendiri?,Fotosintesis,-
Dataran tertinggi di dunia adalah Dataran Tinggi Tibet.,fill_in_the_blank,Apa nama dataran tertinggi di dunia?,Dataran Tinggi Tibet,-
1 paragraph question_type question answer options
2 Albert Einstein adalah ilmuwan yang menemukan teori relativitas. fill_in_the_blank Siapakah ilmuwan yang menemukan teori relativitas? Albert Einstein -
3 Isaac Newton menemukan hukum gravitasi setelah melihat apel jatuh dari pohon. fill_in_the_blank Siapakah ilmuwan yang menemukan hukum gravitasi? Isaac Newton -
4 Bumi mengorbit mengelilingi Matahari dalam waktu sekitar 365 hari. multiple_choice Berapa lama Bumi mengorbit Matahari? 365 hari 360 hari|365 hari|366 hari|370 hari
5 Proklamasi Kemerdekaan Indonesia terjadi pada tanggal 17 Agustus 1945. fill_in_the_blank Kapan Proklamasi Kemerdekaan Indonesia terjadi? 17 Agustus 1945 -
6 Hewan yang dapat hidup di dua alam disebut amfibi. true_false Hewan yang dapat hidup di dua alam disebut reptil. False True|False
7 Gunung tertinggi di dunia adalah Gunung Everest. fill_in_the_blank Gunung apakah yang tertinggi di dunia? Gunung Everest -
8 Lapisan terluar dari atmosfer Bumi disebut eksosfer. multiple_choice Apa nama lapisan terluar atmosfer Bumi? Eksosfer Troposfer|Stratosfer|Mesosfer|Eksosfer
9 Ibu Kota Jepang adalah Tokyo. true_false Ibu Kota Jepang adalah Kyoto. False True|False
10 Fotosintesis adalah proses yang dilakukan tumbuhan untuk membuat makanan sendiri. fill_in_the_blank Apa nama proses yang dilakukan tumbuhan untuk membuat makanannya sendiri? Fotosintesis -
11 Dataran tertinggi di dunia adalah Dataran Tinggi Tibet. fill_in_the_blank Apa nama dataran tertinggi di dunia? Dataran Tinggi Tibet -

5
requirements.txt Normal file
View File

@ -0,0 +1,5 @@
numpy
pandas
matplotlib
scikit-learn
tensorflow

68
test.py Normal file
View File

@ -0,0 +1,68 @@
import numpy as np
import re
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
# The functions below expect a fitted tokenizer, a trained model, and the
# max_length value used for padding:
# tokenizer, model, max_length = ...
# Make sure the model and tokenizer you load match your environment.
def preprocess_text(text):
    """Return *text* normalized for tokenization.

    Only lowercasing is applied. Keep this the same as (or close to) the
    preprocessing used at training time; add further normalization here if
    it is needed.
    """
    return text.lower()
def generate_question(paragraph, tokenizer, model, max_length):
    """Generate a question string for *paragraph* by per-timestep argmax.

    Args:
        paragraph: Source text to generate a question from.
        tokenizer: Object providing ``texts_to_sequences`` and an
            ``index_word`` mapping from integer index to word.
        model: Object whose ``predict`` returns per-timestep scores over the
            vocabulary for the padded input.
        max_length: Length passed to ``pad_sequences``; should match the
            value used during training.

    Returns:
        The predicted words joined by single spaces, with "?" appended when
        the text does not already end with one.
    """
    # 1) Preprocess and 2) tokenize (result is a list of lists).
    cleaned = preprocess_text(paragraph)
    token_ids = tokenizer.texts_to_sequences([cleaned])

    # 3) Pad at the end ("post").
    padded = pad_sequences(token_ids, maxlen=max_length, padding="post")

    # 4) Predict, then 5) take the best index at every timestep of the
    # single sample in the batch.
    # NOTE(review): presumably scores is (1, max_length, vocab_size) — confirm.
    scores = model.predict(padded)
    best_ids = np.argmax(scores, axis=-1)[0]

    # 6) Map indices back to words.
    words = []
    for idx in best_ids:
        # Index 0 is treated as padding; the rest of the sequence is assumed
        # to be padding as well, so decoding stops here.
        if idx == 0:
            break
        # Look up with a plain int and skip indices that have no vocabulary
        # entry, so they do not leave empty tokens (doubled or trailing
        # spaces) in the joined text.
        word = tokenizer.index_word.get(int(idx), "")
        if word:
            words.append(word)

    # 7) Join into one sentence and make sure it ends with a question mark.
    question = " ".join(words)
    if not question.endswith("?"):
        question += "?"
    return question
# Load the trained model from disk.
model = load_model("lstm_question_generator.keras")

# Load the pickled tokenizer.
# NOTE: unpickling can execute arbitrary code; only load a tokenizer.pkl
# that comes from a trusted source.
with open("tokenizer.pkl", "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# max_length must be the same as the value used during training.
max_length = 50  # or whatever value you configured

paragraph_input = "Albert Einstein mengembangkan teori relativitas dan membantu mengembangkan fisika kuantum."
generated_q = generate_question(
    paragraph=paragraph_input,
    tokenizer=tokenizer,
    model=model,
    max_length=max_length,
)
print("Generated question:", generated_q)

BIN
tokenizer.pkl Normal file

Binary file not shown.