feat: lstm format for generate quiz

2025-02-05 02:45:10 +07:00 · 2025-02-05 02:45:10 +07:00 · fb6ff8512b
commit fb6ff8512b
9 changed files with 656 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
+.venv
--- a/.~lock.quiz_questions.csv#
+++ b/.~lock.quiz_questions.csv#
@ -0,0 +1 @@
+,akeon,fedora,05.02.2025 02:37,file:///home/akeon/.config/libreoffice/4;
--- a/lstm.ipynb
+++ b/lstm.ipynb
@ -0,0 +1,447 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2025-02-05 01:57:25.675154: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">Model: \"functional\"</span>\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[1mModel: \"functional\"\u001b[0m\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
+       "┃<span style=\"font-weight: bold\"> Layer (type)        </span>┃<span style=\"font-weight: bold\"> Output Shape      </span>┃<span style=\"font-weight: bold\">    Param # </span>┃<span style=\"font-weight: bold\"> Connected to      </span>┃\n",
+       "┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
+       "│ encoder_inputs      │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>)      │          <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ -                 │\n",
+       "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">InputLayer</span>)        │                   │            │                   │\n",
+       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
+       "│ decoder_inputs      │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>)      │          <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ -                 │\n",
+       "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">InputLayer</span>)        │                   │            │                   │\n",
+       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
+       "│ embedding           │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">128</span>) │      <span style=\"color: #00af00; text-decoration-color: #00af00\">1,280</span> │ encoder_inputs[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>… │\n",
+       "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>)         │                   │            │                   │\n",
+       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
+       "│ not_equal           │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>)      │          <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ encoder_inputs[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>… │\n",
+       "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">NotEqual</span>)          │                   │            │                   │\n",
+       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
+       "│ decoder_embedding   │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">128</span>) │      <span style=\"color: #00af00; text-decoration-color: #00af00\">1,024</span> │ decoder_inputs[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>… │\n",
+       "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>)         │                   │            │                   │\n",
+       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
+       "│ encoder_lstm (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">LSTM</span>) │ [(<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>),     │    <span style=\"color: #00af00; text-decoration-color: #00af00\">394,240</span> │ embedding[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>],  │\n",
+       "│                     │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>),      │            │ not_equal[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>]   │\n",
+       "│                     │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>)]      │            │                   │\n",
+       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
+       "│ decoder_lstm (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">LSTM</span>) │ [(<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>,     │    <span style=\"color: #00af00; text-decoration-color: #00af00\">394,240</span> │ decoder_embeddin… │\n",
+       "│                     │ <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>), (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>,      │            │ encoder_lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">…</span> │\n",
+       "│                     │ <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>), (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>,      │            │ encoder_lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">…</span> │\n",
+       "│                     │ <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>)]             │            │                   │\n",
+       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
+       "│ decoder_dense       │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">8</span>)   │      <span style=\"color: #00af00; text-decoration-color: #00af00\">2,056</span> │ decoder_lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">…</span> │\n",
+       "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>)             │                   │            │                   │\n",
+       "└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
+       "┃\u001b[1m \u001b[0m\u001b[1mLayer (type)       \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape     \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m   Param #\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mConnected to     \u001b[0m\u001b[1m \u001b[0m┃\n",
+       "┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
+       "│ encoder_inputs      │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m)      │          \u001b[38;5;34m0\u001b[0m │ -                 │\n",
+       "│ (\u001b[38;5;33mInputLayer\u001b[0m)        │                   │            │                   │\n",
+       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
+       "│ decoder_inputs      │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m)      │          \u001b[38;5;34m0\u001b[0m │ -                 │\n",
+       "│ (\u001b[38;5;33mInputLayer\u001b[0m)        │                   │            │                   │\n",
+       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
+       "│ embedding           │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │      \u001b[38;5;34m1,280\u001b[0m │ encoder_inputs[\u001b[38;5;34m0\u001b[0m… │\n",
+       "│ (\u001b[38;5;33mEmbedding\u001b[0m)         │                   │            │                   │\n",
+       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
+       "│ not_equal           │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m)      │          \u001b[38;5;34m0\u001b[0m │ encoder_inputs[\u001b[38;5;34m0\u001b[0m… │\n",
+       "│ (\u001b[38;5;33mNotEqual\u001b[0m)          │                   │            │                   │\n",
+       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
+       "│ decoder_embedding   │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │      \u001b[38;5;34m1,024\u001b[0m │ decoder_inputs[\u001b[38;5;34m0\u001b[0m… │\n",
+       "│ (\u001b[38;5;33mEmbedding\u001b[0m)         │                   │            │                   │\n",
+       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
+       "│ encoder_lstm (\u001b[38;5;33mLSTM\u001b[0m) │ [(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m),     │    \u001b[38;5;34m394,240\u001b[0m │ embedding[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m],  │\n",
+       "│                     │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m),      │            │ not_equal[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m]   │\n",
+       "│                     │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m)]      │            │                   │\n",
+       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
+       "│ decoder_lstm (\u001b[38;5;33mLSTM\u001b[0m) │ [(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m,     │    \u001b[38;5;34m394,240\u001b[0m │ decoder_embeddin… │\n",
+       "│                     │ \u001b[38;5;34m256\u001b[0m), (\u001b[38;5;45mNone\u001b[0m,      │            │ encoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n",
+       "│                     │ \u001b[38;5;34m256\u001b[0m), (\u001b[38;5;45mNone\u001b[0m,      │            │ encoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n",
+       "│                     │ \u001b[38;5;34m256\u001b[0m)]             │            │                   │\n",
+       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
+       "│ decoder_dense       │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m8\u001b[0m)   │      \u001b[38;5;34m2,056\u001b[0m │ decoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n",
+       "│ (\u001b[38;5;33mDense\u001b[0m)             │                   │            │                   │\n",
+       "└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Total params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">792,840</span> (3.02 MB)\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m792,840\u001b[0m (3.02 MB)\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">792,840</span> (3.02 MB)\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m792,840\u001b[0m (3.02 MB)\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Non-trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> (0.00 B)\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "None\n",
+      "Epoch 1/10\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2025-02-05 01:57:27.530017: E tensorflow/core/util/util.cc:131] oneDNN supports DT_BOOL only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.\n",
+      "2025-02-05 01:57:27.593630: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence\n",
+      "\t [[{{node IteratorGetNext}}]]\n",
+      "/mnt/disc1/code/lstm-quiz/.venv/lib64/python3.10/site-packages/keras/src/trainers/epoch_iterator.py:151: UserWarning: Your input ran out of data; interrupting training. Make sure that your dataset or generator can generate at least `steps_per_epoch * epochs` batches. You may need to use the `.repeat()` function when building your dataset.\n",
+      "  self._interrupted_warning()\n"
+     ]
+    },
+    {
+     "ename": "ValueError",
+     "evalue": "math domain error",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[6], line 118\u001b[0m\n\u001b[1;32m    113\u001b[0m target_val  \u001b[38;5;241m=\u001b[39m decoder_target_data[split_index:]\n\u001b[1;32m    115\u001b[0m \u001b[38;5;66;03m# ==========================\u001b[39;00m\n\u001b[1;32m    116\u001b[0m \u001b[38;5;66;03m# 6) Fit the Model\u001b[39;00m\n\u001b[1;32m    117\u001b[0m \u001b[38;5;66;03m# ==========================\u001b[39;00m\n\u001b[0;32m--> 118\u001b[0m history \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    119\u001b[0m \u001b[43m    \u001b[49m\u001b[43m[\u001b[49m\u001b[43mencoder_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecoder_train\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    120\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtarget_train\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    121\u001b[0m \u001b[43m    \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m32\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m    122\u001b[0m \u001b[43m    \u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m    123\u001b[0m \u001b[43m    \u001b[49m\u001b[43mvalidation_data\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43mencoder_val\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecoder_val\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtarget_val\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    124\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m    126\u001b[0m \u001b[38;5;66;03m# The accuracy reported is \"sparse_categorical_accuracy\" at the token level.\u001b[39;00m\n\u001b[1;32m    127\u001b[0m \n\u001b[1;32m    128\u001b[0m \u001b[38;5;66;03m# ==========================\u001b[39;00m\n\u001b[1;32m    129\u001b[0m 
\u001b[38;5;66;03m# 7) Evaluate the Model\u001b[39;00m\n\u001b[1;32m    130\u001b[0m \u001b[38;5;66;03m# ==========================\u001b[39;00m\n\u001b[1;32m    131\u001b[0m \u001b[38;5;66;03m# If you want a quick evaluation on the validation set:\u001b[39;00m\n\u001b[1;32m    132\u001b[0m val_loss, val_accuracy \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mevaluate([encoder_val, decoder_val], target_val)\n",
+      "File \u001b[0;32m/mnt/disc1/code/lstm-quiz/.venv/lib64/python3.10/site-packages/keras/src/utils/traceback_utils.py:122\u001b[0m, in \u001b[0;36mfilter_traceback.<locals>.error_handler\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    119\u001b[0m     filtered_tb \u001b[38;5;241m=\u001b[39m _process_traceback_frames(e\u001b[38;5;241m.\u001b[39m__traceback__)\n\u001b[1;32m    120\u001b[0m     \u001b[38;5;66;03m# To get the full stack trace, call:\u001b[39;00m\n\u001b[1;32m    121\u001b[0m     \u001b[38;5;66;03m# `keras.config.disable_traceback_filtering()`\u001b[39;00m\n\u001b[0;32m--> 122\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m e\u001b[38;5;241m.\u001b[39mwith_traceback(filtered_tb) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m    123\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m    124\u001b[0m     \u001b[38;5;28;01mdel\u001b[39;00m filtered_tb\n",
+      "File \u001b[0;32m/mnt/disc1/code/lstm-quiz/.venv/lib64/python3.10/site-packages/keras/src/utils/progbar.py:119\u001b[0m, in \u001b[0;36mProgbar.update\u001b[0;34m(self, current, values, finalize)\u001b[0m\n\u001b[1;32m    116\u001b[0m     message \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    118\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtarget \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 119\u001b[0m     numdigits \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mint\u001b[39m(\u001b[43mmath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlog10\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget\u001b[49m\u001b[43m)\u001b[49m) \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m    120\u001b[0m     bar \u001b[38;5;241m=\u001b[39m (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(numdigits) \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124md/\u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m%\u001b[39m (current, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtarget)\n\u001b[1;32m    121\u001b[0m     bar \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\x1b\u001b[39;00m\u001b[38;5;124m[1m\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mbar\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\x1b\u001b[39;00m\u001b[38;5;124m[0m \u001b[39m\u001b[38;5;124m\"\u001b[39m\n",
+      "\u001b[0;31mValueError\u001b[0m: math domain error"
+     ]
+    }
+   ],
+   "source": [
+    "# ==========================\n",
+    "# 1) Install/Import Dependencies\n",
+    "# ==========================\n",
+    "# If you are in a brand new environment, uncomment the following line:\n",
+    "# %pip install tensorflow pandas\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import tensorflow as tf\n",
+    "from tensorflow.keras.preprocessing.text import Tokenizer\n",
+    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
+    "from tensorflow.keras.layers import Input, LSTM, Embedding, Dense\n",
+    "from tensorflow.keras.models import Model\n",
+    "\n",
+    "# ==========================\n",
+    "# 2) Load Dataset (CSV)\n",
+    "# ==========================\n",
+    "# Adjust the file path to your CSV file\n",
+    "df = pd.read_csv(\"quiz_questions.csv\")\n",
+    "\n",
+    "# Extract the paragraphs and questions\n",
+    "paragraphs = df['paragraph'].astype(str).tolist()\n",
+    "questions  = df['question'].astype(str).tolist()\n",
+    "\n",
+    "# (Optional) For demonstration, let's ignore question_type, answer, distractors in this example\n",
+    "# but you can incorporate them as extra signals if you wish.\n",
+    "\n",
+    "# ==========================\n",
+    "# 3) Tokenize Text\n",
+    "# ==========================\n",
+    "# Create two tokenizers: one for paragraphs, one for questions\n",
+    "num_words = 10000  # Maximum vocabulary size\n",
+    "\n",
+    "tokenizer_paragraph = Tokenizer(num_words=num_words, oov_token=\"<OOV>\")\n",
+    "tokenizer_paragraph.fit_on_texts(paragraphs)\n",
+    "paragraph_sequences = tokenizer_paragraph.texts_to_sequences(paragraphs)\n",
+    "\n",
+    "tokenizer_question = Tokenizer(num_words=num_words, oov_token=\"<OOV>\")\n",
+    "tokenizer_question.fit_on_texts(questions)\n",
+    "question_sequences = tokenizer_question.texts_to_sequences(questions)\n",
+    "\n",
+    "# Get max lengths (for padding)\n",
+    "max_paragraph_len = max(len(seq) for seq in paragraph_sequences)\n",
+    "max_question_len  = max(len(seq) for seq in question_sequences)\n",
+    "\n",
+    "# Pad sequences\n",
+    "encoder_input_data = pad_sequences(paragraph_sequences, maxlen=max_paragraph_len, padding='post')\n",
+    "# For decoder data, we usually do teacher forcing:\n",
+    "# We'll keep one version as input, one version shifted as the target\n",
+    "decoder_input_data_full = pad_sequences(question_sequences, maxlen=max_question_len, padding='post')\n",
+    "\n",
+    "# We create decoder_target_data by shifting to the left by 1 token\n",
+    "decoder_target_data = np.copy(decoder_input_data_full[:, 1:])\n",
+    "decoder_input_data  = np.copy(decoder_input_data_full[:, :-1])\n",
+    "\n",
+    "# Expand target dimension for sparse_categorical_crossentropy\n",
+    "decoder_target_data = np.expand_dims(decoder_target_data, -1)\n",
+    "\n",
+    "# Calculate vocab sizes\n",
+    "vocab_size_paragraph = min(len(tokenizer_paragraph.word_index) + 1, num_words)\n",
+    "vocab_size_question  = min(len(tokenizer_question.word_index)  + 1, num_words)\n",
+    "\n",
+    "# ==========================\n",
+    "# 4) Build Seq2Seq Model\n",
+    "# ==========================\n",
+    "embedding_dim = 128\n",
+    "latent_dim    = 256  # LSTM hidden dimension\n",
+    "\n",
+    "# ----- Encoder -----\n",
+    "encoder_inputs = Input(shape=(None,), name=\"encoder_inputs\")\n",
+    "encoder_embedding = Embedding(input_dim=vocab_size_paragraph,\n",
+    "                              output_dim=embedding_dim,\n",
+    "                              mask_zero=True)(encoder_inputs)\n",
+    "\n",
+    "encoder_lstm = LSTM(latent_dim, return_state=True, name=\"encoder_lstm\")\n",
+    "_, state_h, state_c = encoder_lstm(encoder_embedding)\n",
+    "\n",
+    "encoder_states = [state_h, state_c]\n",
+    "\n",
+    "# ----- Decoder -----\n",
+    "decoder_inputs = Input(shape=(None,), name=\"decoder_inputs\")\n",
+    "decoder_embedding = Embedding(input_dim=vocab_size_question,\n",
+    "                              output_dim=embedding_dim,\n",
+    "                              mask_zero=True,\n",
+    "                              name=\"decoder_embedding\")(decoder_inputs)\n",
+    "\n",
+    "decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name=\"decoder_lstm\")\n",
+    "decoder_outputs, _, _ = decoder_lstm(decoder_embedding,\n",
+    "                                     initial_state=encoder_states)\n",
+    "decoder_dense = Dense(vocab_size_question, activation='softmax', name=\"decoder_dense\")\n",
+    "decoder_outputs = decoder_dense(decoder_outputs)\n",
+    "\n",
+    "# Combine into a training model\n",
+    "model = Model([encoder_inputs, decoder_inputs], decoder_outputs)\n",
+    "model.compile(optimizer='adam',\n",
+    "              loss='sparse_categorical_crossentropy',\n",
+    "              metrics=['sparse_categorical_accuracy'])\n",
+    "\n",
+    "print(model.summary())\n",
+    "\n",
+    "# ==========================\n",
+    "# 5) Train/Test Split (Optional)\n",
+    "# ==========================\n",
+    "# For simplicity, let's do a quick train/validation split\n",
+    "# Adjust split size or do a separate test set for production usage.\n",
+    "num_samples = len(encoder_input_data)\n",
+    "if num_samples < 2:\n",
+    "    raise ValueError(\n",
+    "        f\"Need at least 2 samples for a train/validation split, got {num_samples}.\"\n",
+    "    )\n",
+    "\n",
+    "# Clamp the 80/20 cut so neither side is empty: with very few rows int(0.8 * n)\n",
+    "# can be 0, which leaves model.fit with no training batches and surfaces as an\n",
+    "# opaque 'math domain error' raised from the Keras progress bar.\n",
+    "split_index = min(max(1, int(0.8 * num_samples)), num_samples - 1)\n",
+    "encoder_train = encoder_input_data[:split_index]\n",
+    "decoder_train = decoder_input_data[:split_index]\n",
+    "target_train  = decoder_target_data[:split_index]\n",
+    "\n",
+    "encoder_val = encoder_input_data[split_index:]\n",
+    "decoder_val = decoder_input_data[split_index:]\n",
+    "target_val  = decoder_target_data[split_index:]\n",
+    "\n",
+    "# ==========================\n",
+    "# 6) Fit the Model\n",
+    "# ==========================\n",
+    "history = model.fit(\n",
+    "    [encoder_train, decoder_train],\n",
+    "    target_train,\n",
+    "    batch_size=32,\n",
+    "    epochs=10,\n",
+    "    validation_data=([encoder_val, decoder_val], target_val)\n",
+    ")\n",
+    "\n",
+    "# The accuracy reported is \"sparse_categorical_accuracy\" at the token level.\n",
+    "\n",
+    "# ==========================\n",
+    "# 7) Evaluate the Model\n",
+    "# ==========================\n",
+    "# If you want a quick evaluation on the validation set:\n",
+    "val_loss, val_accuracy = model.evaluate([encoder_val, decoder_val], target_val)\n",
+    "print(f\"Validation Loss: {val_loss:.4f}\")\n",
+    "print(f\"Validation Accuracy (token-level): {val_accuracy:.4f}\")\n",
+    "\n",
+    "# ==========================\n",
+    "# 8) Build Inference Models\n",
+    "# ==========================\n",
+    "# Encoder model for inference\n",
+    "encoder_model_inf = Model(encoder_inputs, encoder_states)\n",
+    "\n",
+    "# Decoder model for inference\n",
+    "# It takes the previous token plus the previous LSTM states and returns the\n",
+    "# next-token distribution plus the updated states, so decoding can run step by step.\n",
+    "decoder_state_input_h = Input(shape=(latent_dim,), name=\"inference_state_h\")\n",
+    "decoder_state_input_c = Input(shape=(latent_dim,), name=\"inference_state_c\")\n",
+    "decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]\n",
+    "\n",
+    "# `decoder_embedding` is the embedding layer's *output tensor* (the layer was\n",
+    "# applied to decoder_inputs when it was created), so it cannot be called.\n",
+    "# Fetch the trained layer itself from the training model by its name.\n",
+    "decoder_embedding_layer = model.get_layer(\"decoder_embedding\")\n",
+    "dec_emb_inf = decoder_embedding_layer(decoder_inputs)\n",
+    "decoder_inf_outputs, state_h_inf, state_c_inf = decoder_lstm(\n",
+    "    dec_emb_inf, initial_state=decoder_states_inputs\n",
+    ")\n",
+    "decoder_inf_states = [state_h_inf, state_c_inf]\n",
+    "decoder_inf_outputs = decoder_dense(decoder_inf_outputs)\n",
+    "\n",
+    "decoder_model_inf = Model(\n",
+    "    [decoder_inputs] + decoder_states_inputs,\n",
+    "    [decoder_inf_outputs] + decoder_inf_states\n",
+    ")\n",
+    "\n",
+    "# Create index-to-word mapping for the question tokenizer\n",
+    "index_to_word_question = {idx: word for word, idx in tokenizer_question.word_index.items()}\n",
+    "# If you used an OOV token, might want to handle that as well.\n",
+    "\n",
+    "def generate_question(paragraph_text, max_length=50, start_token=None, end_token=None):\n",
+    "    \"\"\"\n",
+    "    Generate a question from a paragraph using the trained seq2seq model.\n",
+    "    Token-level decoding with greedy search.\n",
+    "\n",
+    "    Args:\n",
+    "        paragraph_text: Raw paragraph string to condition the decoder on.\n",
+    "        max_length: Maximum number of decoding steps.\n",
+    "        start_token: Optional word used to seed the decoder. When it is None or\n",
+    "            not in the question tokenizer's vocabulary, index 0 is used.\n",
+    "        end_token: Optional word that stops decoding as soon as it is sampled.\n",
+    "\n",
+    "    Returns:\n",
+    "        The decoded words joined by single spaces (possibly an empty string).\n",
+    "    \"\"\"\n",
+    "    # 1) Encode the paragraph into the decoder's initial states\n",
+    "    seq = tokenizer_paragraph.texts_to_sequences([paragraph_text])\n",
+    "    seq = pad_sequences(seq, maxlen=max_paragraph_len, padding='post')\n",
+    "    states_value = encoder_model_inf.predict(seq, verbose=0)\n",
+    "\n",
+    "    # 2) Seed the decoder input; honour start_token when the tokenizer knows it\n",
+    "    target_seq = np.zeros((1, 1), dtype='int32')\n",
+    "    if start_token is not None:\n",
+    "        target_seq[0, 0] = tokenizer_question.word_index.get(start_token, 0)\n",
+    "\n",
+    "    decoded_words = []\n",
+    "\n",
+    "    for _ in range(max_length):\n",
+    "        # verbose=0 keeps predict from printing a progress bar on every step\n",
+    "        output_tokens, h, c = decoder_model_inf.predict([target_seq] + states_value, verbose=0)\n",
+    "\n",
+    "        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))\n",
+    "\n",
+    "        # Index 0 is the value pad_sequences fills with and has no entry in\n",
+    "        # index_to_word_question, so treat it as the end of the question rather\n",
+    "        # than emitting '<UNK>' and feeding padding back in until max_length.\n",
+    "        if sampled_token_index == 0:\n",
+    "            break\n",
+    "\n",
+    "        sampled_word = index_to_word_question.get(sampled_token_index, '<UNK>')\n",
+    "\n",
+    "        # Stop if we encounter the caller-supplied end token\n",
+    "        if end_token and (sampled_word == end_token):\n",
+    "            break\n",
+    "\n",
+    "        decoded_words.append(sampled_word)\n",
+    "\n",
+    "        # Next step: feed the sampled token and the updated LSTM states back in\n",
+    "        target_seq = np.zeros((1, 1), dtype='int32')\n",
+    "        target_seq[0, 0] = sampled_token_index\n",
+    "\n",
+    "        states_value = [h, c]\n",
+    "\n",
+    "    return ' '.join(decoded_words)\n",
+    "\n",
+    "# ==========================\n",
+    "# 9) Test Inference on a Paragraph\n",
+    "# ==========================\n",
+    "test_paragraph = \"Albert Einstein was a theoretical physicist born in Germany...\"\n",
+    "generated = generate_question(test_paragraph)\n",
+    "print(\"Generated question:\", generated)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "yups 0\n",
+      "yups 1\n",
+      "yups 2\n",
+      "yups 3\n",
+      "yups 4\n",
+      "yups 5\n",
+      "yups 6\n",
+      "yups 7\n",
+      "yups 8\n",
+      "yups 9\n",
+      "yups 10\n",
+      "yups 11\n",
+      "yups 12\n",
+      "yups 13\n",
+      "yups 14\n",
+      "yups 15\n",
+      "yups 16\n",
+      "yups 17\n",
+      "yups 18\n",
+      "yups 19\n"
+     ]
+    }
+   ],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/lstm_question_generator.keras
+++ b/lstm_question_generator.keras
--- a/main.py
+++ b/main.py
@ -0,0 +1,123 @@
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+import tensorflow as tf
+import pickle
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import (
+    LSTM,
+    Embedding,
+    Dense,
+    SpatialDropout1D,
+    TimeDistributed,
+)
+from tensorflow.keras.optimizers import Adam
+
+# -----------------------
+# 1. Load dataset
+# -----------------------
+df = pd.read_csv("quiz_questions.csv")
+
+# Drop rows where 'paragraph' or 'question' is missing (NaN)
+df.dropna(subset=["paragraph", "question"], inplace=True)
+
+
+# -----------------------
+# 2. Preprocessing text
+# -----------------------
+def preprocess_text(text):
+    # Minimal preprocessing example: lowercase the text and return it
+    text = text.lower()
+    return text
+
+
+df["paragraph"] = df["paragraph"].astype(str).apply(preprocess_text)
+df["question"] = df["question"].astype(str).apply(preprocess_text)
+
+# -----------------------
+# 3. Tokenization
+# -----------------------
+# Fit on all texts (paragraph + question) so the vocabulary covers the words of both
+tokenizer = Tokenizer()
+tokenizer.fit_on_texts(df["paragraph"].tolist() + df["question"].tolist())
+vocab_size = len(tokenizer.word_index) + 1  # +1 because word indices start at 1
+
+# Convert each text into a sequence of integer word indices
+X_sequences = tokenizer.texts_to_sequences(df["paragraph"])
+y_sequences = tokenizer.texts_to_sequences(df["question"])
+
+# Find the longest sequence (so padding yields a uniform length)
+max_len_paragraph = max(len(seq) for seq in X_sequences)
+max_len_question = max(len(seq) for seq in y_sequences)
+max_length = max(max_len_paragraph, max_len_question)
+
+# Post-pad the sequences so every one has length max_length
+X_padded = pad_sequences(X_sequences, maxlen=max_length, padding="post")
+y_padded = pad_sequences(y_sequences, maxlen=max_length, padding="post")
+
+with open("tokenizer.pkl", "wb") as f:
+    pickle.dump(tokenizer, f)
+print("Tokenizer disimpan ke tokenizer.pkl")
+
+# -----------------------
+# 4. Prepare X, y
+# -----------------------
+# For sequence-to-sequence training with "sparse_categorical_crossentropy",
+# y should ideally have shape: (num_samples, max_length, 1)
+X = np.array(X_padded)
+y = np.expand_dims(np.array(y_padded), axis=-1)
+
+print("Shape X:", X.shape)
+print("Shape y:", y.shape)  # (batch_size, max_length, 1)
+
+# -----------------------
+# 5. Split data
+# -----------------------
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=42
+)
+
+print("Train size:", X_train.shape, y_train.shape)
+print("Test size: ", X_test.shape, y_test.shape)
+
+# -----------------------
+# 6. Build Model LSTM
+# -----------------------
+# Two stacked LSTM layers, each with return_sequences=True,
+# so the final output is still a "sequence" (batch_size, max_length, hidden_dim)
+model = Sequential()
+model.add(Embedding(input_dim=vocab_size, output_dim=128))
+model.add(SpatialDropout1D(0.2))
+
+model.add(LSTM(128, return_sequences=True))
+model.add(LSTM(128, return_sequences=True))
+
+# TimeDistributed Dense so the Dense layer is applied at every timestep
+model.add(TimeDistributed(Dense(vocab_size, activation="softmax")))
+
+# -----------------------
+# 7. Compile
+# -----------------------
+model.compile(
+    loss="sparse_categorical_crossentropy",
+    optimizer=Adam(learning_rate=0.001),
+    metrics=["accuracy"],
+)
+
+model.summary()
+
+# -----------------------
+# 8. Train Model
+# -----------------------
+epochs = 10
+history = model.fit(  # the held-out test split is reused as validation data
+    X_train, y_train, epochs=epochs, validation_data=(X_test, y_test), batch_size=32
+)
+
+# -----------------------
+# 9. Save Model
+# -----------------------
+model.save("lstm_question_generator.keras")
+print("Training selesai dan model telah disimpan.")
--- a/quiz_questions.csv
+++ b/quiz_questions.csv
@ -0,0 +1,11 @@
+paragraph,question_type,question,answer,options
+Albert Einstein adalah ilmuwan yang menemukan teori relativitas.,fill_in_the_blank,Siapakah ilmuwan yang menemukan teori relativitas?,Albert Einstein,-
+Isaac Newton menemukan hukum gravitasi setelah melihat apel jatuh dari pohon.,fill_in_the_blank,Siapakah ilmuwan yang menemukan hukum gravitasi?,Isaac Newton,-
+Bumi mengorbit mengelilingi Matahari dalam waktu sekitar 365 hari.,multiple_choice,Berapa lama Bumi mengorbit Matahari?,365 hari,360 hari|365 hari|366 hari|370 hari
+Proklamasi Kemerdekaan Indonesia terjadi pada tanggal 17 Agustus 1945.,fill_in_the_blank,Kapan Proklamasi Kemerdekaan Indonesia terjadi?,17 Agustus 1945,-
+Hewan yang dapat hidup di dua alam disebut amfibi.,true_false,Hewan yang dapat hidup di dua alam disebut reptil.,False,True|False
+Gunung tertinggi di dunia adalah Gunung Everest.,fill_in_the_blank,Gunung apakah yang tertinggi di dunia?,Gunung Everest,-
+Lapisan terluar dari atmosfer Bumi disebut eksosfer.,multiple_choice,Apa nama lapisan terluar atmosfer Bumi?,Eksosfer,Troposfer|Stratosfer|Mesosfer|Eksosfer
+Ibu Kota Jepang adalah Tokyo.,true_false,Ibu Kota Jepang adalah Kyoto.,False,True|False
+Fotosintesis adalah proses yang dilakukan tumbuhan untuk membuat makanan sendiri.,fill_in_the_blank,Apa nama proses yang dilakukan tumbuhan untuk membuat makanannya sendiri?,Fotosintesis,-
+Dataran tertinggi di dunia adalah Dataran Tinggi Tibet.,fill_in_the_blank,Apa nama dataran tertinggi di dunia?,Dataran Tinggi Tibet,-
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,5 @@
+numpy
+pandas
+matplotlib
+scikit-learn
+tensorflow
--- a/test.py
+++ b/test.py
@ -0,0 +1,68 @@
+import numpy as np
+import re
+from tensorflow.keras.models import load_model
+
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+import pickle
+
+# Assume we have a tokenizer, a model, and max_length:
+# tokenizer, model, max_length = ...
+# Make sure you load the model & tokenizer that match your environment.
+
+
+def preprocess_text(text):
+    # Simple preprocessing (must be the same as, or close to, what training used)
+    text = text.lower()
+    # Add further adjustments here if needed
+    return text
+
+
+def generate_question(paragraph, tokenizer, model, max_length):
+    # 1) Preprocess paragraph
+    paragraph = preprocess_text(paragraph)
+
+    # 2) Tokenize
+    seq = tokenizer.texts_to_sequences([paragraph])  # the result is a list of lists
+    # 3) Pad sequence
+    padded = pad_sequences(seq, maxlen=max_length, padding="post")
+
+    # 4) Get the model prediction => shape: (1, max_length, vocab_size)
+    prediction = model.predict(padded)  # (1, max_length, vocab_size)
+
+    # 5) Take the argmax at every time step => (1, max_length); [0] picks the single sample
+    predicted_indices = np.argmax(prediction, axis=-1)[0]
+
+    # 6) Convert indices back to words
+    predicted_words = []
+    for idx in predicted_indices:
+        # idx == 0 usually means an 'unknown' or 'pad' token, depending on the tokenizer settings (presumably padding here - TODO confirm)
+        if idx == 0:
+            # Stop right away, since the remaining positions are most likely padding
+            break
+        word = tokenizer.index_word.get(idx, "")
+        predicted_words.append(word)
+
+    # 7) Join the words into a single sentence
+    predicted_question = " ".join(predicted_words)
+
+    # Append a question mark when the text does not already end with one
+    if not predicted_question.endswith("?"):
+        predicted_question = predicted_question + "?"
+
+    return predicted_question
+
+
+model = load_model("lstm_question_generator.keras")
+
+
+
+with open("tokenizer.pkl", "rb") as f:
+    tokenizer = pickle.load(f)
+
+# Make sure max_length is the same as the value used during training
+max_length = 50  # Or the value you chose. NOTE(review): hard-coded; main.py derives max_length from the data - confirm they match
+
+paragraph_input = "Albert Einstein mengembangkan teori relativitas dan membantu mengembangkan fisika kuantum."
+
+generated_q = generate_question(paragraph_input, tokenizer, model, max_length)
+print("Generated question:", generated_q)
--- a/tokenizer.pkl
+++ b/tokenizer.pkl
				`@ -0,0 +1 @@`
				`,akeon,fedora,05.02.2025 02:37,file:///home/akeon/.config/libreoffice/4;`