commit fb6ff8512ba6506aff36a5bd491cdb675ffe0e34
Author: akhdanre
Date:   Wed Feb 5 02:45:10 2025 +0700

    feat: lstm format for generate quiz

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b694934
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.venv
\ No newline at end of file
diff --git a/.~lock.quiz_questions.csv# b/.~lock.quiz_questions.csv#
new file mode 100644
index 0000000..3a57694
--- /dev/null
+++ b/.~lock.quiz_questions.csv#
@@ -0,0 +1 @@
+,akeon,fedora,05.02.2025 02:37,file:///home/akeon/.config/libreoffice/4;
\ No newline at end of file
diff --git a/lstm.ipynb b/lstm.ipynb
new file mode 100644
index 0000000..faf2f70
--- /dev/null
+++ b/lstm.ipynb
@@ -0,0 +1,447 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2025-02-05 01:57:25.675154: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "
Model: \"functional\"\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1mModel: \"functional\"\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
+       "┃ Layer (type)         Output Shape          Param #  Connected to      ┃\n",
+       "┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
+       "│ encoder_inputs      │ (None, None)      │          0 │ -                 │\n",
+       "│ (InputLayer)        │                   │            │                   │\n",
+       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
+       "│ decoder_inputs      │ (None, None)      │          0 │ -                 │\n",
+       "│ (InputLayer)        │                   │            │                   │\n",
+       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
+       "│ embedding           │ (None, None, 128) │      1,280 │ encoder_inputs[0… │\n",
+       "│ (Embedding)         │                   │            │                   │\n",
+       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
+       "│ not_equal           │ (None, None)      │          0 │ encoder_inputs[0… │\n",
+       "│ (NotEqual)          │                   │            │                   │\n",
+       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
+       "│ decoder_embedding   │ (None, None, 128) │      1,024 │ decoder_inputs[0… │\n",
+       "│ (Embedding)         │                   │            │                   │\n",
+       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
+       "│ encoder_lstm (LSTM) │ [(None, 256),     │    394,240 │ embedding[0][0],  │\n",
+       "│                     │ (None, 256),      │            │ not_equal[0][0]   │\n",
+       "│                     │ (None, 256)]      │            │                   │\n",
+       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
+       "│ decoder_lstm (LSTM) │ [(None, None,     │    394,240 │ decoder_embeddin… │\n",
+       "│                     │ 256), (None,      │            │ encoder_lstm[0][ │\n",
+       "│                     │ 256), (None,      │            │ encoder_lstm[0][ │\n",
+       "│                     │ 256)]             │            │                   │\n",
+       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
+       "│ decoder_dense       │ (None, None, 8)   │      2,056 │ decoder_lstm[0][ │\n",
+       "│ (Dense)             │                   │            │                   │\n",
+       "└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n",
+       "
\n" + ], + "text/plain": [ + "┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mConnected to \u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n", + "│ encoder_inputs │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n", + "│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n", + "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", + "│ decoder_inputs │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n", + "│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n", + "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", + "│ embedding │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m1,280\u001b[0m │ encoder_inputs[\u001b[38;5;34m0\u001b[0m… │\n", + "│ (\u001b[38;5;33mEmbedding\u001b[0m) │ │ │ │\n", + "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", + "│ not_equal │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ encoder_inputs[\u001b[38;5;34m0\u001b[0m… │\n", + "│ (\u001b[38;5;33mNotEqual\u001b[0m) │ │ │ │\n", + "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", + "│ decoder_embedding │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m1,024\u001b[0m │ decoder_inputs[\u001b[38;5;34m0\u001b[0m… │\n", + "│ (\u001b[38;5;33mEmbedding\u001b[0m) │ │ │ │\n", + "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", + "│ encoder_lstm (\u001b[38;5;33mLSTM\u001b[0m) │ [(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m), │ \u001b[38;5;34m394,240\u001b[0m │ embedding[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m], │\n", + "│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m), │ │ not_equal[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n", + "│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m)] │ │ │\n", + "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", + "│ decoder_lstm (\u001b[38;5;33mLSTM\u001b[0m) │ [(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, │ \u001b[38;5;34m394,240\u001b[0m │ decoder_embeddin… │\n", + "│ │ \u001b[38;5;34m256\u001b[0m), (\u001b[38;5;45mNone\u001b[0m, │ │ encoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n", + "│ │ \u001b[38;5;34m256\u001b[0m), (\u001b[38;5;45mNone\u001b[0m, │ │ encoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n", + "│ │ \u001b[38;5;34m256\u001b[0m)] │ │ │\n", + "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", + "│ decoder_dense │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m8\u001b[0m) │ \u001b[38;5;34m2,056\u001b[0m │ decoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n", + "│ (\u001b[38;5;33mDense\u001b[0m) │ │ │ │\n", + "└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
 Total params: 792,840 (3.02 MB)\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m792,840\u001b[0m (3.02 MB)\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
 Trainable params: 792,840 (3.02 MB)\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m792,840\u001b[0m (3.02 MB)\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
 Non-trainable params: 0 (0.00 B)\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "None\n", + "Epoch 1/10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-02-05 01:57:27.530017: E tensorflow/core/util/util.cc:131] oneDNN supports DT_BOOL only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.\n", + "2025-02-05 01:57:27.593630: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence\n", + "\t [[{{node IteratorGetNext}}]]\n", + "/mnt/disc1/code/lstm-quiz/.venv/lib64/python3.10/site-packages/keras/src/trainers/epoch_iterator.py:151: UserWarning: Your input ran out of data; interrupting training. Make sure that your dataset or generator can generate at least `steps_per_epoch * epochs` batches. You may need to use the `.repeat()` function when building your dataset.\n", + " self._interrupted_warning()\n" + ] + }, + { + "ename": "ValueError", + "evalue": "math domain error", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[6], line 118\u001b[0m\n\u001b[1;32m 113\u001b[0m target_val \u001b[38;5;241m=\u001b[39m decoder_target_data[split_index:]\n\u001b[1;32m 115\u001b[0m \u001b[38;5;66;03m# ==========================\u001b[39;00m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;66;03m# 6) Fit the Model\u001b[39;00m\n\u001b[1;32m 117\u001b[0m \u001b[38;5;66;03m# ==========================\u001b[39;00m\n\u001b[0;32m--> 118\u001b[0m history \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 119\u001b[0m \u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43mencoder_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecoder_train\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 120\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_train\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 121\u001b[0m \u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m32\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 122\u001b[0m \u001b[43m \u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 123\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalidation_data\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43mencoder_val\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecoder_val\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtarget_val\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 124\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m 126\u001b[0m \u001b[38;5;66;03m# The accuracy reported is \"sparse_categorical_accuracy\" at the token level.\u001b[39;00m\n\u001b[1;32m 127\u001b[0m \n\u001b[1;32m 128\u001b[0m \u001b[38;5;66;03m# ==========================\u001b[39;00m\n\u001b[1;32m 129\u001b[0m \u001b[38;5;66;03m# 7) Evaluate the Model\u001b[39;00m\n\u001b[1;32m 130\u001b[0m \u001b[38;5;66;03m# ==========================\u001b[39;00m\n\u001b[1;32m 131\u001b[0m \u001b[38;5;66;03m# If you want a quick evaluation on the 
validation set:\u001b[39;00m\n\u001b[1;32m 132\u001b[0m val_loss, val_accuracy \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mevaluate([encoder_val, decoder_val], target_val)\n", + "File \u001b[0;32m/mnt/disc1/code/lstm-quiz/.venv/lib64/python3.10/site-packages/keras/src/utils/traceback_utils.py:122\u001b[0m, in \u001b[0;36mfilter_traceback..error_handler\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 119\u001b[0m filtered_tb \u001b[38;5;241m=\u001b[39m _process_traceback_frames(e\u001b[38;5;241m.\u001b[39m__traceback__)\n\u001b[1;32m 120\u001b[0m \u001b[38;5;66;03m# To get the full stack trace, call:\u001b[39;00m\n\u001b[1;32m 121\u001b[0m \u001b[38;5;66;03m# `keras.config.disable_traceback_filtering()`\u001b[39;00m\n\u001b[0;32m--> 122\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\u001b[38;5;241m.\u001b[39mwith_traceback(filtered_tb) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 123\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m filtered_tb\n", + "File \u001b[0;32m/mnt/disc1/code/lstm-quiz/.venv/lib64/python3.10/site-packages/keras/src/utils/progbar.py:119\u001b[0m, in \u001b[0;36mProgbar.update\u001b[0;34m(self, current, values, finalize)\u001b[0m\n\u001b[1;32m 116\u001b[0m message \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtarget \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 119\u001b[0m numdigits \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mint\u001b[39m(\u001b[43mmath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlog10\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget\u001b[49m\u001b[43m)\u001b[49m) \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 120\u001b[0m bar \u001b[38;5;241m=\u001b[39m (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(numdigits) \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124md/\u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m%\u001b[39m (current, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtarget)\n\u001b[1;32m 121\u001b[0m bar \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\x1b\u001b[39;00m\u001b[38;5;124m[1m\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mbar\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\x1b\u001b[39;00m\u001b[38;5;124m[0m \u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "\u001b[0;31mValueError\u001b[0m: math domain error" + ] + } + ], + "source": [ + "# ==========================\n", + "# 1) Install/Import Dependencies\n", + "# ==========================\n", + "# If you are in a brand new environment, uncomment the following line:\n", + "# %pip install tensorflow pandas\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import tensorflow as tf\n", + "from tensorflow.keras.preprocessing.text import Tokenizer\n", + "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", + "from tensorflow.keras.layers import Input, LSTM, Embedding, Dense\n", + "from 
tensorflow.keras.models import Model\n",
+    "\n",
+    "# ==========================\n",
+    "# 2) Load Dataset (CSV)\n",
+    "# ==========================\n",
+    "# Adjust the file path to your CSV file\n",
+    "df = pd.read_csv(\"quiz_questions.csv\")\n",
+    "\n",
+    "# Extract the paragraphs and questions\n",
+    "paragraphs = df['paragraph'].astype(str).tolist()\n",
+    "questions = df['question'].astype(str).tolist()\n",
+    "\n",
+    "# (Optional) For demonstration, we ignore the question_type, answer, and options columns here,\n",
+    "# but you can incorporate them as extra signals if you wish.\n",
+    "\n",
+    "# ==========================\n",
+    "# 3) Tokenize Text\n",
+    "# ==========================\n",
+    "# Create two tokenizers: one for paragraphs, one for questions\n",
+    "num_words = 10000  # Maximum vocabulary size\n",
+    "\n",
+    "tokenizer_paragraph = Tokenizer(num_words=num_words, oov_token=\"<OOV>\")\n",
+    "tokenizer_paragraph.fit_on_texts(paragraphs)\n",
+    "paragraph_sequences = tokenizer_paragraph.texts_to_sequences(paragraphs)\n",
+    "\n",
+    "tokenizer_question = Tokenizer(num_words=num_words, oov_token=\"<OOV>\")\n",
+    "tokenizer_question.fit_on_texts(questions)\n",
+    "question_sequences = tokenizer_question.texts_to_sequences(questions)\n",
+    "\n",
+    "# Get max lengths (for padding)\n",
+    "max_paragraph_len = max(len(seq) for seq in paragraph_sequences)\n",
+    "max_question_len = max(len(seq) for seq in question_sequences)\n",
+    "\n",
+    "# Pad sequences\n",
+    "encoder_input_data = pad_sequences(paragraph_sequences, maxlen=max_paragraph_len, padding='post')\n",
+    "# For decoder data, we usually do teacher forcing:\n",
+    "# we keep one version as input, and one version shifted left as the target\n",
+    "decoder_input_data_full = pad_sequences(question_sequences, maxlen=max_question_len, padding='post')\n",
+    "\n",
+    "# We create decoder_target_data by shifting to the left by 1 token\n",
+    "decoder_target_data = np.copy(decoder_input_data_full[:, 1:])\n",
+    "decoder_input_data = np.copy(decoder_input_data_full[:, :-1])\n",
+    "\n",
+    "# Expand target dimension for sparse_categorical_crossentropy\n",
+    "decoder_target_data = np.expand_dims(decoder_target_data, -1)\n",
+    "\n",
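+    "# Illustration of the teacher-forcing shift above, with hypothetical\n",
+    "# token ids (not taken from the real tokenizer):\n",
+    "#   padded question:        [12,  7, 33,  5,  0]\n",
+    "#   decoder_input_data  ->  [12,  7, 33,  5]     (all but the last position)\n",
+    "#   decoder_target_data ->  [ 7, 33,  5,  0]     (all but the first position)\n",
+    "# At each timestep the decoder is fed the ground-truth previous token and is\n",
+    "# trained to predict the next one.\n",
+    "\n",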
+    "# Calculate vocab sizes\n",
+    "vocab_size_paragraph = min(len(tokenizer_paragraph.word_index) + 1, num_words)\n",
+    "vocab_size_question = min(len(tokenizer_question.word_index) + 1, num_words)\n",
+    "\n",
+    "# ==========================\n",
+    "# 4) Build Seq2Seq Model\n",
+    "# ==========================\n",
+    "embedding_dim = 128\n",
+    "latent_dim = 256  # LSTM hidden dimension\n",
+    "\n",
+    "# ----- Encoder -----\n",
+    "encoder_inputs = Input(shape=(None,), name=\"encoder_inputs\")\n",
+    "encoder_embedding = Embedding(input_dim=vocab_size_paragraph,\n",
+    "                              output_dim=embedding_dim,\n",
+    "                              mask_zero=True)(encoder_inputs)\n",
+    "\n",
+    "encoder_lstm = LSTM(latent_dim, return_state=True, name=\"encoder_lstm\")\n",
+    "_, state_h, state_c = encoder_lstm(encoder_embedding)\n",
+    "\n",
+    "encoder_states = [state_h, state_c]\n",
+    "\n",
+    "# ----- Decoder -----\n",
+    "decoder_inputs = Input(shape=(None,), name=\"decoder_inputs\")\n",
+    "decoder_embedding = Embedding(input_dim=vocab_size_question,\n",
+    "                              output_dim=embedding_dim,\n",
+    "                              mask_zero=True,\n",
+    "                              name=\"decoder_embedding\")(decoder_inputs)\n",
+    "\n",
+    "decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name=\"decoder_lstm\")\n",
+    "decoder_outputs, _, _ = decoder_lstm(decoder_embedding,\n",
+    "                                     initial_state=encoder_states)\n",
+    "decoder_dense = Dense(vocab_size_question, activation='softmax', name=\"decoder_dense\")\n",
+    "decoder_outputs = decoder_dense(decoder_outputs)\n",
+    "\n",
+    "# Combine into a training model\n",
+    "model = Model([encoder_inputs, decoder_inputs], decoder_outputs)\n",
+    "model.compile(optimizer='adam',\n",
+    "              loss='sparse_categorical_crossentropy',\n",
+    "              metrics=['sparse_categorical_accuracy'])\n",
+    "\n",
+    "print(model.summary())\n",
+    "\n",
+    "# ==========================\n",
+    "# 5) Train/Test Split (Optional)\n",
+    "# ==========================\n",
+    "# For simplicity, let's do a quick train/validation split.\n",
+    "# Adjust the split size or use a separate test set for production usage.\n",
+    "split_index = int(0.8 * len(encoder_input_data))\n",
+    "encoder_train = encoder_input_data[:split_index]\n",
+    "decoder_train = decoder_input_data[:split_index]\n",
+    "target_train = decoder_target_data[:split_index]\n",
+    "\n",
+    "encoder_val = encoder_input_data[split_index:]\n",
+    "decoder_val = decoder_input_data[split_index:]\n",
+    "target_val = decoder_target_data[split_index:]\n",
+    "\n",
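+    "# NOTE: quiz_questions.csv ships only 10 rows, so the 80/20 split above\n",
+    "# leaves just 8 training and 2 validation samples. That is very likely why\n",
+    "# the run captured in this notebook's outputs hit Keras's \"input ran out of\n",
+    "# data\" warning and then crashed with 'ValueError: math domain error' from\n",
+    "# math.log10 inside the Keras progress bar. Train on a realistically sized\n",
+    "# dataset to avoid this.\n",
+    "\n",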
+    "# ==========================\n",
+    "# 6) Fit the Model\n",
+    "# ==========================\n",
+    "history = model.fit(\n",
+    "    [encoder_train, decoder_train],\n",
+    "    target_train,\n",
+    "    batch_size=32,\n",
+    "    epochs=10,\n",
+    "    validation_data=([encoder_val, decoder_val], target_val)\n",
+    ")\n",
+    "\n",
+    "# The accuracy reported is \"sparse_categorical_accuracy\" at the token level.\n",
+    "\n",
+    "# ==========================\n",
+    "# 7) Evaluate the Model\n",
+    "# ==========================\n",
+    "# If you want a quick evaluation on the validation set:\n",
+    "val_loss, val_accuracy = model.evaluate([encoder_val, decoder_val], target_val)\n",
+    "print(f\"Validation Loss: {val_loss:.4f}\")\n",
+    "print(f\"Validation Accuracy (token-level): {val_accuracy:.4f}\")\n",
+    "\n",
+    "# ==========================\n",
+    "# 8) Build Inference Models\n",
+    "# ==========================\n",
+    "# Encoder model for inference\n",
+    "encoder_model_inf = Model(encoder_inputs, encoder_states)\n",
+    "\n",
+    "# Decoder model for inference\n",
+    "decoder_state_input_h = Input(shape=(latent_dim,), name=\"inference_state_h\")\n",
+    "decoder_state_input_c = Input(shape=(latent_dim,), name=\"inference_state_c\")\n",
+    "decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]\n",
+    "\n",
+    "dec_emb_inf = decoder_embedding(decoder_inputs)\n",
+    "decoder_inf_outputs, state_h_inf, state_c_inf = decoder_lstm(\n",
+    "    dec_emb_inf, initial_state=decoder_states_inputs\n",
+    ")\n",
+    "decoder_inf_states = [state_h_inf, state_c_inf]\n",
+    "decoder_inf_outputs = decoder_dense(decoder_inf_outputs)\n",
+    "\n",
+    "decoder_model_inf = Model(\n",
+    "    [decoder_inputs] + decoder_states_inputs,\n",
+    "    [decoder_inf_outputs] + decoder_inf_states\n",
+    ")\n",
+    "\n",
+    "# Create index-to-word mapping for the question tokenizer\n",
+    "index_to_word_question = {idx: word for word, idx in tokenizer_question.word_index.items()}\n",
+    "# If you used an OOV token, you may want to handle it explicitly as well.\n",
+    "\n",
+    "def generate_question(paragraph_text, max_length=50, start_token=None, end_token=None):\n",
+    "    \"\"\"\n",
+    "    Generate a question from a paragraph using the trained seq2seq model.\n",
+    "    Token-level decoding with greedy search.\n",
+    "    \"\"\"\n",
+    "    # 1) Encode the paragraph\n",
+    "    seq = tokenizer_paragraph.texts_to_sequences([paragraph_text])\n",
+    "    seq = pad_sequences(seq, maxlen=max_paragraph_len, padding='post')\n",
+    "    states_value = encoder_model_inf.predict(seq)\n",
+    "\n",
+    "    # 2) Start token\n",
+    "    target_seq = np.zeros((1, 1), dtype='int32')\n",
+    "    # If you have a <start> token, set it here,\n",
+    "    # e.g., target_seq[0, 0] = tokenizer_question.word_index[\"<start>\"]\n",
+    "\n",
+    "    decoded_words = []\n",
+    "\n",
+    "    for _ in range(max_length):\n",
+    "        output_tokens, h, c = decoder_model_inf.predict([target_seq] + states_value)\n",
+    "\n",
+    "        sampled_token_index = np.argmax(output_tokens[0, -1, :])\n",
+    "        sampled_word = index_to_word_question.get(sampled_token_index, '')\n",
+    "\n",
+    "        # Stop if we encounter an <end> token or a special index\n",
+    "        if end_token and (sampled_word == end_token):\n",
+    "            break\n",
+    "\n",
+    "        decoded_words.append(sampled_word)\n",
+    "\n",
+    "        # Next target\n",
+    "        target_seq = np.zeros((1, 1), dtype='int32')\n",
+    "        target_seq[0, 0] = sampled_token_index\n",
+    "\n",
+    "        states_value = [h, c]\n",
+    "\n",
+    "    return ' '.join(decoded_words)\n",
+    "\n",
+    "# ==========================\n",
+    "# 9) Test Inference on a Paragraph\n",
+    "# ==========================\n",
+    "test_paragraph = \"Albert Einstein was a theoretical physicist born in Germany...\"\n",
+    "generated = generate_question(test_paragraph)\n",
+    "print(\"Generated question:\", generated)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "yups 0\n",
+      "yups 1\n",
+      "yups 2\n",
+      "yups 3\n",
+      "yups 4\n",
+      "yups 5\n",
+      "yups 6\n",
+      "yups 7\n",
+      "yups 8\n",
+      "yups 9\n",
+      "yups 10\n",
+      "yups 11\n",
+      "yups 12\n",
+      "yups 13\n",
+      "yups 14\n",
+      "yups 15\n",
+      "yups 16\n",
+      "yups 17\n",
+      "yups 18\n",
+      "yups 19\n"
+     ]
+    }
+   ],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/lstm_question_generator.keras b/lstm_question_generator.keras
new file mode 100644
index 0000000..7aae8fa
Binary files /dev/null and b/lstm_question_generator.keras differ
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..f5d2376
--- /dev/null
+++ b/main.py
@@ -0,0 +1,123 @@
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+import tensorflow as tf
+import pickle
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import (
+    LSTM,
+    Embedding,
+    Dense,
+    SpatialDropout1D,
+    TimeDistributed,
+)
+from tensorflow.keras.optimizers import Adam
+
+# -----------------------
+# 1. Load dataset
+# -----------------------
+df = pd.read_csv("quiz_questions.csv")
+
+# Make sure the 'paragraph' and 'question' columns exist and are not empty
+df.dropna(subset=["paragraph", "question"], inplace=True)
+
+
+# -----------------------
+# 2. Preprocess text
+# -----------------------
+def preprocess_text(text):
+    # Simple example preprocessing
+    text = text.lower()
+    return text
+
+
+df["paragraph"] = df["paragraph"].astype(str).apply(preprocess_text)
+df["question"] = df["question"].astype(str).apply(preprocess_text)
+
+# -----------------------
+# 3. Tokenization
+# -----------------------
+# Fit on all text (paragraph + question) so the vocabulary covers words from both
+tokenizer = Tokenizer()
+tokenizer.fit_on_texts(df["paragraph"].tolist() + df["question"].tolist())
+vocab_size = len(tokenizer.word_index) + 1  # +1 because indices start at 1
+
+# Convert text to sequences
+X_sequences = tokenizer.texts_to_sequences(df["paragraph"])
+y_sequences = tokenizer.texts_to_sequences(df["question"])
+
+# Find the maximum sequence length (so padding is uniform)
+max_len_paragraph = max(len(seq) for seq in X_sequences)
+max_len_question = max(len(seq) for seq in y_sequences)
+max_length = max(max_len_paragraph, max_len_question)
+
+# Pad all sequences to the same length (=> max_length)
+X_padded = pad_sequences(X_sequences, maxlen=max_length, padding="post")
+y_padded = pad_sequences(y_sequences, maxlen=max_length, padding="post")
+
+with open("tokenizer.pkl", "wb") as f:
+    pickle.dump(tokenizer, f)
+print("Tokenizer saved to tokenizer.pkl")
+
+# -----------------------
+# 4. Prepare X, y
+# -----------------------
+# For sequence-to-sequence training with "sparse_categorical_crossentropy",
+# y should ideally have shape (num_samples, max_length, 1)
+X = np.array(X_padded)
+y = np.expand_dims(np.array(y_padded), axis=-1)
+
+print("Shape X:", X.shape)
+print("Shape y:", y.shape)  # (batch_size, max_length, 1)
+
+# -----------------------
+# 5. Split data
+# -----------------------
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=42
+)
+
+print("Train size:", X_train.shape, y_train.shape)
+print("Test size: ", X_test.shape, y_test.shape)
+
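+# NOTE: unlike the encoder-decoder in lstm.ipynb, the stacked LSTM built below
+# reads the padded paragraph and emits one question token per paragraph
+# position in a single pass, i.e. it treats question generation as
+# per-timestep sequence labeling rather than autoregressive decoding.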
+# -----------------------
+# 6. Build LSTM model
+# -----------------------
+# We stack two LSTMs, each with return_sequences=True,
+# so the final output stays a sequence: (batch_size, max_length, hidden_dim)
+model = Sequential()
+model.add(Embedding(input_dim=vocab_size, output_dim=128))
+model.add(SpatialDropout1D(0.2))
+
+model.add(LSTM(128, return_sequences=True))
+model.add(LSTM(128, return_sequences=True))
+
+# TimeDistributed(Dense) applies the Dense layer at every timestep
+model.add(TimeDistributed(Dense(vocab_size, activation="softmax")))
+
+# -----------------------
+# 7. Compile
+# -----------------------
+model.compile(
+    loss="sparse_categorical_crossentropy",
+    optimizer=Adam(learning_rate=0.001),
+    metrics=["accuracy"],
+)
+
+model.summary()
+
+# -----------------------
+# 8. Train model
+# -----------------------
+epochs = 10
+history = model.fit(
+    X_train, y_train, epochs=epochs, validation_data=(X_test, y_test), batch_size=32
+)
+
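+# NOTE: test.py pads its input with a hard-coded max_length (50); consider
+# persisting the training max_length alongside tokenizer.pkl so that inference
+# is guaranteed to pad to the same length used here.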
+# -----------------------
+# 9. Save model
+# -----------------------
+model.save("lstm_question_generator.keras")
+print("Training finished and the model has been saved.")
diff --git a/quiz_questions.csv b/quiz_questions.csv
new file mode 100644
index 0000000..e804e8c
--- /dev/null
+++ b/quiz_questions.csv
@@ -0,0 +1,11 @@
+paragraph,question_type,question,answer,options
+Albert Einstein adalah ilmuwan yang menemukan teori relativitas.,fill_in_the_blank,Siapakah ilmuwan yang menemukan teori relativitas?,Albert Einstein,-
+Isaac Newton menemukan hukum gravitasi setelah melihat apel jatuh dari pohon.,fill_in_the_blank,Siapakah ilmuwan yang menemukan hukum gravitasi?,Isaac Newton,-
+Bumi mengorbit mengelilingi Matahari dalam waktu sekitar 365 hari.,multiple_choice,Berapa lama Bumi mengorbit Matahari?,365 hari,360 hari|365 hari|366 hari|370 hari
+Proklamasi Kemerdekaan Indonesia terjadi pada tanggal 17 Agustus 1945.,fill_in_the_blank,Kapan Proklamasi Kemerdekaan Indonesia terjadi?,17 Agustus 1945,-
+Hewan yang dapat hidup di dua alam disebut amfibi.,true_false,Hewan yang dapat hidup di dua alam disebut reptil.,False,True|False
+Gunung tertinggi di dunia adalah Gunung Everest.,fill_in_the_blank,Gunung apakah yang tertinggi di dunia?,Gunung Everest,-
+Lapisan terluar dari atmosfer Bumi disebut troposfer.,multiple_choice,Apa nama lapisan terluar atmosfer Bumi?,Eksosfer,Troposfer|Stratosfer|Mesosfer|Eksosfer
+Ibu Kota Jepang adalah Tokyo.,true_false,Ibu Kota Jepang adalah Kyoto.,False,True|False
+Fotosintesis adalah proses yang dilakukan tumbuhan untuk membuat makanan sendiri.,fill_in_the_blank,Apa nama proses yang dilakukan tumbuhan untuk membuat makanannya sendiri?,Fotosintesis,-
+Dataran tertinggi di dunia adalah Dataran Tinggi Tibet.,fill_in_the_blank,Apa nama dataran tertinggi di dunia?,Dataran Tinggi Tibet,-
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..3dd66b9
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+numpy
+pandas
+matplotlib
+scikit-learn
+tensorflow
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..16f4573
--- /dev/null
+++ b/test.py
@@ -0,0 +1,68 @@
+import numpy as np
+import re
+from tensorflow.keras.models import load_model
+
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+import pickle
+
+# Suppose we already have a tokenizer, a model, and max_length:
+# tokenizer, model, max_length = ...
+# Make sure you load the model & tokenizer to match your environment.
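+# Note that generate_question below runs a single forward pass and takes the
+# argmax at every timestep, so it predicts one question token per paragraph
+# position rather than decoding autoregressively.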
+
+
+def preprocess_text(text):
+    # Simple preprocessing function (must match, or closely mirror, training)
+    text = text.lower()
+    # Make other adjustments here if needed
+    return text
+
+
+def generate_question(paragraph, tokenizer, model, max_length):
+    # 1) Preprocess the paragraph
+    paragraph = preprocess_text(paragraph)
+
+    # 2) Tokenize
+    seq = tokenizer.texts_to_sequences([paragraph])  # the result is a list of lists
+    # 3) Pad the sequence
+    padded = pad_sequences(seq, maxlen=max_length, padding="post")
+
+    # 4) Get the model's prediction => shape: (1, max_length, vocab_size)
+    prediction = model.predict(padded)  # (1, max_length, vocab_size)
+
+    # 5) Take the argmax at every time step => (1, max_length)
+    predicted_indices = np.argmax(prediction, axis=-1)[0]
+
+    # 6) Convert indices to words
+    predicted_words = []
+    for idx in predicted_indices:
+        # idx == 0 usually means the 'unknown' or 'pad' token, depending on tokenizer settings
+        if idx == 0:
+            # Safe to break here, since the rest is most likely padding
+            break
+        word = tokenizer.index_word.get(idx, "")
+        predicted_words.append(word)
+
+    # 7) Join the words into a single sentence
+    predicted_question = " ".join(predicted_words)
+
+    # We may as well append a question mark
+    if not predicted_question.endswith("?"):
+        predicted_question = predicted_question + "?"
+
+    return predicted_question
+
+
+model = load_model("lstm_question_generator.keras")
+
+
+with open("tokenizer.pkl", "rb") as f:
+    tokenizer = pickle.load(f)
+
+# Make sure max_length matches the value used during training
+max_length = 50  # or whatever value you set
+
+paragraph_input = "Albert Einstein mengembangkan teori relativitas dan membantu mengembangkan fisika kuantum."
+
+generated_q = generate_question(paragraph_input, tokenizer, model, max_length)
+print("Generated question:", generated_q)
diff --git a/tokenizer.pkl b/tokenizer.pkl
new file mode 100644
index 0000000..209819a
Binary files /dev/null and b/tokenizer.pkl differ