TIF_E41211115_lstm-quiz-gen.../online mcqs.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c9142fcb-39a6-42cb-a38c-629ca17c5ac6",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025-03-17 14:50:32.718599: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
      "2025-03-17 14:50:32.718943: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
      "2025-03-17 14:50:32.721006: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
      "2025-03-17 14:50:32.727572: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
      "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
      "E0000 00:00:1742197832.738194   22019 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
      "E0000 00:00:1742197832.741303   22019 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
      "2025-03-17 14:50:32.752422: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
     ]
    },
    {
     "ename": "OSError",
     "evalue": "[E050] Can't find model 'en_core_web_md'. It doesn't seem to be a Python package or a valid path to a data directory.",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mOSError\u001b[0m                                   Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[1], line 11\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mrandom\u001b[39;00m\n\u001b[1;32m     10\u001b[0m \u001b[38;5;66;03m# Load spaCy model with word vectors\u001b[39;00m\n\u001b[0;32m---> 11\u001b[0m nlp \u001b[38;5;241m=\u001b[39m \u001b[43mspacy\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43men_core_web_md\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m  \u001b[38;5;66;03m# Use \"en_core_web_md\" or \"en_core_web_lg\" for word vectors\u001b[39;00m\n\u001b[1;32m     13\u001b[0m \u001b[38;5;66;03m# Function to preprocess text\u001b[39;00m\n\u001b[1;32m     14\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mpreprocess_text\u001b[39m(text):\n",
      "File \u001b[0;32m/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/spacy/__init__.py:51\u001b[0m, in \u001b[0;36mload\u001b[0;34m(name, vocab, disable, enable, exclude, config)\u001b[0m\n\u001b[1;32m     27\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mload\u001b[39m(\n\u001b[1;32m     28\u001b[0m     name: Union[\u001b[38;5;28mstr\u001b[39m, Path],\n\u001b[1;32m     29\u001b[0m     \u001b[38;5;241m*\u001b[39m,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     34\u001b[0m     config: Union[Dict[\u001b[38;5;28mstr\u001b[39m, Any], Config] \u001b[38;5;241m=\u001b[39m util\u001b[38;5;241m.\u001b[39mSimpleFrozenDict(),\n\u001b[1;32m     35\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Language:\n\u001b[1;32m     36\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"Load a spaCy model from an installed package or a local path.\u001b[39;00m\n\u001b[1;32m     37\u001b[0m \n\u001b[1;32m     38\u001b[0m \u001b[38;5;124;03m    name (str): Package name or model path.\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     49\u001b[0m \u001b[38;5;124;03m    RETURNS (Language): The loaded nlp object.\u001b[39;00m\n\u001b[1;32m     50\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[0;32m---> 51\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mutil\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_model\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m     52\u001b[0m \u001b[43m        \u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     53\u001b[0m \u001b[43m        \u001b[49m\u001b[43mvocab\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvocab\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     54\u001b[0m \u001b[43m        \u001b[49m\u001b[43mdisable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdisable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     55\u001b[0m \u001b[43m        
\u001b[49m\u001b[43menable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43menable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     56\u001b[0m \u001b[43m        \u001b[49m\u001b[43mexclude\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexclude\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     57\u001b[0m \u001b[43m        \u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     58\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/spacy/util.py:472\u001b[0m, in \u001b[0;36mload_model\u001b[0;34m(name, vocab, disable, enable, exclude, config)\u001b[0m\n\u001b[1;32m    470\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m OLD_MODEL_SHORTCUTS:\n\u001b[1;32m    471\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mIOError\u001b[39;00m(Errors\u001b[38;5;241m.\u001b[39mE941\u001b[38;5;241m.\u001b[39mformat(name\u001b[38;5;241m=\u001b[39mname, full\u001b[38;5;241m=\u001b[39mOLD_MODEL_SHORTCUTS[name]))  \u001b[38;5;66;03m# type: ignore[index]\u001b[39;00m\n\u001b[0;32m--> 472\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mIOError\u001b[39;00m(Errors\u001b[38;5;241m.\u001b[39mE050\u001b[38;5;241m.\u001b[39mformat(name\u001b[38;5;241m=\u001b[39mname))\n",
      "\u001b[0;31mOSError\u001b[0m: [E050] Can't find model 'en_core_web_md'. It doesn't seem to be a Python package or a valid path to a data directory."
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "from tensorflow.keras.preprocessing.text import Tokenizer\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout\n",
    "import spacy\n",
    "import random\n",
    "\n",
    "# Load spaCy model with word vectors\n",
    "nlp = spacy.load(\"en_core_web_md\")  # Use \"en_core_web_md\" or \"en_core_web_lg\" for word vectors\n",
    "\n",
    "# Split raw text into sentence strings with spaCy\n",
    "def preprocess_text(text):\n",
    "    \"\"\"Return the text of each sentence spaCy finds in ``text``.\n",
    "\n",
    "    The module-level ``nlp`` pipeline parses the input; the sentences it\n",
    "    exposes through ``doc.sents`` are returned as plain strings, in\n",
    "    document order.\n",
    "    \"\"\"\n",
    "    collected = []\n",
    "    for span in nlp(text).sents:\n",
    "        collected.append(span.text)\n",
    "    return collected\n",
    "\n",
    "# Encode sentences as fixed-length integer sequences for the LSTM\n",
    "def create_training_data(sentences, tokenizer, max_length):\n",
    "    \"\"\"Convert ``sentences`` to integer sequences padded to ``max_length``.\n",
    "\n",
    "    Args:\n",
    "        sentences: Iterable of sentence strings.\n",
    "        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.\n",
    "        max_length: Value passed to ``pad_sequences`` as ``maxlen``.\n",
    "\n",
    "    Returns:\n",
    "        The result of ``pad_sequences`` called with ``padding='post'``.\n",
    "    \"\"\"\n",
    "    return pad_sequences(\n",
    "        tokenizer.texts_to_sequences(sentences),\n",
    "        maxlen=max_length,\n",
    "        padding='post',\n",
    "    )\n",
    "\n",
    "# LSTM Model for learning sentence structures\n",
    "def build_lstm_model(vocab_size, max_length, embedding_dim):\n",
    "    \"\"\"Build and compile a stacked-LSTM classifier over the vocabulary.\n",
    "\n",
    "    Args:\n",
    "        vocab_size: Size of the embedding table and of the softmax output.\n",
    "        max_length: Number of token ids in each input sequence.\n",
    "        embedding_dim: Width of the learned word embeddings.\n",
    "\n",
    "    Returns:\n",
    "        A compiled ``Sequential`` model (Adam optimizer, sparse categorical\n",
    "        cross-entropy loss, accuracy metric).\n",
    "    \"\"\"\n",
    "    model = Sequential([\n",
    "        # Declare the input shape with an explicit Input rather than the\n",
    "        # Embedding `input_length` argument, which Keras 3 deprecates.\n",
    "        tf.keras.Input(shape=(max_length,)),\n",
    "        Embedding(vocab_size, embedding_dim),\n",
    "        LSTM(128, return_sequences=True),  # keep per-step outputs for the next LSTM\n",
    "        Dropout(0.2),\n",
    "        LSTM(64),\n",
    "        Dense(64, activation='relu'),\n",
    "        Dense(vocab_size, activation='softmax')\n",
    "    ])\n",
    "    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
    "    return model\n",
    "\n",
    "# Function to find similar words using spaCy\n",
    "def find_similar_words(word, num_similar=3):\n",
    "    \"\"\"Return exactly ``num_similar`` distractor words for ``word``.\n",
    "\n",
    "    Candidates are the alphabetic, vector-bearing lexemes yielded by\n",
    "    iterating ``nlp.vocab``, ranked by vector similarity to ``word``.\n",
    "    Case variants of ``word`` itself and case-insensitive duplicates are\n",
    "    skipped so the answer never appears twice among the options. When too\n",
    "    few candidates exist, the list is padded with ``\"[Distractor]\"`` so\n",
    "    callers can always rely on its length.\n",
    "\n",
    "    Args:\n",
    "        word: The correct answer to find distractors for.\n",
    "        num_similar: Number of distractors to return; values <= 0 give ``[]``.\n",
    "\n",
    "    Returns:\n",
    "        A list of ``num_similar`` strings, most similar first.\n",
    "    \"\"\"\n",
    "    placeholder = \"[Distractor]\"\n",
    "    if num_similar <= 0:\n",
    "        return []\n",
    "\n",
    "    word_token = nlp.vocab[word] if word in nlp.vocab else None\n",
    "    if not word_token or not word_token.has_vector:\n",
    "        return [placeholder] * num_similar  # Return placeholders if no vector is found\n",
    "\n",
    "    # Compute similarity with other words in vocab.\n",
    "    # NOTE(review): iterating nlp.vocab only visits lexemes already stored in\n",
    "    # the vocab, which may be a small pool - confirm this is the intended set.\n",
    "    target = word.lower()\n",
    "    similarities = []\n",
    "    for token in nlp.vocab:\n",
    "        if token.is_alpha and token.has_vector and token.text.lower() != target:\n",
    "            similarities.append((token.text, word_token.similarity(token)))\n",
    "\n",
    "    # Highest similarity first\n",
    "    similarities.sort(key=lambda x: x[1], reverse=True)\n",
    "\n",
    "    # Keep the best-ranked spelling of each distinct word\n",
    "    distractors = []\n",
    "    seen = set()\n",
    "    for candidate, _ in similarities:\n",
    "        key = candidate.lower()\n",
    "        if key in seen:\n",
    "            continue\n",
    "        seen.add(key)\n",
    "        distractors.append(candidate)\n",
    "        if len(distractors) == num_similar:\n",
    "            break\n",
    "\n",
    "    # Pad so the caller always receives num_similar entries\n",
    "    distractors.extend([placeholder] * (num_similar - len(distractors)))\n",
    "    return distractors\n",
    "\n",
    "# Function to generate MCQs using LSTM and spaCy word embeddings\n",
    "def generate_mcqs_lstm(text, tokenizer, max_length, model, num_questions=5):\n",
    "    \"\"\"Create fill-in-the-blank multiple-choice questions from ``text``.\n",
    "\n",
    "    Up to ``num_questions`` sentences are sampled at random. In each one a\n",
    "    randomly chosen noun is blanked out and offered as the correct option\n",
    "    alongside distractors from ``find_similar_words``. Sentences without a\n",
    "    noun are skipped, so fewer questions than requested may be returned.\n",
    "\n",
    "    Args:\n",
    "        text: Source passage.\n",
    "        tokenizer: Accepted for interface compatibility; not used here.\n",
    "        max_length: Accepted for interface compatibility; not used here.\n",
    "        model: Accepted for interface compatibility; not used here.\n",
    "        num_questions: Maximum number of questions to generate.\n",
    "\n",
    "    Returns:\n",
    "        A list of ``(question_stem, answer_choices, correct_letter)`` tuples,\n",
    "        where ``correct_letter`` is ``\"A\"``, ``\"B\"``, ... for the position of\n",
    "        the answer in the shuffled ``answer_choices``.\n",
    "    \"\"\"\n",
    "    sentences = preprocess_text(text)\n",
    "    # Clamp so a negative request yields no questions instead of a ValueError\n",
    "    sample_size = max(0, min(num_questions, len(sentences)))\n",
    "    selected_sentences = random.sample(sentences, sample_size)\n",
    "\n",
    "    mcqs = []\n",
    "    for sentence in selected_sentences:\n",
    "        doc = nlp(sentence)\n",
    "        noun_tokens = [token for token in doc if token.pos_ == \"NOUN\"]\n",
    "        if not noun_tokens:\n",
    "            continue\n",
    "\n",
    "        target = random.choice(noun_tokens)\n",
    "        subject = target.text\n",
    "\n",
    "        # Blank only the chosen token via its character offset. str.replace\n",
    "        # would also hit repeated words and substrings of longer words.\n",
    "        start = target.idx\n",
    "        end = start + len(subject)\n",
    "        question_stem = sentence[:start] + \"______\" + sentence[end:]\n",
    "\n",
    "        # Generate similar words using spaCy\n",
    "        similar_words = find_similar_words(subject, num_similar=3)\n",
    "\n",
    "        answer_choices = [subject] + similar_words\n",
    "        random.shuffle(answer_choices)\n",
    "        correct_answer = chr(65 + answer_choices.index(subject))\n",
    "\n",
    "        mcqs.append((question_stem, answer_choices, correct_answer))\n",
    "\n",
    "    return mcqs\n",
    "\n",
    "# Example usage\n",
    "text = \"\"\"Deep learning is a subset of machine learning that uses neural networks. LSTMs are useful for processing sequential data like text. \n",
    "Natural language processing involves techniques like tokenization and named entity recognition.\"\"\"\n",
    "\n",
    "# Tokenizer setup: the vocabulary is fitted on the example's own sentences\n",
    "tokenizer = Tokenizer()\n",
    "tokenizer.fit_on_texts(preprocess_text(text))\n",
    "vocab_size = len(tokenizer.word_index) + 1  # +1 because word_index ids start at 1\n",
    "max_length = 20\n",
    "\n",
    "# Build the LSTM model. It is only constructed here, never trained\n",
    "# (training requires large datasets).\n",
    "model = build_lstm_model(vocab_size, max_length, embedding_dim=100)\n",
    "\n",
    "# Generate MCQs\n",
    "mcqs = generate_mcqs_lstm(text, tokenizer, max_length, model, num_questions=3)\n",
    "for i, (q, choices, ans) in enumerate(mcqs, 1):\n",
    "    print(f\"Q{i}: {q}\")\n",
    "    # Label however many options exist (A, B, C, ...) instead of indexing a\n",
    "    # fixed four, which raises IndexError when fewer were produced.\n",
    "    options = \"  \".join(f\"{chr(65 + j)}) {choice}\" for j, choice in enumerate(choices))\n",
    "    print(f\" {options}\")\n",
    "    print(f\"Correct Answer: {ans}\\n\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "62aae7fc-b921-4439-8396-62d7fd8d25d5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting en-core-web-md==3.8.0\n",
      "  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)\n",
      "     ---------------------------------------- 0.0/33.5 MB ? eta -:--:--\n",
      "      --------------------------------------- 0.5/33.5 MB 4.2 MB/s eta 0:00:08\n",
      "     -- ------------------------------------- 1.8/33.5 MB 5.6 MB/s eta 0:00:06\n",
      "     --- ------------------------------------ 3.1/33.5 MB 5.8 MB/s eta 0:00:06\n",
      "     ----- ---------------------------------- 4.2/33.5 MB 5.9 MB/s eta 0:00:05\n",
      "     ------ --------------------------------- 5.2/33.5 MB 5.5 MB/s eta 0:00:06\n",
      "     ------- -------------------------------- 6.6/33.5 MB 5.6 MB/s eta 0:00:05\n",
      "     --------- ------------------------------ 7.6/33.5 MB 5.6 MB/s eta 0:00:05\n",
      "     ---------- ----------------------------- 8.4/33.5 MB 5.4 MB/s eta 0:00:05\n",
      "     ----------- ---------------------------- 9.7/33.5 MB 5.5 MB/s eta 0:00:05\n",
      "     ------------ --------------------------- 10.7/33.5 MB 5.5 MB/s eta 0:00:05\n",
      "     -------------- ------------------------- 12.1/33.5 MB 5.5 MB/s eta 0:00:04\n",
      "     --------------- ------------------------ 13.1/33.5 MB 5.5 MB/s eta 0:00:04\n",
      "     ---------------- ----------------------- 14.2/33.5 MB 5.5 MB/s eta 0:00:04\n",
      "     ------------------ --------------------- 15.2/33.5 MB 5.4 MB/s eta 0:00:04\n",
      "     ------------------- -------------------- 16.3/33.5 MB 5.4 MB/s eta 0:00:04\n",
      "     -------------------- ------------------- 17.6/33.5 MB 5.4 MB/s eta 0:00:03\n",
      "     ---------------------- ----------------- 18.9/33.5 MB 5.5 MB/s eta 0:00:03\n",
      "     ------------------------ --------------- 20.2/33.5 MB 5.5 MB/s eta 0:00:03\n",
      "     ------------------------- -------------- 21.8/33.5 MB 5.6 MB/s eta 0:00:03\n",
      "     --------------------------- ------------ 23.1/33.5 MB 5.6 MB/s eta 0:00:02\n",
      "     ---------------------------- ----------- 24.1/33.5 MB 5.7 MB/s eta 0:00:02\n",
      "     ------------------------------ --------- 25.4/33.5 MB 5.7 MB/s eta 0:00:02\n",
      "     ------------------------------- -------- 26.5/33.5 MB 5.6 MB/s eta 0:00:02\n",
      "     -------------------------------- ------- 27.5/33.5 MB 5.6 MB/s eta 0:00:02\n",
      "     ---------------------------------- ----- 28.8/33.5 MB 5.6 MB/s eta 0:00:01\n",
      "     ----------------------------------- ---- 29.9/33.5 MB 5.6 MB/s eta 0:00:01\n",
      "     ------------------------------------ --- 30.9/33.5 MB 5.6 MB/s eta 0:00:01\n",
      "     -------------------------------------- - 32.0/33.5 MB 5.6 MB/s eta 0:00:01\n",
      "     ---------------------------------------  33.0/33.5 MB 5.5 MB/s eta 0:00:01\n",
      "     ---------------------------------------  33.3/33.5 MB 5.5 MB/s eta 0:00:01\n",
      "     ---------------------------------------- 33.5/33.5 MB 5.4 MB/s eta 0:00:00\n",
      "Installing collected packages: en-core-web-md\n",
      "Successfully installed en-core-web-md-3.8.0\n",
      "\u001b[38;5;2m[+] Download and installation successful\u001b[0m\n",
      "You can now load the package via spacy.load('en_core_web_md')\n"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "\n",
    "# Install the model with the kernel's own interpreter so it lands in the same\n",
    "# environment that spacy.load() reads from; a bare `python` on PATH may be a\n",
    "# different installation. Run this before the cell that loads the model.\n",
    "!{sys.executable} -m spacy download en_core_web_md\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "703acaf0-e703-47ae-b4d2-56cd7236fbd4",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc979d1c-2756-41b6-96de-6c76f2bd5f96",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4f22536a-3967-486c-a6f7-bd677199800a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "307af48e-a684-4e85-b2df-e963c43ad07c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bec7b11b-7f3a-4a9e-a568-2e382caaa004",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a5cd5ef2-c48f-4bd0-bf42-12865cc77149",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "myenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}