{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "703acaf0-e703-47ae-b4d2-56cd7236fbd4",
   "metadata": {},
   "source": [
    "# Fill-in-the-blank MCQ generator (spaCy word vectors + LSTM scaffold)\n",
    "\n",
    "Sentences are split with spaCy, one noun per sentence is blanked out, and the distractors are the nearest neighbours of that noun in the spaCy vector table.\n",
    "\n",
    "**Note:** the LSTM model is built but not trained, and it is not yet used when generating questions.\n",
    "\n",
    "Stored outputs were cleared; use *Restart Kernel and Run All* to reproduce them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c9142fcb-39a6-42cb-a38c-629ca17c5ac6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "import sys\n",
    "\n",
    "import numpy as np\n",
    "import spacy\n",
    "import tensorflow as tf\n",
    "from tensorflow.keras.layers import Dense, Dropout, Embedding, Input, LSTM\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "from tensorflow.keras.preprocessing.text import Tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "62aae7fc-b921-4439-8396-62d7fd8d25d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install the spaCy model into the kernel's own environment if it is missing.\n",
    "# This has to run before spacy.load(); loading first raises OSError [E050].\n",
    "# NOTE: a kernel restart may be needed after a fresh install.\n",
    "SPACY_MODEL = \"en_core_web_md\"  # \"en_core_web_lg\" also provides word vectors\n",
    "\n",
    "if not spacy.util.is_package(SPACY_MODEL):\n",
    "    !{sys.executable} -m spacy download {SPACY_MODEL}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc979d1c-2756-41b6-96de-6c76f2bd5f96",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: everything a reader might want to tune\n",
    "SEED = 42\n",
    "MAX_LENGTH = 20  # padded sequence length declared as the LSTM input shape\n",
    "EMBEDDING_DIM = 100\n",
    "BLANK = \"______\"\n",
    "PLACEHOLDER = \"[Distractor]\"  # used when no vector-based distractor is available\n",
    "\n",
    "# Seed every source of randomness used below so reruns give the same questions\n",
    "random.seed(SEED)\n",
    "tf.random.set_seed(SEED)\n",
    "\n",
    "# spaCy pipeline with word vectors: sentence splitting, POS tags and neighbours\n",
    "nlp = spacy.load(SPACY_MODEL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4f22536a-3967-486c-a6f7-bd677199800a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess_text(text):\n",
    "    \"\"\"Split ``text`` into sentences with the spaCy pipeline.\n",
    "\n",
    "    Returns a list of sentence strings with surrounding whitespace removed;\n",
    "    whitespace-only sentences are dropped.\n",
    "    \"\"\"\n",
    "    doc = nlp(text)\n",
    "    stripped = (sent.text.strip() for sent in doc.sents)\n",
    "    return [sentence for sentence in stripped if sentence]\n",
    "\n",
    "\n",
    "def create_training_data(sentences, tokenizer, max_length):\n",
    "    \"\"\"Convert sentences to integer sequences post-padded to ``max_length``.\"\"\"\n",
    "    sequences = tokenizer.texts_to_sequences(sentences)\n",
    "    return pad_sequences(sequences, maxlen=max_length, padding=\"post\")\n",
    "\n",
    "\n",
    "def build_lstm_model(vocab_size, max_length, embedding_dim):\n",
    "    \"\"\"Build and compile a stacked-LSTM classifier over the vocabulary.\n",
    "\n",
    "    The input shape is declared with an explicit ``Input`` layer instead of the\n",
    "    ``input_length`` argument of ``Embedding``, which Keras 3 deprecates.\n",
    "    The returned model is compiled but untrained.\n",
    "    \"\"\"\n",
    "    model = Sequential([\n",
    "        Input(shape=(max_length,)),\n",
    "        Embedding(vocab_size, embedding_dim),\n",
    "        LSTM(128, return_sequences=True),\n",
    "        Dropout(0.2),\n",
    "        LSTM(64),\n",
    "        Dense(64, activation=\"relu\"),\n",
    "        Dense(vocab_size, activation=\"softmax\"),\n",
    "    ])\n",
    "    model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"adam\", metrics=[\"accuracy\"])\n",
    "    return model\n",
    "\n",
    "\n",
    "def find_similar_words(word, num_similar=3):\n",
    "    \"\"\"Return exactly ``num_similar`` distractors for ``word``.\n",
    "\n",
    "    Neighbours are looked up in the spaCy vector table with\n",
    "    ``Vectors.most_similar``. Case variants of ``word``, duplicates and\n",
    "    non-alphabetic entries are skipped. If ``word`` has no vector, or too few\n",
    "    usable neighbours are found, the result is padded with ``PLACEHOLDER``.\n",
    "    \"\"\"\n",
    "    if num_similar <= 0:\n",
    "        return []\n",
    "    if not nlp.vocab.has_vector(word):\n",
    "        return [PLACEHOLDER] * num_similar\n",
    "\n",
    "    vectors = nlp.vocab.vectors\n",
    "    # Request extra neighbours: the closest hits are usually the word itself\n",
    "    # and its case variants, which are filtered out below.\n",
    "    n_neighbours = min(num_similar * 10 + 10, vectors.shape[0])\n",
    "    query = np.asarray([nlp.vocab.get_vector(word)])\n",
    "    keys, _, _ = vectors.most_similar(query, n=n_neighbours)\n",
    "\n",
    "    similar = []\n",
    "    seen = {word.lower()}\n",
    "    for key in keys[0]:\n",
    "        try:\n",
    "            candidate = nlp.vocab.strings[int(key)]\n",
    "        except KeyError:\n",
    "            # Key without a stored string: nothing printable to offer\n",
    "            continue\n",
    "        lowered = candidate.lower()\n",
    "        if not candidate.isalpha() or lowered in seen:\n",
    "            continue\n",
    "        seen.add(lowered)\n",
    "        similar.append(candidate)\n",
    "        if len(similar) == num_similar:\n",
    "            break\n",
    "\n",
    "    # Always hand back num_similar items so callers can index choices safely\n",
    "    return similar + [PLACEHOLDER] * (num_similar - len(similar))\n",
    "\n",
    "\n",
    "def generate_mcqs_lstm(text, tokenizer, max_length, model, num_questions=5):\n",
    "    \"\"\"Generate up to ``num_questions`` fill-in-the-blank questions from ``text``.\n",
    "\n",
    "    One noun per sentence is blanked out and mixed with three distractors from\n",
    "    ``find_similar_words``. ``tokenizer``, ``max_length`` and ``model`` are kept\n",
    "    in the signature for compatibility but are not used yet.\n",
    "\n",
    "    Returns a list of ``(question_stem, answer_choices, correct_letter)``\n",
    "    tuples, where ``correct_letter`` is \"A\" to \"D\".\n",
    "    \"\"\"\n",
    "    sentences = preprocess_text(text)\n",
    "    # Visit every sentence in random order so a sentence without nouns does not\n",
    "    # cost a question while other usable sentences remain.\n",
    "    candidates = random.sample(sentences, len(sentences))\n",
    "\n",
    "    mcqs = []\n",
    "    for sentence in candidates:\n",
    "        if len(mcqs) >= num_questions:\n",
    "            break\n",
    "        doc = nlp(sentence)\n",
    "        nouns = [token for token in doc if token.pos_ == \"NOUN\"]\n",
    "        if not nouns:\n",
    "            continue\n",
    "\n",
    "        target = random.choice(nouns)\n",
    "        answer = target.text\n",
    "        # Blank only the chosen token via its character offset; str.replace\n",
    "        # would also hit repeated words and substrings of longer words.\n",
    "        start = target.idx\n",
    "        end = start + len(answer)\n",
    "        question_stem = sentence[:start] + BLANK + sentence[end:]\n",
    "\n",
    "        answer_choices = [answer] + find_similar_words(answer, num_similar=3)\n",
    "        random.shuffle(answer_choices)\n",
    "        correct_answer = chr(ord(\"A\") + answer_choices.index(answer))\n",
    "\n",
    "        mcqs.append((question_stem, answer_choices, correct_answer))\n",
    "\n",
    "    return mcqs"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a5cd5ef2-c48f-4bd0-bf42-12865cc77149",
   "metadata": {},
   "source": [
    "## Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "307af48e-a684-4e85-b2df-e963c43ad07c",
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"\"\"Deep learning is a subset of machine learning that uses neural networks. \n",
    "LSTMs are useful for processing sequential data like text. \n",
    "Natural language processing involves techniques like tokenization and named entity recognition.\"\"\"\n",
    "\n",
    "# Fit the tokenizer on the example sentences\n",
    "tokenizer = Tokenizer()\n",
    "tokenizer.fit_on_texts(preprocess_text(text))\n",
    "vocab_size = len(tokenizer.word_index) + 1\n",
    "max_length = MAX_LENGTH\n",
    "\n",
    "# Build the LSTM model. It is NOT trained here: training requires a large dataset.\n",
    "model = build_lstm_model(vocab_size, max_length, embedding_dim=EMBEDDING_DIM)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bec7b11b-7f3a-4a9e-a568-2e382caaa004",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate and print the questions\n",
    "mcqs = generate_mcqs_lstm(text, tokenizer, max_length, model, num_questions=3)\n",
    "for number, (stem, choices, answer) in enumerate(mcqs, 1):\n",
    "    options = \"  \".join(f\"{chr(ord('A') + k)}) {choice}\" for k, choice in enumerate(choices))\n",
    "    print(f\"Q{number}: {stem}\")\n",
    "    print(f\"   {options}\")\n",
    "    print(f\"Correct Answer: {answer}\\n\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "myenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}