TIF_E41211115_lstm-quiz-gen.../online mcqs.ipynb

266 lines
16 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "c9142fcb-39a6-42cb-a38c-629ca17c5ac6",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-03-17 14:50:32.718599: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
"2025-03-17 14:50:32.718943: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
"2025-03-17 14:50:32.721006: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
"2025-03-17 14:50:32.727572: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
"E0000 00:00:1742197832.738194 22019 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"E0000 00:00:1742197832.741303 22019 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"2025-03-17 14:50:32.752422: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
]
},
{
"ename": "OSError",
"evalue": "[E050] Can't find model 'en_core_web_md'. It doesn't seem to be a Python package or a valid path to a data directory.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[1], line 11\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mrandom\u001b[39;00m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;66;03m# Load spaCy model with word vectors\u001b[39;00m\n\u001b[0;32m---> 11\u001b[0m nlp \u001b[38;5;241m=\u001b[39m \u001b[43mspacy\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43men_core_web_md\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# Use \"en_core_web_md\" or \"en_core_web_lg\" for word vectors\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# Function to preprocess text\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mpreprocess_text\u001b[39m(text):\n",
"File \u001b[0;32m/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/spacy/__init__.py:51\u001b[0m, in \u001b[0;36mload\u001b[0;34m(name, vocab, disable, enable, exclude, config)\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mload\u001b[39m(\n\u001b[1;32m 28\u001b[0m name: Union[\u001b[38;5;28mstr\u001b[39m, Path],\n\u001b[1;32m 29\u001b[0m \u001b[38;5;241m*\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 34\u001b[0m config: Union[Dict[\u001b[38;5;28mstr\u001b[39m, Any], Config] \u001b[38;5;241m=\u001b[39m util\u001b[38;5;241m.\u001b[39mSimpleFrozenDict(),\n\u001b[1;32m 35\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Language:\n\u001b[1;32m 36\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Load a spaCy model from an installed package or a local path.\u001b[39;00m\n\u001b[1;32m 37\u001b[0m \n\u001b[1;32m 38\u001b[0m \u001b[38;5;124;03m name (str): Package name or model path.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;124;03m RETURNS (Language): The loaded nlp object.\u001b[39;00m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 51\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mutil\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_model\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 52\u001b[0m \u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 53\u001b[0m \u001b[43m \u001b[49m\u001b[43mvocab\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvocab\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 54\u001b[0m \u001b[43m \u001b[49m\u001b[43mdisable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdisable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[43m \u001b[49m\u001b[43menable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43menable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 56\u001b[0m \u001b[43m \u001b[49m\u001b[43mexclude\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexclude\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 57\u001b[0m \u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 58\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/spacy/util.py:472\u001b[0m, in \u001b[0;36mload_model\u001b[0;34m(name, vocab, disable, enable, exclude, config)\u001b[0m\n\u001b[1;32m 470\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m OLD_MODEL_SHORTCUTS:\n\u001b[1;32m 471\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mIOError\u001b[39;00m(Errors\u001b[38;5;241m.\u001b[39mE941\u001b[38;5;241m.\u001b[39mformat(name\u001b[38;5;241m=\u001b[39mname, full\u001b[38;5;241m=\u001b[39mOLD_MODEL_SHORTCUTS[name])) \u001b[38;5;66;03m# type: ignore[index]\u001b[39;00m\n\u001b[0;32m--> 472\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mIOError\u001b[39;00m(Errors\u001b[38;5;241m.\u001b[39mE050\u001b[38;5;241m.\u001b[39mformat(name\u001b[38;5;241m=\u001b[39mname))\n",
"\u001b[0;31mOSError\u001b[0m: [E050] Can't find model 'en_core_web_md'. It doesn't seem to be a Python package or a valid path to a data directory."
]
}
],
"source": [
"import numpy as np\n",
"import tensorflow as tf\n",
"from tensorflow.keras.preprocessing.text import Tokenizer\n",
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
"from tensorflow.keras.models import Sequential\n",
"from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout\n",
"import spacy\n",
"import random\n",
"\n",
"# Load spaCy model with word vectors\n",
"nlp = spacy.load(\"en_core_web_md\") # Use \"en_core_web_md\" or \"en_core_web_lg\" for word vectors\n",
"\n",
"# Function to preprocess text\n",
"def preprocess_text(text):\n",
" doc = nlp(text)\n",
" sentences = [sent.text for sent in doc.sents]\n",
" return sentences\n",
"\n",
"# Function to create training data for LSTM\n",
"def create_training_data(sentences, tokenizer, max_length):\n",
" sequences = tokenizer.texts_to_sequences(sentences)\n",
" padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')\n",
" return padded_sequences\n",
"\n",
"# LSTM Model for learning sentence structures\n",
"def build_lstm_model(vocab_size, max_length, embedding_dim):\n",
" model = Sequential([\n",
" Embedding(vocab_size, embedding_dim, input_length=max_length),\n",
" LSTM(128, return_sequences=True),\n",
" Dropout(0.2),\n",
" LSTM(64),\n",
" Dense(64, activation='relu'),\n",
" Dense(vocab_size, activation='softmax')\n",
" ])\n",
" model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
" return model\n",
"\n",
"# Function to find similar words using spaCy\n",
"def find_similar_words(word, num_similar=3):\n",
" word_token = nlp.vocab[word] if word in nlp.vocab else None\n",
" if not word_token or not word_token.has_vector:\n",
" return [\"[Distractor]\"] * num_similar # Return placeholders if no vector is found\n",
"\n",
" # Compute similarity with other words in vocab\n",
" similarities = []\n",
" for token in nlp.vocab:\n",
" if token.is_alpha and token.has_vector and token != word_token:\n",
" similarity = word_token.similarity(token)\n",
" similarities.append((token.text, similarity))\n",
" \n",
" # Sort and return top similar words\n",
" similarities.sort(key=lambda x: x[1], reverse=True)\n",
" return [word for word, _ in similarities[:num_similar]]\n",
"\n",
"# Function to generate MCQs using LSTM and spaCy word embeddings\n",
"def generate_mcqs_lstm(text, tokenizer, max_length, model, num_questions=5):\n",
" sentences = preprocess_text(text)\n",
" selected_sentences = random.sample(sentences, min(num_questions, len(sentences)))\n",
"\n",
" mcqs = []\n",
" for sentence in selected_sentences:\n",
" doc = nlp(sentence)\n",
" nouns = [token.text for token in doc if token.pos_ == \"NOUN\"]\n",
" if len(nouns) < 1:\n",
" continue\n",
"\n",
" subject = random.choice(nouns)\n",
" question_stem = sentence.replace(subject, \"______\")\n",
"\n",
" # Generate similar words using spaCy\n",
" similar_words = find_similar_words(subject, num_similar=3)\n",
"\n",
" answer_choices = [subject] + similar_words\n",
" random.shuffle(answer_choices)\n",
" correct_answer = chr(65 + answer_choices.index(subject))\n",
"\n",
" mcqs.append((question_stem, answer_choices, correct_answer))\n",
"\n",
" return mcqs\n",
"\n",
"# Example usage\n",
"text = \"\"\"Deep learning is a subset of machine learning that uses neural networks. LSTMs are useful for processing sequential data like text. \n",
"Natural language processing involves techniques like tokenization and named entity recognition.\"\"\"\n",
"\n",
"# Tokenizer setup\n",
"tokenizer = Tokenizer()\n",
"tokenizer.fit_on_texts(preprocess_text(text))\n",
"vocab_size = len(tokenizer.word_index) + 1\n",
"max_length = 20\n",
"\n",
"# Train LSTM model (Note: Training requires large datasets)\n",
"model = build_lstm_model(vocab_size, max_length, embedding_dim=100)\n",
"\n",
"# Generate MCQs\n",
"mcqs = generate_mcqs_lstm(text, tokenizer, max_length, model, num_questions=3)\n",
"for i, (q, choices, ans) in enumerate(mcqs, 1):\n",
" print(f\"Q{i}: {q}\")\n",
" print(f\" A) {choices[0]} B) {choices[1]} C) {choices[2]} D) {choices[3]}\")\n",
" print(f\"Correct Answer: {ans}\\n\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "62aae7fc-b921-4439-8396-62d7fd8d25d5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting en-core-web-md==3.8.0\n",
" Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)\n",
" ---------------------------------------- 0.0/33.5 MB ? eta -:--:--\n",
" --------------------------------------- 0.5/33.5 MB 4.2 MB/s eta 0:00:08\n",
" -- ------------------------------------- 1.8/33.5 MB 5.6 MB/s eta 0:00:06\n",
" --- ------------------------------------ 3.1/33.5 MB 5.8 MB/s eta 0:00:06\n",
" ----- ---------------------------------- 4.2/33.5 MB 5.9 MB/s eta 0:00:05\n",
" ------ --------------------------------- 5.2/33.5 MB 5.5 MB/s eta 0:00:06\n",
" ------- -------------------------------- 6.6/33.5 MB 5.6 MB/s eta 0:00:05\n",
" --------- ------------------------------ 7.6/33.5 MB 5.6 MB/s eta 0:00:05\n",
" ---------- ----------------------------- 8.4/33.5 MB 5.4 MB/s eta 0:00:05\n",
" ----------- ---------------------------- 9.7/33.5 MB 5.5 MB/s eta 0:00:05\n",
" ------------ --------------------------- 10.7/33.5 MB 5.5 MB/s eta 0:00:05\n",
" -------------- ------------------------- 12.1/33.5 MB 5.5 MB/s eta 0:00:04\n",
" --------------- ------------------------ 13.1/33.5 MB 5.5 MB/s eta 0:00:04\n",
" ---------------- ----------------------- 14.2/33.5 MB 5.5 MB/s eta 0:00:04\n",
" ------------------ --------------------- 15.2/33.5 MB 5.4 MB/s eta 0:00:04\n",
" ------------------- -------------------- 16.3/33.5 MB 5.4 MB/s eta 0:00:04\n",
" -------------------- ------------------- 17.6/33.5 MB 5.4 MB/s eta 0:00:03\n",
" ---------------------- ----------------- 18.9/33.5 MB 5.5 MB/s eta 0:00:03\n",
" ------------------------ --------------- 20.2/33.5 MB 5.5 MB/s eta 0:00:03\n",
" ------------------------- -------------- 21.8/33.5 MB 5.6 MB/s eta 0:00:03\n",
" --------------------------- ------------ 23.1/33.5 MB 5.6 MB/s eta 0:00:02\n",
" ---------------------------- ----------- 24.1/33.5 MB 5.7 MB/s eta 0:00:02\n",
" ------------------------------ --------- 25.4/33.5 MB 5.7 MB/s eta 0:00:02\n",
" ------------------------------- -------- 26.5/33.5 MB 5.6 MB/s eta 0:00:02\n",
" -------------------------------- ------- 27.5/33.5 MB 5.6 MB/s eta 0:00:02\n",
" ---------------------------------- ----- 28.8/33.5 MB 5.6 MB/s eta 0:00:01\n",
" ----------------------------------- ---- 29.9/33.5 MB 5.6 MB/s eta 0:00:01\n",
" ------------------------------------ --- 30.9/33.5 MB 5.6 MB/s eta 0:00:01\n",
" -------------------------------------- - 32.0/33.5 MB 5.6 MB/s eta 0:00:01\n",
" --------------------------------------- 33.0/33.5 MB 5.5 MB/s eta 0:00:01\n",
" --------------------------------------- 33.3/33.5 MB 5.5 MB/s eta 0:00:01\n",
" ---------------------------------------- 33.5/33.5 MB 5.4 MB/s eta 0:00:00\n",
"Installing collected packages: en-core-web-md\n",
"Successfully installed en-core-web-md-3.8.0\n",
"\u001b[38;5;2m[+] Download and installation successful\u001b[0m\n",
"You can now load the package via spacy.load('en_core_web_md')\n"
]
}
],
"source": [
"!python -m spacy download en_core_web_md\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "703acaf0-e703-47ae-b4d2-56cd7236fbd4",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "cc979d1c-2756-41b6-96de-6c76f2bd5f96",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "4f22536a-3967-486c-a6f7-bd677199800a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "307af48e-a684-4e85-b2df-e963c43ad07c",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "bec7b11b-7f3a-4a9e-a568-2e382caaa004",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "a5cd5ef2-c48f-4bd0-bf42-12865cc77149",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "myenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}