diff --git a/answer_padded.npy b/answer_padded.npy deleted file mode 100644 index 58cbb3c..0000000 Binary files a/answer_padded.npy and /dev/null differ diff --git a/broken_dataset.txt b/broken_dataset.txt deleted file mode 100644 index c1a6c52..0000000 --- a/broken_dataset.txt +++ /dev/null @@ -1 +0,0 @@ -Katak mengalami metamorfosis dari telur, berudu, katak muda, hingga katak dewasa.,multiple_choice,Tahapan apakah yang termasuk dalam metamorfosis katak?,Berudu,Telur|Berudu|Pupa|Imago \ No newline at end of file diff --git a/context_padded.npy b/context_padded.npy deleted file mode 100644 index 316842c..0000000 Binary files a/context_padded.npy and /dev/null differ diff --git a/dataset/training_dataset.json b/dataset/training_dataset.json index b0e5b2a..a1a9a90 100644 --- a/dataset/training_dataset.json +++ b/dataset/training_dataset.json @@ -1,7 +1,7 @@ [ { "context": "Albert Einstein adalah fisikawan teoretis kelahiran Jerman yang mengembangkan teori relativitas, salah satu dari dua pilar fisika modern. Karyanya juga dikenal karena pengaruhnya terhadap filosofi ilmu pengetahuan. Ia menerima Penghargaan Nobel dalam Fisika pada tahun 1921 atas jasanya dalam fisika teoretis.", - "qa_pairs": [ + "question_posibility": [ { "type": "fill_in_the_blank", "question": "Siapa yang mengembangkan teori relativitas?", @@ -22,7 +22,7 @@ }, { "context": "Samudra Pasifik adalah yang terbesar dan terdalam di antara divisi samudra di Bumi. Samudra ini membentang dari Samudra Arktik di utara hingga Samudra Selatan di selatan dan berbatasan dengan Asia dan Australia di barat serta Amerika di timur.", - "qa_pairs": [ + "question_posibility": [ { "type": "fill_in_the_blank", "question": "Samudra _______ adalah yang terbesar dan terdalam.", @@ -43,7 +43,7 @@ }, { "context": "Proklamasi Kemerdekaan Indonesia dibacakan pada tanggal 17 Agustus 1945 oleh Soekarno dan Mohammad Hatta di Jakarta. 
Peristiwa ini menandai lahirnya negara Indonesia yang merdeka dari penjajahan.", - "qa_pairs": [ + "question_posibility": [ { "type": "fill_in_the_blank", "question": "Proklamasi Kemerdekaan Indonesia terjadi pada tanggal _______.", @@ -69,7 +69,7 @@ }, { "context": "Hukum Newton adalah tiga hukum fisika yang menjadi dasar mekanika klasik. Hukum pertama menyatakan bahwa suatu benda akan tetap diam atau bergerak lurus beraturan kecuali ada gaya luar yang bekerja padanya. Hukum kedua menyatakan bahwa percepatan suatu benda berbanding lurus dengan gaya yang bekerja padanya dan berbanding terbalik dengan massanya. Hukum ketiga menyatakan bahwa setiap aksi memiliki reaksi yang sama besar tetapi berlawanan arah.", - "qa_pairs": [ + "question_posibility": [ { "type": "fill_in_the_blank", "question": "Hukum Newton terdiri dari _______ hukum.", @@ -95,7 +95,7 @@ }, { "context": "Budi Utomo adalah organisasi pemuda yang didirikan pada 20 Mei 1908 oleh dr. Wahidin Sudirohusodo dan para mahasiswa STOVIA. Organisasi ini bertujuan untuk meningkatkan pendidikan dan kesejahteraan rakyat Indonesia serta menjadi tonggak awal kebangkitan nasional.", - "qa_pairs": [ + "question_posibility": [ { "type": "fill_in_the_blank", "question": "Budi Utomo didirikan pada tanggal _______.", @@ -121,7 +121,7 @@ }, { "context": "Ki Hajar Dewantara adalah pelopor pendidikan di Indonesia dan pendiri Taman Siswa. Ia dikenal dengan semboyannya 'Ing Ngarsa Sung Tuladha, Ing Madya Mangun Karsa, Tut Wuri Handayani', yang menekankan peran guru dalam pendidikan.", - "qa_pairs": [ + "question_posibility": [ { "type": "fill_in_the_blank", "question": "Ki Hajar Dewantara mendirikan _______.", @@ -147,7 +147,7 @@ }, { "context": "Teori evolusi dikembangkan oleh Charles Darwin dan dijelaskan dalam bukunya 'On the Origin of Species' yang diterbitkan pada tahun 1859. 
Teori ini menyatakan bahwa spesies berevolusi melalui seleksi alam, di mana individu dengan karakteristik yang lebih baik memiliki peluang lebih tinggi untuk bertahan hidup dan berkembang biak.", - "qa_pairs": [ + "question_posibility": [ { "type": "fill_in_the_blank", "question": "Teori evolusi dikembangkan oleh?", @@ -173,7 +173,7 @@ }, { "context": "BPUPKI (Badan Penyelidik Usaha-Usaha Persiapan Kemerdekaan Indonesia) dibentuk oleh pemerintah Jepang pada 29 April 1945 sebagai bagian dari janji Jepang untuk memberikan kemerdekaan kepada Indonesia. Pembentukan BPUPKI terjadi pada masa Perang Dunia II, ketika Jepang mulai mengalami kekalahan dari Sekutu dan ingin mendapatkan dukungan dari rakyat Indonesia.", - "qa_pairs": [ + "question_posibility": [ { "type": "fill_in_the_blank", "question": "Apa kepanjangan dari BPUPKI?", @@ -194,7 +194,7 @@ }, { "context": "Kerajaan Majapahit adalah kerajaan besar Hindu-Buddha yang berpusat di Jawa Timur, berdiri sekitar tahun 1293 hingga 1500 M. Majapahit mencapai puncak kejayaan di bawah pemerintahan Raja Hayam Wuruk dengan patihnya, Gajah Mada.", - "qa_pairs": [ + "question_posibility": [ { "type": "fill_in_the_blank", "question": "Kerajaan Majapahit mencapai puncak kejayaannya di bawah raja _______.", @@ -215,7 +215,7 @@ }, { "context": "Kerajaan Sriwijaya adalah kerajaan maritim yang berpusat di Sumatera Selatan dari abad ke-7 hingga abad ke-13. Kerajaan ini menjadi pusat perdagangan dan penyebaran agama Buddha terbesar di Asia Tenggara.", - "qa_pairs": [ + "question_posibility": [ { "type": "fill_in_the_blank", "question": "Kerajaan Sriwijaya adalah pusat penyebaran agama _______ terbesar di Asia Tenggara.", @@ -236,7 +236,7 @@ }, { "context": "Candi Borobudur adalah candi Buddha terbesar di dunia yang terletak di Magelang, Jawa Tengah. 
Dibangun pada abad ke-8 oleh Wangsa Sailendra, candi ini merupakan simbol puncak kebudayaan Buddha di Jawa.", - "qa_pairs": [ + "question_posibility": [ { "type": "fill_in_the_blank", "question": "Candi Borobudur dibangun oleh wangsa _______.", @@ -257,7 +257,7 @@ }, { "context": "VOC (Vereenigde Oostindische Compagnie) adalah perusahaan dagang Belanda yang memonopoli perdagangan rempah-rempah di Nusantara dari abad ke-17 hingga awal abad ke-18.", - "qa_pairs": [ + "question_posibility": [ { "type": "fill_in_the_blank", "question": "VOC adalah singkatan dari _______.", @@ -278,7 +278,7 @@ }, { "context": "Pertempuran Surabaya terjadi pada 10 November 1945 antara pasukan Indonesia melawan pasukan sekutu Inggris yang berusaha mengambil alih kota setelah Jepang menyerah dalam Perang Dunia II. Pertempuran ini dikenang sebagai Hari Pahlawan di Indonesia.", - "qa_pairs": [ + "question_posibility": [ { "type": "fill_in_the_blank", "question": "Pertempuran Surabaya terjadi pada tanggal _______.", @@ -299,7 +299,7 @@ }, { "context": "Kerajaan Demak adalah kerajaan Islam pertama di Jawa yang berdiri pada akhir abad ke-15. Kerajaan ini terkenal karena penyebaran agama Islam di Jawa melalui Wali Songo.", - "qa_pairs": [ + "question_posibility": [ { "type": "true_false", "question": "Kerajaan Demak adalah kerajaan Islam pertama di Jawa.", @@ -314,7 +314,7 @@ }, { "context": "Sumpah Pemuda terjadi pada 28 Oktober 1928, di mana pemuda Indonesia berikrar untuk bersatu dalam satu tanah air, satu bangsa, dan satu bahasa Indonesia.", - "qa_pairs": [ + "question_posibility": [ { "type": "true_false", "question": "Sumpah Pemuda menyatakan persatuan dalam satu agama.", @@ -329,7 +329,7 @@ }, { "context": "Gajah Mada adalah seorang patih terkenal dari Kerajaan Majapahit yang berhasil menyatukan sebagian besar wilayah Nusantara melalui politik ekspansinya. 
Ia terkenal dengan sumpahnya yang disebut Sumpah Palapa.", - "qa_pairs": [ + "question_posibility": [ { "type": "true_false", "question": "Gajah Mada berasal dari Kerajaan Majapahit.", @@ -344,7 +344,7 @@ }, { "context": "Kerajaan Aceh mencapai puncak kejayaannya di bawah pemerintahan Sultan Iskandar Muda pada abad ke-17. Aceh menjadi pusat perdagangan dan kebudayaan Islam di wilayah Nusantara.", - "qa_pairs": [ + "question_posibility": [ { "type": "multiple_choice", "question": "Siapakah sultan yang membawa Kerajaan Aceh ke puncak kejayaan?", @@ -360,7 +360,7 @@ }, { "context": "Perang Diponegoro berlangsung dari tahun 1825 hingga 1830. Perang ini dipimpin oleh Pangeran Diponegoro melawan pemerintah kolonial Belanda di Jawa Tengah.", - "qa_pairs": [ + "question_posibility": [ { "type": "true_false", "question": "Perang Diponegoro berlangsung selama lima tahun.", @@ -375,7 +375,7 @@ }, { "context": "Candi Prambanan adalah candi Hindu terbesar di Indonesia yang terletak di perbatasan antara Yogyakarta dan Jawa Tengah. Dibangun pada abad ke-9, candi ini merupakan peninggalan Kerajaan Mataram Kuno.", - "qa_pairs": [ + "question_posibility": [ { "type": "fill_in_the_blank", "question": "Candi Prambanan dibangun pada abad ke-_______.", @@ -385,7 +385,7 @@ }, { "context": "Pangeran Antasari adalah pahlawan nasional Indonesia yang memimpin perlawanan rakyat Kalimantan Selatan terhadap penjajahan Belanda pada abad ke-19.", - "qa_pairs": [ + "question_posibility": [ { "type": "true_false", "question": "Pangeran Antasari berasal dari Kalimantan Selatan.", @@ -401,7 +401,7 @@ }, { "context": "Perjanjian Linggarjati adalah perjanjian yang ditandatangani pada 25 Maret 1947 antara Indonesia dengan Belanda. 
Perjanjian ini mengakui secara de facto Republik Indonesia yang mencakup Jawa, Sumatra, dan Madura.", - "qa_pairs": [ + "question_posibility": [ { "type": "true_false", "question": "Perjanjian Linggarjati ditandatangani pada tahun 1947.", @@ -411,7 +411,7 @@ }, { "context": "Raden Adjeng Kartini adalah tokoh emansipasi wanita Indonesia yang lahir pada 21 April 1879. Ia dikenal melalui surat-suratnya yang memperjuangkan hak perempuan untuk memperoleh pendidikan dan kesetaraan.", - "qa_pairs": [ + "question_posibility": [ { "type": "multiple_choice", "question": "Tanggal berapa diperingati sebagai Hari Kartini di Indonesia?", @@ -422,7 +422,7 @@ }, { "context": "Kerajaan Kutai merupakan kerajaan Hindu tertua di Indonesia yang berdiri sekitar abad ke-4 di Kalimantan Timur. Bukti keberadaan kerajaan ini ditemukan dalam prasasti Yupa.", - "qa_pairs": [ + "question_posibility": [ { "type": "fill_in_the_blank", "question": "Kerajaan Kutai adalah kerajaan Hindu tertua di Indonesia yang ditemukan melalui prasasti _______.", @@ -437,7 +437,7 @@ }, { "context": "Raden Ajeng Kartini merupakan tokoh penting dalam sejarah perjuangan emansipasi wanita Indonesia. Ia lahir di Jepara dan dikenal melalui bukunya berjudul 'Habis Gelap Terbitlah Terang'.", - "qa_pairs": [ + "question_posibility": [ { "type": "true_false", "question": "Kartini dikenal sebagai pejuang emansipasi wanita.", @@ -447,7 +447,7 @@ }, { "context": "Ekspedisi Palembang dilakukan oleh VOC pada tahun 1659 untuk menguasai perdagangan lada di Sumatera Selatan. 
Ekspedisi ini berakhir dengan kemenangan VOC dan penegakan monopoli lada di daerah tersebut.", - "qa_pairs": [ + "question_posibility": [ { "type": "fill_in_the_blank", "question": "Ekspedisi Palembang terjadi pada tahun _______.", @@ -462,7 +462,7 @@ }, { "context": "Fotosintesis adalah proses pembuatan makanan oleh tumbuhan hijau menggunakan cahaya matahari, air, dan karbon dioksida yang menghasilkan oksigen sebagai produk sampingan.", - "qa_pairs": [ + "question_posibility": [ { "type": "true_false", "question": "Fotosintesis terjadi pada siang hari.", @@ -477,7 +477,7 @@ }, { "context": "Sel adalah unit terkecil kehidupan. Sel memiliki berbagai komponen, seperti membran sel, sitoplasma, dan inti sel (nukleus).", - "qa_pairs": [ + "question_posibility": [ { "type": "multiple_choice", "question": "Apa fungsi utama nukleus dalam sel?", @@ -493,7 +493,7 @@ }, { "context": "DNA (asam deoksiribonukleat) adalah molekul yang menyimpan informasi genetik pada hampir semua makhluk hidup.", - "qa_pairs": [ + "question_posibility": [ { "type": "true_false", "question": "DNA ditemukan di dalam nukleus sel.", @@ -503,7 +503,7 @@ }, { "context": "Enzim adalah protein yang berfungsi sebagai katalisator yang mempercepat reaksi kimia dalam tubuh tanpa ikut bereaksi secara permanen.", - "qa_pairs": [ + "question_posibility": [ { "type": "fill_in_the_blank", "question": "Enzim berfungsi sebagai _______ yang mempercepat reaksi kimia.", @@ -518,7 +518,7 @@ }, { "context": "Proses respirasi pada manusia terjadi di dalam mitokondria sel, yang menggunakan oksigen untuk menghasilkan energi dalam bentuk ATP.", - "qa_pairs": [ + "question_posibility": [ { "type": "true_false", "question": "Respirasi manusia terjadi tanpa menggunakan oksigen.", @@ -534,7 +534,7 @@ }, { "context": "Kloroplas adalah organel yang ditemukan dalam sel tumbuhan yang berfungsi sebagai tempat berlangsungnya fotosintesis.", - "qa_pairs": [ + "question_posibility": [ { "type": "true_false", "question": "Kloroplas 
hanya ditemukan pada sel tumbuhan.", @@ -544,7 +544,7 @@ }, { "context": "Mutasi adalah perubahan yang terjadi pada materi genetik yang dapat menyebabkan variasi genetik dalam suatu populasi.", - "qa_pairs": [ + "question_posibility": [ { "type": "multiple_choice", "question": "Mutasi pada materi genetik dapat menyebabkan apa?", @@ -555,7 +555,7 @@ }, { "context": "Jantung adalah organ vital dalam tubuh manusia yang berfungsi memompa darah ke seluruh tubuh melalui sistem peredaran darah.", - "qa_pairs": [ + "question_posibility": [ { "type": "true_false", "question": "Jantung memiliki empat ruang.", @@ -565,7 +565,7 @@ }, { "context": "Hormon insulin dihasilkan oleh pankreas dan berfungsi mengatur kadar gula dalam darah.", - "qa_pairs": [ + "question_posibility": [ { "type": "fill_in_the_blank", "question": "Hormon insulin dihasilkan oleh organ _______.", @@ -580,7 +580,7 @@ }, { "context": "Tulang adalah jaringan tubuh manusia yang berfungsi memberi bentuk tubuh, melindungi organ dalam, dan tempat pembentukan sel darah.", - "qa_pairs": [ + "question_posibility": [ { "type": "multiple_choice", "question": "Apa fungsi utama tulang pada manusia?", @@ -601,7 +601,7 @@ }, { "context": "Ginjal adalah organ yang berfungsi menyaring limbah dari darah dan membentuk urin.", - "qa_pairs": [ + "question_posibility": [ { "type": "true_false", "question": "Ginjal berfungsi menghasilkan hormon insulin.", @@ -611,7 +611,7 @@ }, { "context": "Paru-paru merupakan organ pernapasan yang bertanggung jawab untuk pertukaran oksigen dan karbon dioksida.", - "qa_pairs": [ + "question_posibility": [ { "type": "fill_in_the_blank", "question": "Paru-paru berfungsi untuk pertukaran gas yaitu oksigen dan _______.", @@ -621,7 +621,7 @@ }, { "context": "Sistem saraf manusia terdiri dari sistem saraf pusat dan sistem saraf perifer yang berfungsi mengatur koordinasi tubuh dan merespon rangsangan.", - "qa_pairs": [ + "question_posibility": [ { "type": "true_false", "question": "Sistem saraf manusia 
mencakup otak dan sumsum tulang belakang.", @@ -636,7 +636,7 @@ }, { "context": "Kelenjar tiroid adalah organ yang menghasilkan hormon tiroksin, yang penting untuk mengatur metabolisme tubuh.", - "qa_pairs": [ + "question_posibility": [ { "type": "true_false", "question": "Kelenjar tiroid terletak di leher.", @@ -646,7 +646,7 @@ }, { "context": "Eritrosit adalah sel darah merah yang berfungsi membawa oksigen ke seluruh jaringan tubuh.", - "qa_pairs": [ + "question_posibility": [ { "type": "multiple_choice", "question": "Apa fungsi utama eritrosit dalam tubuh manusia?", @@ -662,7 +662,7 @@ }, { "context": "Limfosit merupakan jenis sel darah putih yang berperan penting dalam sistem kekebalan tubuh.", - "qa_pairs": [ + "question_posibility": [ { "type": "true_false", "question": "Limfosit berperan dalam sistem kekebalan tubuh.", @@ -672,7 +672,7 @@ }, { "context": "Protein adalah makromolekul yang terdiri dari rantai asam amino dan berfungsi dalam pertumbuhan, perbaikan jaringan, serta produksi enzim.", - "qa_pairs": [ + "question_posibility": [ { "type": "multiple_choice", "question": "Protein terdiri dari rantai molekul apa?", @@ -688,7 +688,7 @@ }, { "context": "VOC (Vereenigde Oostindische Compagnie) adalah perusahaan dagang Belanda yang didirikan pada tahun 1602 dan merupakan salah satu perusahaan multinasional pertama di dunia. 
VOC memainkan peran penting dalam perdagangan rempah-rempah di Nusantara dan berkontribusi besar terhadap pembentukan sejarah kolonial di Indonesia.", - "qa_pairs": [ + "question_posibility": [ { "type": "multiple_choice", "question": "Apa kepanjangan dari VOC?", @@ -714,7 +714,7 @@ }, { "context": "VOC memiliki hak istimewa dari pemerintah Belanda, termasuk hak untuk mendirikan benteng, mengadakan perjanjian dengan penguasa setempat, dan memiliki angkatan perang sendiri.", - "qa_pairs": [ + "question_posibility": [ { "type": "multiple_choice", "question": "Apa salah satu hak istimewa VOC?", @@ -740,7 +740,7 @@ }, { "context": "VOC mengalami kebangkrutan pada akhir abad ke-18 akibat korupsi, biaya perang yang tinggi, dan persaingan dengan negara lain dalam perdagangan internasional.", - "qa_pairs": [ + "question_posibility": [ { "type": "multiple_choice", "question": "Apa salah satu penyebab kebangkrutan VOC?", @@ -766,7 +766,7 @@ }, { "context": "Pada abad ke-17, VOC menguasai perdagangan rempah-rempah di kepulauan Nusantara dan menerapkan sistem monopoli yang ketat terhadap produk seperti cengkeh, pala, dan lada.", - "qa_pairs": [ + "question_posibility": [ { "type": "multiple_choice", "question": "Produk apa yang dimonopoli oleh VOC?", @@ -787,7 +787,7 @@ }, { "context": "VOC memiliki kebijakan yang dikenal sebagai 'Pelayaran Hongi', di mana armada kapal perang mereka digunakan untuk menghancurkan kebun rempah-rempah yang tidak berada di bawah kendali mereka guna mempertahankan harga tetap tinggi.", - "qa_pairs": [ + "question_posibility": [ { "type": "fill_in_the_blank", "question": "Kebijakan VOC yang bertujuan untuk mempertahankan harga rempah-rempah disebut _______.", @@ -797,7 +797,7 @@ }, { "context": "Pada tahun 1619, Jan Pieterszoon Coen menaklukkan Jayakarta dan menggantinya dengan nama Batavia, yang menjadi pusat kekuasaan VOC di Nusantara.", - "qa_pairs": [ + "question_posibility": [ { "type": "true_false", "question": "Batavia didirikan oleh VOC pada 
tahun 1619 setelah menaklukkan Jayakarta.", @@ -807,7 +807,7 @@ }, { "context": "Selain berdagang, VOC juga memiliki peran dalam politik di Nusantara, dengan sering kali campur tangan dalam urusan kerajaan lokal untuk memastikan kepentingan ekonomi mereka tetap terjaga.", - "qa_pairs": [ + "question_posibility": [ { "type": "multiple_choice", "question": "Bagaimana VOC mempertahankan kepentingan ekonominya di Nusantara?", @@ -820,5 +820,44 @@ "answer": "Menjalin aliansi dan intervensi politik" } ] + }, + { + "context": "Pada uraian berikut, kalian akan mempelajari ruang lingkup biologi, memahami objek dan permasalahan biologi pada berbagai tingkat organisasi kehidupan, serta peranannya dalam kehidupan. Kalian juga akan mempelajari metode ilmiah dalam biologi dan bagaimana bersikap ilmiah.", + "question_posibility": [] + }, + { + "context": "Sebagai ilmu pengetahuan alam, biologi menghasilkan hukum-hukum yang bersifat universal. Artinya, dilakukan di mana saja, oleh siapa saja, serta kapan saja, secara umum akan mendapatkan hasil yang sama. Dengan istilah lain, dapat dikatakan bahwa biologi memberikan hasil yang bersifat objektif. Hasil temuan tersebut tidak dipengaruhi oleh subjektivitas pelaku eksperimen. Biologi memberikan hasil yang benar secara ilmiah.", + "question_posibility": [] + }, + { + "context": "Dalam mempelajari dan mengembangkan ilmu biologi digunakan metode ilmiah. 
Oleh karena itu, para biolog harus mampu melakukan kerja ilmiah dalam menyelesaikan masalah atau mencari jawaban permasalahan-permasalahan yang dihadapi dalam penelitiannya.", + "question_posibility": [] + }, + { + "context": "Tahapan dalam metode ilmiah adalah menemukan permasalahan, mengajukan hipotesis, melakukan percobaan untuk menguji hipotesis, menarik kesimpulan, dan membuat laporan percobaan.", + "question_posibility": [ + { + "type": "multiple_choice", + "question": "Apa saja tahapan dalam metode ilmiah?", + "options": [ + "Mengamati gejala, membuat laporan, menyusun teori, dan melakukan eksperimen", + "Menemukan permasalahan, mengajukan hipotesis, melakukan percobaan, menarik kesimpulan, dan membuat laporan percobaan", + "Menulis laporan, melakukan wawancara, mengumpulkan opini, dan menguji teori", + "Menentukan kesimpulan terlebih dahulu, kemudian mencari data yang mendukung, lalu membuat laporan" + ], + "answer": "Menemukan permasalahan, mengajukan hipotesis, melakukan percobaan, menarik kesimpulan, dan membuat laporan percobaan" + } + ] + }, + { + "context": "Tubuh tumbuhan terdiri atas berbagai organ, yaitu akar, batang, dan daun. Pada yang dewasa akan terbentuk bunga serta biji. Sebagai organ fotosintesis, daun disusun oleh berbagai jaringan, yaitu jaringan epidermis, jaringan tiang, jaringan bunga karang, jaringan pengangkut, dan jaringan epidermis. Masing-masing jaringan tersebut disusun oleh sel-sel. 
Jaringan tiang pada daun misalnya, disusun oleh kumpulan sel yang berbentuk seperti tiang.", + "question_posibility": [ + { + "type": "multiple_choice", + "question": "Tubuh tumbuhan terdiri dari berbagai organ, kecuali", + "options": ["akar", "batang", "daun", "buah"], + "answer": "buah" + } + ] } ] diff --git a/lstm.ipynb b/lstm.ipynb deleted file mode 100644 index faf2f70..0000000 --- a/lstm.ipynb +++ /dev/null @@ -1,447 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-02-05 01:57:25.675154: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)\n" - ] - }, - { - "data": { - "text/html": [ - "
Model: \"functional\"\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1mModel: \"functional\"\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
-       "┃ Layer (type)         Output Shape          Param #  Connected to      ┃\n",
-       "┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
-       "│ encoder_inputs      │ (None, None)      │          0 │ -                 │\n",
-       "│ (InputLayer)        │                   │            │                   │\n",
-       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
-       "│ decoder_inputs      │ (None, None)      │          0 │ -                 │\n",
-       "│ (InputLayer)        │                   │            │                   │\n",
-       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
-       "│ embedding           │ (None, None, 128) │      1,280 │ encoder_inputs[0… │\n",
-       "│ (Embedding)         │                   │            │                   │\n",
-       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
-       "│ not_equal           │ (None, None)      │          0 │ encoder_inputs[0… │\n",
-       "│ (NotEqual)          │                   │            │                   │\n",
-       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
-       "│ decoder_embedding   │ (None, None, 128) │      1,024 │ decoder_inputs[0… │\n",
-       "│ (Embedding)         │                   │            │                   │\n",
-       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
-       "│ encoder_lstm (LSTM) │ [(None, 256),     │    394,240 │ embedding[0][0],  │\n",
-       "│                     │ (None, 256),      │            │ not_equal[0][0]   │\n",
-       "│                     │ (None, 256)]      │            │                   │\n",
-       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
-       "│ decoder_lstm (LSTM) │ [(None, None,     │    394,240 │ decoder_embeddin… │\n",
-       "│                     │ 256), (None,      │            │ encoder_lstm[0][ │\n",
-       "│                     │ 256), (None,      │            │ encoder_lstm[0][ │\n",
-       "│                     │ 256)]             │            │                   │\n",
-       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
-       "│ decoder_dense       │ (None, None, 8)   │      2,056 │ decoder_lstm[0][ │\n",
-       "│ (Dense)             │                   │            │                   │\n",
-       "└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n",
-       "
\n" - ], - "text/plain": [ - "┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n", - "┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mConnected to \u001b[0m\u001b[1m \u001b[0m┃\n", - "┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n", - "│ encoder_inputs │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n", - "│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n", - "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", - "│ decoder_inputs │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n", - "│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n", - "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", - "│ embedding │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m1,280\u001b[0m │ encoder_inputs[\u001b[38;5;34m0\u001b[0m… │\n", - "│ (\u001b[38;5;33mEmbedding\u001b[0m) │ │ │ │\n", - "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", - "│ not_equal │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ encoder_inputs[\u001b[38;5;34m0\u001b[0m… │\n", - "│ (\u001b[38;5;33mNotEqual\u001b[0m) │ │ │ │\n", - "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", - "│ decoder_embedding │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m1,024\u001b[0m │ decoder_inputs[\u001b[38;5;34m0\u001b[0m… │\n", - "│ (\u001b[38;5;33mEmbedding\u001b[0m) │ │ │ │\n", - "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", - "│ encoder_lstm (\u001b[38;5;33mLSTM\u001b[0m) │ 
[(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m), │ \u001b[38;5;34m394,240\u001b[0m │ embedding[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m], │\n", - "│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m), │ │ not_equal[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n", - "│ │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m)] │ │ │\n", - "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", - "│ decoder_lstm (\u001b[38;5;33mLSTM\u001b[0m) │ [(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, │ \u001b[38;5;34m394,240\u001b[0m │ decoder_embeddin… │\n", - "│ │ \u001b[38;5;34m256\u001b[0m), (\u001b[38;5;45mNone\u001b[0m, │ │ encoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n", - "│ │ \u001b[38;5;34m256\u001b[0m), (\u001b[38;5;45mNone\u001b[0m, │ │ encoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n", - "│ │ \u001b[38;5;34m256\u001b[0m)] │ │ │\n", - "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n", - "│ decoder_dense │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m8\u001b[0m) │ \u001b[38;5;34m2,056\u001b[0m │ decoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n", - "│ (\u001b[38;5;33mDense\u001b[0m) │ │ │ │\n", - "└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
 Total params: 792,840 (3.02 MB)\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m792,840\u001b[0m (3.02 MB)\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
 Trainable params: 792,840 (3.02 MB)\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m792,840\u001b[0m (3.02 MB)\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
 Non-trainable params: 0 (0.00 B)\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "None\n", - "Epoch 1/10\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-02-05 01:57:27.530017: E tensorflow/core/util/util.cc:131] oneDNN supports DT_BOOL only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.\n", - "2025-02-05 01:57:27.593630: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence\n", - "\t [[{{node IteratorGetNext}}]]\n", - "/mnt/disc1/code/lstm-quiz/.venv/lib64/python3.10/site-packages/keras/src/trainers/epoch_iterator.py:151: UserWarning: Your input ran out of data; interrupting training. Make sure that your dataset or generator can generate at least `steps_per_epoch * epochs` batches. You may need to use the `.repeat()` function when building your dataset.\n", - " self._interrupted_warning()\n" - ] - }, - { - "ename": "ValueError", - "evalue": "math domain error", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[6], line 118\u001b[0m\n\u001b[1;32m 113\u001b[0m target_val \u001b[38;5;241m=\u001b[39m decoder_target_data[split_index:]\n\u001b[1;32m 115\u001b[0m \u001b[38;5;66;03m# ==========================\u001b[39;00m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;66;03m# 6) Fit the Model\u001b[39;00m\n\u001b[1;32m 117\u001b[0m \u001b[38;5;66;03m# ==========================\u001b[39;00m\n\u001b[0;32m--> 118\u001b[0m history \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 119\u001b[0m 
\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43mencoder_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecoder_train\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 120\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_train\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 121\u001b[0m \u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m32\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 122\u001b[0m \u001b[43m \u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 123\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalidation_data\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43mencoder_val\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecoder_val\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtarget_val\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 124\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m 126\u001b[0m \u001b[38;5;66;03m# The accuracy reported is \"sparse_categorical_accuracy\" at the token level.\u001b[39;00m\n\u001b[1;32m 127\u001b[0m \n\u001b[1;32m 128\u001b[0m \u001b[38;5;66;03m# ==========================\u001b[39;00m\n\u001b[1;32m 129\u001b[0m \u001b[38;5;66;03m# 7) Evaluate the Model\u001b[39;00m\n\u001b[1;32m 130\u001b[0m \u001b[38;5;66;03m# ==========================\u001b[39;00m\n\u001b[1;32m 131\u001b[0m \u001b[38;5;66;03m# If you want a quick evaluation on the validation set:\u001b[39;00m\n\u001b[1;32m 132\u001b[0m val_loss, val_accuracy \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mevaluate([encoder_val, decoder_val], target_val)\n", - "File \u001b[0;32m/mnt/disc1/code/lstm-quiz/.venv/lib64/python3.10/site-packages/keras/src/utils/traceback_utils.py:122\u001b[0m, in \u001b[0;36mfilter_traceback..error_handler\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 119\u001b[0m 
filtered_tb \u001b[38;5;241m=\u001b[39m _process_traceback_frames(e\u001b[38;5;241m.\u001b[39m__traceback__)\n\u001b[1;32m 120\u001b[0m \u001b[38;5;66;03m# To get the full stack trace, call:\u001b[39;00m\n\u001b[1;32m 121\u001b[0m \u001b[38;5;66;03m# `keras.config.disable_traceback_filtering()`\u001b[39;00m\n\u001b[0;32m--> 122\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\u001b[38;5;241m.\u001b[39mwith_traceback(filtered_tb) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 123\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m filtered_tb\n", - "File \u001b[0;32m/mnt/disc1/code/lstm-quiz/.venv/lib64/python3.10/site-packages/keras/src/utils/progbar.py:119\u001b[0m, in \u001b[0;36mProgbar.update\u001b[0;34m(self, current, values, finalize)\u001b[0m\n\u001b[1;32m 116\u001b[0m message \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtarget \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 119\u001b[0m numdigits \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mint\u001b[39m(\u001b[43mmath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlog10\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget\u001b[49m\u001b[43m)\u001b[49m) \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 120\u001b[0m bar \u001b[38;5;241m=\u001b[39m (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(numdigits) \u001b[38;5;241m+\u001b[39m 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124md/\u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m%\u001b[39m (current, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtarget)\n\u001b[1;32m 121\u001b[0m bar \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\x1b\u001b[39;00m\u001b[38;5;124m[1m\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mbar\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\x1b\u001b[39;00m\u001b[38;5;124m[0m \u001b[39m\u001b[38;5;124m\"\u001b[39m\n", - "\u001b[0;31mValueError\u001b[0m: math domain error" - ] - } - ], - "source": [ - "# ==========================\n", - "# 1) Install/Import Dependencies\n", - "# ==========================\n", - "# If you are in a brand new environment, uncomment the following line:\n", - "# %pip install tensorflow pandas\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "import tensorflow as tf\n", - "from tensorflow.keras.preprocessing.text import Tokenizer\n", - "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", - "from tensorflow.keras.layers import Input, LSTM, Embedding, Dense\n", - "from tensorflow.keras.models import Model\n", - "\n", - "# ==========================\n", - "# 2) Load Dataset (CSV)\n", - "# ==========================\n", - "# Adjust the file path to your CSV file\n", - "df = pd.read_csv(\"quiz_questions.csv\")\n", - "\n", - "# Extract the paragraphs and questions\n", - "paragraphs = df['paragraph'].astype(str).tolist()\n", - "questions = df['question'].astype(str).tolist()\n", - "\n", - "# (Optional) For demonstration, let's ignore question_type, answer, distractors in this example\n", - "# but you can incorporate them as extra signals if you wish.\n", - "\n", - "# ==========================\n", - "# 3) Tokenize Text\n", - "# ==========================\n", - "# Create two tokenizers: one for paragraphs, one for questions\n", - "num_words = 10000 # Maximum 
vocabulary size\n", - "\n", - "tokenizer_paragraph = Tokenizer(num_words=num_words, oov_token=\"\")\n", - "tokenizer_paragraph.fit_on_texts(paragraphs)\n", - "paragraph_sequences = tokenizer_paragraph.texts_to_sequences(paragraphs)\n", - "\n", - "tokenizer_question = Tokenizer(num_words=num_words, oov_token=\"\")\n", - "tokenizer_question.fit_on_texts(questions)\n", - "question_sequences = tokenizer_question.texts_to_sequences(questions)\n", - "\n", - "# Get max lengths (for padding)\n", - "max_paragraph_len = max(len(seq) for seq in paragraph_sequences)\n", - "max_question_len = max(len(seq) for seq in question_sequences)\n", - "\n", - "# Pad sequences\n", - "encoder_input_data = pad_sequences(paragraph_sequences, maxlen=max_paragraph_len, padding='post')\n", - "# For decoder data, we usually do teacher forcing:\n", - "# We'll keep one version as input, one version shifted as the target\n", - "decoder_input_data_full = pad_sequences(question_sequences, maxlen=max_question_len, padding='post')\n", - "\n", - "# We create decoder_target_data by shifting to the left by 1 token\n", - "decoder_target_data = np.copy(decoder_input_data_full[:, 1:])\n", - "decoder_input_data = np.copy(decoder_input_data_full[:, :-1])\n", - "\n", - "# Expand target dimension for sparse_categorical_crossentropy\n", - "decoder_target_data = np.expand_dims(decoder_target_data, -1)\n", - "\n", - "# Calculate vocab sizes\n", - "vocab_size_paragraph = min(len(tokenizer_paragraph.word_index) + 1, num_words)\n", - "vocab_size_question = min(len(tokenizer_question.word_index) + 1, num_words)\n", - "\n", - "# ==========================\n", - "# 4) Build Seq2Seq Model\n", - "# ==========================\n", - "embedding_dim = 128\n", - "latent_dim = 256 # LSTM hidden dimension\n", - "\n", - "# ----- Encoder -----\n", - "encoder_inputs = Input(shape=(None,), name=\"encoder_inputs\")\n", - "encoder_embedding = Embedding(input_dim=vocab_size_paragraph,\n", - " output_dim=embedding_dim,\n", - " 
mask_zero=True)(encoder_inputs)\n", - "\n", - "encoder_lstm = LSTM(latent_dim, return_state=True, name=\"encoder_lstm\")\n", - "_, state_h, state_c = encoder_lstm(encoder_embedding)\n", - "\n", - "encoder_states = [state_h, state_c]\n", - "\n", - "# ----- Decoder -----\n", - "decoder_inputs = Input(shape=(None,), name=\"decoder_inputs\")\n", - "decoder_embedding = Embedding(input_dim=vocab_size_question,\n", - " output_dim=embedding_dim,\n", - " mask_zero=True,\n", - " name=\"decoder_embedding\")(decoder_inputs)\n", - "\n", - "decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name=\"decoder_lstm\")\n", - "decoder_outputs, _, _ = decoder_lstm(decoder_embedding,\n", - " initial_state=encoder_states)\n", - "decoder_dense = Dense(vocab_size_question, activation='softmax', name=\"decoder_dense\")\n", - "decoder_outputs = decoder_dense(decoder_outputs)\n", - "\n", - "# Combine into a training model\n", - "model = Model([encoder_inputs, decoder_inputs], decoder_outputs)\n", - "model.compile(optimizer='adam',\n", - " loss='sparse_categorical_crossentropy',\n", - " metrics=['sparse_categorical_accuracy'])\n", - "\n", - "print(model.summary())\n", - "\n", - "# ==========================\n", - "# 5) Train/Test Split (Optional)\n", - "# ==========================\n", - "# For simplicity, let's do a quick train/validation split\n", - "# Adjust split size or do a separate test set for production usage.\n", - "split_index = int(0.8 * len(encoder_input_data))\n", - "encoder_train = encoder_input_data[:split_index]\n", - "decoder_train = decoder_input_data[:split_index]\n", - "target_train = decoder_target_data[:split_index]\n", - "\n", - "encoder_val = encoder_input_data[split_index:]\n", - "decoder_val = decoder_input_data[split_index:]\n", - "target_val = decoder_target_data[split_index:]\n", - "\n", - "# ==========================\n", - "# 6) Fit the Model\n", - "# ==========================\n", - "history = model.fit(\n", - " [encoder_train, 
decoder_train],\n", - " target_train,\n", - " batch_size=32,\n", - " epochs=10,\n", - " validation_data=([encoder_val, decoder_val], target_val)\n", - ")\n", - "\n", - "# The accuracy reported is \"sparse_categorical_accuracy\" at the token level.\n", - "\n", - "# ==========================\n", - "# 7) Evaluate the Model\n", - "# ==========================\n", - "# If you want a quick evaluation on the validation set:\n", - "val_loss, val_accuracy = model.evaluate([encoder_val, decoder_val], target_val)\n", - "print(f\"Validation Loss: {val_loss:.4f}\")\n", - "print(f\"Validation Accuracy (token-level): {val_accuracy:.4f}\")\n", - "\n", - "# ==========================\n", - "# 8) Build Inference Models\n", - "# ==========================\n", - "# Encoder model for inference\n", - "encoder_model_inf = Model(encoder_inputs, encoder_states)\n", - "\n", - "# Decoder model for inference\n", - "decoder_state_input_h = Input(shape=(latent_dim,), name=\"inference_state_h\")\n", - "decoder_state_input_c = Input(shape=(latent_dim,), name=\"inference_state_c\")\n", - "decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]\n", - "\n", - "dec_emb_inf = decoder_embedding(decoder_inputs)\n", - "decoder_inf_outputs, state_h_inf, state_c_inf = decoder_lstm(\n", - " dec_emb_inf, initial_state=decoder_states_inputs\n", - ")\n", - "decoder_inf_states = [state_h_inf, state_c_inf]\n", - "decoder_inf_outputs = decoder_dense(decoder_inf_outputs)\n", - "\n", - "decoder_model_inf = Model(\n", - " [decoder_inputs] + decoder_states_inputs,\n", - " [decoder_inf_outputs] + decoder_inf_states\n", - ")\n", - "\n", - "# Create index-to-word mapping for the question tokenizer\n", - "index_to_word_question = {idx: word for word, idx in tokenizer_question.word_index.items()}\n", - "# If you used an OOV token, might want to handle that as well.\n", - "\n", - "def generate_question(paragraph_text, max_length=50, start_token=None, end_token=None):\n", - " \"\"\"\n", - " Generate a 
question from a paragraph using the trained seq2seq model.\n", - " Token-level decoding with greedy search.\n", - " \"\"\"\n", - " # 1) Encode the paragraph\n", - " seq = tokenizer_paragraph.texts_to_sequences([paragraph_text])\n", - " seq = pad_sequences(seq, maxlen=max_paragraph_len, padding='post')\n", - " states_value = encoder_model_inf.predict(seq)\n", - "\n", - " # 2) Start token\n", - " target_seq = np.zeros((1, 1), dtype='int32')\n", - " # If you have a token, set it here\n", - " # e.g., target_seq[0, 0] = tokenizer_question.word_index[\"\"]\n", - "\n", - " decoded_words = []\n", - "\n", - " for _ in range(max_length):\n", - " output_tokens, h, c = decoder_model_inf.predict([target_seq] + states_value)\n", - "\n", - " sampled_token_index = np.argmax(output_tokens[0, -1, :])\n", - " sampled_word = index_to_word_question.get(sampled_token_index, '')\n", - "\n", - " # Stop if we encounter an token or a special index\n", - " if end_token and (sampled_word == end_token):\n", - " break\n", - "\n", - " decoded_words.append(sampled_word)\n", - "\n", - " # Next target\n", - " target_seq = np.zeros((1, 1), dtype='int32')\n", - " target_seq[0, 0] = sampled_token_index\n", - "\n", - " states_value = [h, c]\n", - "\n", - " return ' '.join(decoded_words)\n", - "\n", - "# ==========================\n", - "# 9) Test Inference on a Paragraph\n", - "# ==========================\n", - "test_paragraph = \"Albert Einstein was a theoretical physicist born in Germany...\"\n", - "generated = generate_question(test_paragraph)\n", - "print(\"Generated question:\", generated)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "yups 0\n", - "yups 1\n", - "yups 2\n", - "yups 3\n", - "yups 4\n", - "yups 5\n", - "yups 6\n", - "yups 7\n", - "yups 8\n", - "yups 9\n", - "yups 10\n", - "yups 11\n", - "yups 12\n", - 
"yups 13\n", - "yups 14\n", - "yups 15\n", - "yups 16\n", - "yups 17\n", - "yups 18\n", - "yups 19\n" - ] - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.16" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/lstm_multi_output_model.h5 b/lstm_multi_output_model.h5 deleted file mode 100644 index a8248b4..0000000 Binary files a/lstm_multi_output_model.h5 and /dev/null differ diff --git a/lstm_multi_output_model.keras b/lstm_multi_output_model.keras index ad43383..00a6876 100644 Binary files a/lstm_multi_output_model.keras and b/lstm_multi_output_model.keras differ diff --git a/lstm_question_generator.keras b/lstm_question_generator.keras deleted file mode 100644 index 7aae8fa..0000000 Binary files a/lstm_question_generator.keras and /dev/null differ diff --git a/main.py b/main.py deleted file mode 100644 index 32a764b..0000000 --- a/main.py +++ /dev/null @@ -1,123 +0,0 @@ -import pandas as pd -import numpy as np -from sklearn.model_selection import train_test_split -import tensorflow as tf -import pickle -from tensorflow.keras.preprocessing.text import Tokenizer -from tensorflow.keras.preprocessing.sequence import pad_sequences -from tensorflow.keras.models import Sequential -from tensorflow.keras.layers import ( - LSTM, - Embedding, - Dense, - SpatialDropout1D, - TimeDistributed, -) -from tensorflow.keras.optimizers import Adam - - -# 1. 
Load dataset - -df = pd.read_csv("quiz_questions.csv") - -# Pastikan kolom 'paragraph' dan 'question' ada dan tidak kosong -df.dropna(subset=["paragraph", "question"], inplace=True) - - -# 2. Preprocessing text - - -def preprocess_text(text): - # Contoh preprocessing sederhana - text = text.lower() - return text - - -df["paragraph"] = df["paragraph"].astype(str).apply(preprocess_text) -df["question"] = df["question"].astype(str).apply(preprocess_text) - - -# 3. Tokenization - -# Gabung semua teks (paragraph+question) agar vocabulary mencakup kata2 di keduanya -tokenizer = Tokenizer() -tokenizer.fit_on_texts(df["paragraph"].tolist() + df["question"].tolist()) -vocab_size = len(tokenizer.word_index) + 1 # +1 karena index dimulai dari 1 - -# Konversi teks menjadi sequences -X_sequences = tokenizer.texts_to_sequences(df["paragraph"]) -y_sequences = tokenizer.texts_to_sequences(df["question"]) - -# Cari panjang sequence maksimal (agar uniform untuk padding) -max_len_paragraph = max(len(seq) for seq in X_sequences) -max_len_question = max(len(seq) for seq in y_sequences) -max_length = max(max_len_paragraph, max_len_question) - -# Padding sequences (panjangnya disamakan => max_length) -X_padded = pad_sequences(X_sequences, maxlen=max_length, padding="post") -y_padded = pad_sequences(y_sequences, maxlen=max_length, padding="post") - -with open("tokenizer.pkl", "wb") as f: - pickle.dump(tokenizer, f) -print("Tokenizer disimpan ke tokenizer.pkl") - - -# 4. Siapkan X, y - -# Untuk sequence-to-sequence dengan "sparse_categorical_crossentropy", -# idealnya y memiliki shape: (num_samples, max_length, 1) -X = np.array(X_padded) -y = np.expand_dims(np.array(y_padded), axis=-1) - -print("Shape X:", X.shape) -print("Shape y:", y.shape) # (batch_size, max_length, 1) - - -# 5. 
Split data - -X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42 -) - -print("Train size:", X_train.shape, y_train.shape) -print("Test size: ", X_test.shape, y_test.shape) - - -# 6. Build Model LSTM - -# Kita pakai 2 LSTM stack, masing2 return_sequences=True -# Supaya output akhirnya tetap "sequence" (batch_size, max_length, hidden_dim) -model = Sequential() -model.add(Embedding(input_dim=vocab_size, output_dim=128)) -model.add(SpatialDropout1D(0.2)) - -model.add(LSTM(128, return_sequences=True)) -model.add(LSTM(128, return_sequences=True)) - -# TimeDistributed Dense agar Dense diaplikasikan per timestep -model.add(TimeDistributed(Dense(vocab_size, activation="softmax"))) - - -# 7. Compile - -model.compile( - loss="sparse_categorical_crossentropy", - optimizer=Adam(learning_rate=0.001), - metrics=["accuracy"], -) - -model.summary() - - -# 8. Train Model - -epochs = 10 -history = model.fit( - X_train, y_train, epochs=epochs, validation_data=(X_test, y_test), batch_size=32 -) - - -# 9. 
Save Model - -model.save("lstm_question_generator.keras") -print("Training selesai dan model telah disimpan.") diff --git a/normalize_text/normalize.json b/normalize_text/normalize.json new file mode 100644 index 0000000..2abea5a --- /dev/null +++ b/normalize_text/normalize.json @@ -0,0 +1,65 @@ +{ + "yg": "yang", + "gokil": "kocak", + "kalo": "kalau", + "gue": "saya", + "elo": "kamu", + "nih": "ini", + "trs": "terus", + "tdk": "tidak", + "gmna": "bagaimana", + "tp": "tapi", + "jd": "jadi", + "aja": "saja", + "krn": "karena", + "blm": "belum", + "dgn": "dengan", + "skrg": "sekarang", + "msh": "masih", + "lg": "lagi", + "sy": "saya", + "sm": "sama", + "bgt": "banget", + "dr": "dari", + "kpn": "kapan", + "hrs": "harus", + "cm": "cuma", + "sbnrnya": "sebenarnya", + "tdr": "tidur", + "kl": "kalau", + "org": "orang", + "pke": "pakai", + "prnh": "pernah", + "brgkt": "berangkat", + "pdhl": "padahal", + "btw": "ngomong-ngomong", + "dmn": "di mana", + "bsk": "besok", + "td": "tadi", + "dlm": "dalam", + "utk": "untuk", + "spt": "seperti", + "gpp": "tidak apa-apa", + "bs": "bisa", + "jg": "juga", + "dg": "dengan", + "klw": "kalau", + "wkwk": "haha", + "cpt": "cepat", + "knp": "kenapa", + "jgk": "juga", + "plg": "pulang", + "brp": "berapa", + "bkn": "bukan", + "mnt": "minta", + "udh": "sudah", + "sdh": "sudah", + "brkt": "berangkat", + "sprt": "seperti", + "jgn": "jangan", + "mlm": "malam", + "sblm": "sebelum", + "stlh": "setelah", + "mlh": "malah", + "tmn": "teman" +} diff --git a/online mcqs.ipynb b/online mcqs.ipynb new file mode 100644 index 0000000..03a7ac0 --- /dev/null +++ b/online mcqs.ipynb @@ -0,0 +1,265 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "c9142fcb-39a6-42cb-a38c-629ca17c5ac6", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-03-17 14:50:32.718599: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. 
You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2025-03-17 14:50:32.718943: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n", + "2025-03-17 14:50:32.721006: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n", + "2025-03-17 14:50:32.727572: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", + "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", + "E0000 00:00:1742197832.738194 22019 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", + "E0000 00:00:1742197832.741303 22019 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", + "2025-03-17 14:50:32.752422: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + ] + }, + { + "ename": "OSError", + "evalue": "[E050] Can't find model 'en_core_web_md'. 
It doesn't seem to be a Python package or a valid path to a data directory.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 11\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mrandom\u001b[39;00m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;66;03m# Load spaCy model with word vectors\u001b[39;00m\n\u001b[0;32m---> 11\u001b[0m nlp \u001b[38;5;241m=\u001b[39m \u001b[43mspacy\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43men_core_web_md\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# Use \"en_core_web_md\" or \"en_core_web_lg\" for word vectors\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# Function to preprocess text\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mpreprocess_text\u001b[39m(text):\n", + "File \u001b[0;32m/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/spacy/__init__.py:51\u001b[0m, in \u001b[0;36mload\u001b[0;34m(name, vocab, disable, enable, exclude, config)\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mload\u001b[39m(\n\u001b[1;32m 28\u001b[0m name: Union[\u001b[38;5;28mstr\u001b[39m, Path],\n\u001b[1;32m 29\u001b[0m \u001b[38;5;241m*\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 34\u001b[0m config: Union[Dict[\u001b[38;5;28mstr\u001b[39m, Any], Config] \u001b[38;5;241m=\u001b[39m util\u001b[38;5;241m.\u001b[39mSimpleFrozenDict(),\n\u001b[1;32m 35\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Language:\n\u001b[1;32m 36\u001b[0m \u001b[38;5;250m 
\u001b[39m\u001b[38;5;124;03m\"\"\"Load a spaCy model from an installed package or a local path.\u001b[39;00m\n\u001b[1;32m 37\u001b[0m \n\u001b[1;32m 38\u001b[0m \u001b[38;5;124;03m name (str): Package name or model path.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;124;03m RETURNS (Language): The loaded nlp object.\u001b[39;00m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 51\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mutil\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_model\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 52\u001b[0m \u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 53\u001b[0m \u001b[43m \u001b[49m\u001b[43mvocab\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvocab\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 54\u001b[0m \u001b[43m \u001b[49m\u001b[43mdisable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdisable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[43m \u001b[49m\u001b[43menable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43menable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 56\u001b[0m \u001b[43m \u001b[49m\u001b[43mexclude\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexclude\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 57\u001b[0m \u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 58\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/spacy/util.py:472\u001b[0m, in \u001b[0;36mload_model\u001b[0;34m(name, vocab, disable, enable, exclude, config)\u001b[0m\n\u001b[1;32m 470\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m OLD_MODEL_SHORTCUTS:\n\u001b[1;32m 471\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m 
\u001b[38;5;167;01mIOError\u001b[39;00m(Errors\u001b[38;5;241m.\u001b[39mE941\u001b[38;5;241m.\u001b[39mformat(name\u001b[38;5;241m=\u001b[39mname, full\u001b[38;5;241m=\u001b[39mOLD_MODEL_SHORTCUTS[name])) \u001b[38;5;66;03m# type: ignore[index]\u001b[39;00m\n\u001b[0;32m--> 472\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mIOError\u001b[39;00m(Errors\u001b[38;5;241m.\u001b[39mE050\u001b[38;5;241m.\u001b[39mformat(name\u001b[38;5;241m=\u001b[39mname))\n", + "\u001b[0;31mOSError\u001b[0m: [E050] Can't find model 'en_core_web_md'. It doesn't seem to be a Python package or a valid path to a data directory." + ] + } + ], + "source": [ + "import numpy as np\n", + "import tensorflow as tf\n", + "from tensorflow.keras.preprocessing.text import Tokenizer\n", + "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", + "from tensorflow.keras.models import Sequential\n", + "from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout\n", + "import spacy\n", + "import random\n", + "\n", + "# Load spaCy model with word vectors\n", + "nlp = spacy.load(\"en_core_web_md\") # Use \"en_core_web_md\" or \"en_core_web_lg\" for word vectors\n", + "\n", + "# Function to preprocess text\n", + "def preprocess_text(text):\n", + " doc = nlp(text)\n", + " sentences = [sent.text for sent in doc.sents]\n", + " return sentences\n", + "\n", + "# Function to create training data for LSTM\n", + "def create_training_data(sentences, tokenizer, max_length):\n", + " sequences = tokenizer.texts_to_sequences(sentences)\n", + " padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')\n", + " return padded_sequences\n", + "\n", + "# LSTM Model for learning sentence structures\n", + "def build_lstm_model(vocab_size, max_length, embedding_dim):\n", + " model = Sequential([\n", + " Embedding(vocab_size, embedding_dim, input_length=max_length),\n", + " LSTM(128, return_sequences=True),\n", + " Dropout(0.2),\n", + " LSTM(64),\n", + " Dense(64, 
activation='relu'),\n", + " Dense(vocab_size, activation='softmax')\n", + " ])\n", + " model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n", + " return model\n", + "\n", + "# Function to find similar words using spaCy\n", + "def find_similar_words(word, num_similar=3):\n", + " word_token = nlp.vocab[word] if word in nlp.vocab else None\n", + " if not word_token or not word_token.has_vector:\n", + " return [\"[Distractor]\"] * num_similar # Return placeholders if no vector is found\n", + "\n", + " # Compute similarity with other words in vocab\n", + " similarities = []\n", + " for token in nlp.vocab:\n", + " if token.is_alpha and token.has_vector and token != word_token:\n", + " similarity = word_token.similarity(token)\n", + " similarities.append((token.text, similarity))\n", + " \n", + " # Sort and return top similar words\n", + " similarities.sort(key=lambda x: x[1], reverse=True)\n", + " return [word for word, _ in similarities[:num_similar]]\n", + "\n", + "# Function to generate MCQs using LSTM and spaCy word embeddings\n", + "def generate_mcqs_lstm(text, tokenizer, max_length, model, num_questions=5):\n", + " sentences = preprocess_text(text)\n", + " selected_sentences = random.sample(sentences, min(num_questions, len(sentences)))\n", + "\n", + " mcqs = []\n", + " for sentence in selected_sentences:\n", + " doc = nlp(sentence)\n", + " nouns = [token.text for token in doc if token.pos_ == \"NOUN\"]\n", + " if len(nouns) < 1:\n", + " continue\n", + "\n", + " subject = random.choice(nouns)\n", + " question_stem = sentence.replace(subject, \"______\")\n", + "\n", + " # Generate similar words using spaCy\n", + " similar_words = find_similar_words(subject, num_similar=3)\n", + "\n", + " answer_choices = [subject] + similar_words\n", + " random.shuffle(answer_choices)\n", + " correct_answer = chr(65 + answer_choices.index(subject))\n", + "\n", + " mcqs.append((question_stem, answer_choices, correct_answer))\n", + "\n", + " 
return mcqs\n", + "\n", + "# Example usage\n", + "text = \"\"\"Deep learning is a subset of machine learning that uses neural networks. LSTMs are useful for processing sequential data like text. \n", + "Natural language processing involves techniques like tokenization and named entity recognition.\"\"\"\n", + "\n", + "# Tokenizer setup\n", + "tokenizer = Tokenizer()\n", + "tokenizer.fit_on_texts(preprocess_text(text))\n", + "vocab_size = len(tokenizer.word_index) + 1\n", + "max_length = 20\n", + "\n", + "# Train LSTM model (Note: Training requires large datasets)\n", + "model = build_lstm_model(vocab_size, max_length, embedding_dim=100)\n", + "\n", + "# Generate MCQs\n", + "mcqs = generate_mcqs_lstm(text, tokenizer, max_length, model, num_questions=3)\n", + "for i, (q, choices, ans) in enumerate(mcqs, 1):\n", + " print(f\"Q{i}: {q}\")\n", + " print(f\" A) {choices[0]} B) {choices[1]} C) {choices[2]} D) {choices[3]}\")\n", + " print(f\"Correct Answer: {ans}\\n\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62aae7fc-b921-4439-8396-62d7fd8d25d5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting en-core-web-md==3.8.0\n", + " Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)\n", + " ---------------------------------------- 0.0/33.5 MB ? 
eta -:--:--\n", + " --------------------------------------- 0.5/33.5 MB 4.2 MB/s eta 0:00:08\n", + " -- ------------------------------------- 1.8/33.5 MB 5.6 MB/s eta 0:00:06\n", + " --- ------------------------------------ 3.1/33.5 MB 5.8 MB/s eta 0:00:06\n", + " ----- ---------------------------------- 4.2/33.5 MB 5.9 MB/s eta 0:00:05\n", + " ------ --------------------------------- 5.2/33.5 MB 5.5 MB/s eta 0:00:06\n", + " ------- -------------------------------- 6.6/33.5 MB 5.6 MB/s eta 0:00:05\n", + " --------- ------------------------------ 7.6/33.5 MB 5.6 MB/s eta 0:00:05\n", + " ---------- ----------------------------- 8.4/33.5 MB 5.4 MB/s eta 0:00:05\n", + " ----------- ---------------------------- 9.7/33.5 MB 5.5 MB/s eta 0:00:05\n", + " ------------ --------------------------- 10.7/33.5 MB 5.5 MB/s eta 0:00:05\n", + " -------------- ------------------------- 12.1/33.5 MB 5.5 MB/s eta 0:00:04\n", + " --------------- ------------------------ 13.1/33.5 MB 5.5 MB/s eta 0:00:04\n", + " ---------------- ----------------------- 14.2/33.5 MB 5.5 MB/s eta 0:00:04\n", + " ------------------ --------------------- 15.2/33.5 MB 5.4 MB/s eta 0:00:04\n", + " ------------------- -------------------- 16.3/33.5 MB 5.4 MB/s eta 0:00:04\n", + " -------------------- ------------------- 17.6/33.5 MB 5.4 MB/s eta 0:00:03\n", + " ---------------------- ----------------- 18.9/33.5 MB 5.5 MB/s eta 0:00:03\n", + " ------------------------ --------------- 20.2/33.5 MB 5.5 MB/s eta 0:00:03\n", + " ------------------------- -------------- 21.8/33.5 MB 5.6 MB/s eta 0:00:03\n", + " --------------------------- ------------ 23.1/33.5 MB 5.6 MB/s eta 0:00:02\n", + " ---------------------------- ----------- 24.1/33.5 MB 5.7 MB/s eta 0:00:02\n", + " ------------------------------ --------- 25.4/33.5 MB 5.7 MB/s eta 0:00:02\n", + " ------------------------------- -------- 26.5/33.5 MB 5.6 MB/s eta 0:00:02\n", + " -------------------------------- ------- 27.5/33.5 MB 5.6 MB/s eta 0:00:02\n", + 
" ---------------------------------- ----- 28.8/33.5 MB 5.6 MB/s eta 0:00:01\n", + " ----------------------------------- ---- 29.9/33.5 MB 5.6 MB/s eta 0:00:01\n", + " ------------------------------------ --- 30.9/33.5 MB 5.6 MB/s eta 0:00:01\n", + " -------------------------------------- - 32.0/33.5 MB 5.6 MB/s eta 0:00:01\n", + " --------------------------------------- 33.0/33.5 MB 5.5 MB/s eta 0:00:01\n", + " --------------------------------------- 33.3/33.5 MB 5.5 MB/s eta 0:00:01\n", + " ---------------------------------------- 33.5/33.5 MB 5.4 MB/s eta 0:00:00\n", + "Installing collected packages: en-core-web-md\n", + "Successfully installed en-core-web-md-3.8.0\n", + "\u001b[38;5;2m[+] Download and installation successful\u001b[0m\n", + "You can now load the package via spacy.load('en_core_web_md')\n" + ] + } + ], + "source": [ + "!python -m spacy download en_core_web_md\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "703acaf0-e703-47ae-b4d2-56cd7236fbd4", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc979d1c-2756-41b6-96de-6c76f2bd5f96", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f22536a-3967-486c-a6f7-bd677199800a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "307af48e-a684-4e85-b2df-e963c43ad07c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bec7b11b-7f3a-4a9e-a568-2e382caaa004", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5cd5ef2-c48f-4bd0-bf42-12865cc77149", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "myenv", + "language": "python", + "name": "python3" + }, + "language_info": { + 
"codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/question_padded.npy b/question_padded.npy deleted file mode 100644 index b8bad3c..0000000 Binary files a/question_padded.npy and /dev/null differ diff --git a/question_type_labels.npy b/question_type_labels.npy deleted file mode 100644 index 6db1732..0000000 Binary files a/question_type_labels.npy and /dev/null differ diff --git a/tokenizer.pkl b/tokenizer.pkl index e793ef0..8e5c4a3 100644 Binary files a/tokenizer.pkl and b/tokenizer.pkl differ diff --git a/training_model.ipynb b/training_model.ipynb index de09828..23827d8 100644 --- a/training_model.ipynb +++ b/training_model.ipynb @@ -2,46 +2,35 @@ "cells": [ { "cell_type": "code", - "execution_count": 22, + "execution_count": 112, "metadata": {}, "outputs": [], "source": [ - "# import library\n", "\n", - "# Data manipulation and visualization\n", - "import pandas as pd\n", "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "import json\n", - "\n", - "# Natural language processing\n", "import re\n", "import string\n", "import nltk\n", "from nltk.corpus import stopwords\n", "from nltk.tokenize import word_tokenize\n", - "from nltk.stem import WordNetLemmatizer\n", + "import matplotlib.pyplot as plt\n", "\n", - "# Deep learning\n", "from tensorflow.keras.preprocessing.text import Tokenizer\n", "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", "from tensorflow.keras.models import Model\n", - "from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate\n", - "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n", - "\n", + "from tensorflow.keras.layers import Input, Embedding, LSTM, Dense\n", "\n", + "from Sastrawi.Stemmer.StemmerFactory import 
StemmerFactory\n", "from sklearn.model_selection import train_test_split\n", + "import pickle\n", "\n", - "# Metrics for model evaluation\n", - "from sklearn.metrics import classification_report, precision_score, recall_score, accuracy_score\n", - "\n", - "# Utility for serialization\n", - "import pickle\n" + "from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score\n", + "import nltk" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 113, "metadata": {}, "outputs": [ { @@ -64,7 +53,7 @@ "True" ] }, - "execution_count": 23, + "execution_count": 113, "metadata": {}, "output_type": "execute_result" } @@ -79,131 +68,69 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 114, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " context \\\n", - "0 Albert Einstein adalah fisikawan teoretis kela... \n", - "1 Samudra Pasifik adalah yang terbesar dan terda... \n", - "2 Proklamasi Kemerdekaan Indonesia dibacakan pad... \n", - "3 Hukum Newton adalah tiga hukum fisika yang men... \n", - "4 Budi Utomo adalah organisasi pemuda yang didir... \n", - "\n", - " qa_pairs \n", - "0 [{'type': 'fill_in_the_blank', 'question': 'Si... \n", - "1 [{'type': 'fill_in_the_blank', 'question': 'Sa... \n", - "2 [{'type': 'fill_in_the_blank', 'question': 'Pr... \n", - "3 [{'type': 'fill_in_the_blank', 'question': 'Hu... \n", - "4 [{'type': 'fill_in_the_blank', 'question': 'Bu... 
\n", - "\n", - "Total Context: 49\n", - "Total QA Pairs: 95\n" + "Total Context: 54\n", + "Total Possibility Questions: 97\n", + "Total Fill in the Blank Questions: 24\n", + "Total Multiple Choice Questions: 29\n", + "Total True/False Questions: 44\n" ] } ], "source": [ - "# load dataset\n", - "df = pd.read_json(\"dataset/training_dataset.json\")\n", - "print(df.head())\n", - "with open(\"dataset/training_dataset.json\", \"r\", encoding=\"utf-8\") as file:\n", + "import json\n", + "from collections import defaultdict\n", + "\n", + "# path dataset\n", + "file_path = \"dataset/training_dataset.json\"\n", + "\n", + "with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", " dataset = json.load(file)\n", - " \n", - " \n", - "# Menghitung total context\n", + "\n", "total_context = len(dataset)\n", "\n", - "# Menghitung total qa_pairs\n", - "total_qa_pairs = sum(len(entry[\"qa_pairs\"]) for entry in dataset)\n", + "total_question_posibility = sum(len(entry[\"question_posibility\"]) for entry in dataset)\n", "\n", - "# Menampilkan hasil\n", - "print(f\"\\nTotal Context: {total_context}\")\n", - "print(f\"Total QA Pairs: {total_qa_pairs}\")" + "question_type_counts = defaultdict(int)\n", + "for entry in dataset:\n", + " for question in entry[\"question_posibility\"]:\n", + " question_type_counts[question[\"type\"]] += 1\n", + "\n", + "print(f\"Total Context: {total_context}\")\n", + "print(f\"Total Possibility Questions: {total_question_posibility}\")\n", + "print(f\"Total Fill in the Blank Questions: {question_type_counts.get('fill_in_the_blank', 0)}\")\n", + "print(f\"Total Multiple Choice Questions: {question_type_counts.get('multiple_choice', 0)}\")\n", + "print(f\"Total True/False Questions: {question_type_counts.get('true_false', 0)}\")\n" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 115, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data processing complete!\n", + 
"Samples: 97\n" + ] + } + ], "source": [ "# Text Preprocessing\n", "stop_words = set(stopwords.words(\"indonesian\")) \n", - "lemmatizer = WordNetLemmatizer()\n", + "factory = StemmerFactory()\n", + "stemmer = factory.create_stemmer()\n", "\n", - "normalization_dict = {\n", - " \"yg\": \"yang\",\n", - " \"gokil\": \"kocak\",\n", - " \"kalo\": \"kalau\",\n", - " \"gue\": \"saya\",\n", - " \"elo\": \"kamu\",\n", - " \"nih\": \"ini\",\n", - " \"trs\": \"terus\",\n", - " \"tdk\": \"tidak\",\n", - " \"gmna\": \"bagaimana\",\n", - " \"tp\": \"tapi\",\n", - " \"jd\": \"jadi\",\n", - " \"aja\": \"saja\",\n", - " \"krn\": \"karena\",\n", - " \"blm\": \"belum\",\n", - " \"dgn\": \"dengan\",\n", - " \"skrg\": \"sekarang\",\n", - " \"msh\": \"masih\",\n", - " \"lg\": \"lagi\",\n", - " \"sy\": \"saya\",\n", - " \"sm\": \"sama\",\n", - " \"bgt\": \"banget\",\n", - " \"dr\": \"dari\",\n", - " \"kpn\": \"kapan\",\n", - " \"hrs\": \"harus\",\n", - " \"cm\": \"cuma\",\n", - " \"sbnrnya\": \"sebenarnya\",\n", - " \"tdr\": \"tidur\",\n", - " \"tdk\": \"tidak\",\n", - " \"kl\": \"kalau\",\n", - " \"org\": \"orang\",\n", - " \"pke\": \"pakai\",\n", - " \"prnh\": \"pernah\",\n", - " \"brgkt\": \"berangkat\",\n", - " \"pdhl\": \"padahal\",\n", - " \"btw\": \"ngomong-ngomong\",\n", - " \"dmn\": \"di mana\",\n", - " \"bsk\": \"besok\",\n", - " \"td\": \"tadi\",\n", - " \"dlm\": \"dalam\",\n", - " \"utk\": \"untuk\",\n", - " \"spt\": \"seperti\",\n", - " \"gpp\": \"tidak apa-apa\",\n", - " \"bs\": \"bisa\",\n", - " \"jg\": \"juga\",\n", - " \"tp\": \"tapi\",\n", - " \"dg\": \"dengan\",\n", - " \"klw\": \"kalau\",\n", - " \"wkwk\": \"haha\",\n", - " \"cpt\": \"cepat\",\n", - " \"knp\": \"kenapa\",\n", - " \"jgk\": \"juga\",\n", - " \"plg\": \"pulang\",\n", - " \"brp\": \"berapa\",\n", - " \"bkn\": \"bukan\",\n", - " \"mnt\": \"minta\",\n", - " \"udh\": \"sudah\",\n", - " \"sdh\": \"sudah\",\n", - " \"brkt\": \"berangkat\",\n", - " \"btw\": \"by the way\",\n", - " \"tdk\": \"tidak\",\n", - " 
\"sprt\": \"seperti\",\n", - " \"jgn\": \"jangan\",\n", - " \"mlm\": \"malam\",\n", - " \"sblm\": \"sebelum\",\n", - " \"stlh\": \"setelah\",\n", - " \"tdr\": \"tidur\",\n", - " \"mlh\": \"malah\",\n", - " \"tmn\": \"teman\",\n", - "}\n", + "with open(\"normalize_text/normalize.json\", \"r\", encoding=\"utf-8\") as file:\n", + " normalization_dict = json.load(file)\n", "\n", "\n", "def text_preprocessing(text):\n", @@ -219,117 +146,98 @@ " # word tokenize \n", " tokens = word_tokenize(text)\n", " \n", + " \n", " # normalassi kata\n", " tokens = [normalization_dict[word] if word in normalization_dict else word for word in tokens] \n", " \n", " \n", - " # lemmatization\n", - " tokens = [lemmatizer.lemmatize(word) for word in tokens] \n", + " # Lemmatization using Sastrawi (stemming in Indonesian)\n", + " tokens = [stemmer.stem(word) for word in tokens]\n", " \n", " # stopword removal\n", " tokens = [word for word in tokens if word not in stop_words] \n", " \n", - " return tokens\n" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Data processing complete!\n", - "Samples: 95\n" - ] - } - ], - "source": [ - "# with open(\"dataset/training_dataset.json\", \"r\", encoding=\"utf-8\") as file:\n", - "# dataset = json.load(file)\n", + " return tokens\n", "\n", - "# === Extract Data so that each QA pair has its own context === #\n", + "# text processing all data training\n", "contexts = []\n", "questions = []\n", - "answers = []\n", + "correct_answers = []\n", + "wrong_answers = []\n", "question_types = []\n", "\n", "for entry in dataset:\n", " processed_context = text_preprocessing(entry[\"context\"])\n", - " for qa in entry[\"qa_pairs\"]:\n", + " \n", + " for qa in entry[\"question_posibility\"]:\n", + " processed_question = text_preprocessing(qa[\"question\"])\n", + " processed_answer = text_preprocessing(qa[\"answer\"])\n", + " \n", " 
contexts.append(processed_context)\n", - " questions.append(text_preprocessing(qa[\"question\"]))\n", - " answers.append(text_preprocessing(qa[\"answer\"]))\n", + " questions.append(processed_question)\n", + " correct_answers.append(processed_answer)\n", " question_types.append(qa[\"type\"])\n", "\n", - "# === Initialize Tokenizer and fit on all text === #\n", - "tokenizer = Tokenizer(oov_token=\"\")\n", - "tokenizer.fit_on_texts(contexts + questions + answers)\n", + " if qa[\"type\"] == \"multiple_choice\":\n", + " incorrect_options = [opt for opt in qa[\"options\"] if opt != qa[\"answer\"]]\n", + " wrong_answers.append(incorrect_options)\n", + " else:\n", + " wrong_answers.append([])\n", + "\n", + "tokenizer = Tokenizer(oov_token=\"\")\n", + "tokenizer.fit_on_texts(contexts + questions + correct_answers)\n", + "\n", "\n", - "# === Convert Text to Sequences === #\n", "context_sequences = tokenizer.texts_to_sequences(contexts)\n", "question_sequences = tokenizer.texts_to_sequences(questions)\n", - "answer_sequences = tokenizer.texts_to_sequences(answers)\n", + "answer_sequences = tokenizer.texts_to_sequences(correct_answers)\n", "\n", - "# === Define Max Length for Padding === #\n", "MAX_LENGTH = 100\n", "context_padded = pad_sequences(context_sequences, maxlen=MAX_LENGTH, padding=\"post\", truncating=\"post\")\n", "question_padded = pad_sequences(question_sequences, maxlen=MAX_LENGTH, padding=\"post\", truncating=\"post\")\n", "answer_padded = pad_sequences(answer_sequences, maxlen=MAX_LENGTH, padding=\"post\", truncating=\"post\")\n", "\n", - "# Encode Question Types\n", "question_type_dict = {\"fill_in_the_blank\": 0, \"true_false\": 1, \"multiple_choice\": 2}\n", "question_type_labels = np.array([question_type_dict[q_type] for q_type in question_types])\n", "\n", - "# Save the processed data (optional)\n", - "np.save(\"context_padded.npy\", context_padded)\n", - "np.save(\"question_padded.npy\", question_padded)\n", - "np.save(\"answer_padded.npy\", 
answer_padded)\n", - "np.save(\"question_type_labels.npy\", question_type_labels)\n", - "with open(\"tokenizer.pkl\", \"wb\") as handle:\n", - " pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)\n", - "\n", - "print(\"✅ Data processing complete!\")\n", - "print(\"Samples:\", context_padded.shape[0]) # This should now match the number of QA pairs\n" + "print(\"Data processing complete!\")\n", + "print(\"Samples:\", context_padded.shape[0]) \n" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 116, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Training samples: 76\n", - "Testing samples: 19\n" + "Training samples: 77\n", + "Testing samples: 10\n", + "Validation samples: 10\n" ] } ], "source": [ - "# === Split Data into Training and Testing Sets === #\n", - "(context_train, context_test,\n", - " question_train, question_test,\n", - " answer_train, answer_test,\n", - " qtype_train, qtype_test) = train_test_split(\n", - " context_padded,\n", - " question_padded,\n", - " answer_padded,\n", - " question_type_labels,\n", - " test_size=0.2,\n", - " random_state=42\n", + "# split text for testing 8:2\n", + "context_train, context_temp, question_train, question_temp, answer_train, answer_temp, qtype_train, qtype_temp = train_test_split(\n", + " context_padded, question_padded, answer_padded, question_type_labels, test_size=0.2, random_state=42\n", + ")\n", + "\n", + "# split dataset testing and validation 5:5\n", + "context_test, context_val, question_test, question_val, answer_test, answer_val, qtype_test, qtype_val = train_test_split(\n", + " context_temp, question_temp, answer_temp, qtype_temp, test_size=0.5, random_state=42\n", ")\n", "\n", "print(\"Training samples:\", context_train.shape[0])\n", - "print(\"Testing samples:\", context_test.shape[0])\n" + "print(\"Testing samples:\", context_test.shape[0])\n", + "print(\"Validation samples:\", context_val.shape[0])\n" ] }, { "cell_type": 
"code", - "execution_count": 28, + "execution_count": 120, "metadata": {}, "outputs": [ { @@ -337,131 +245,195 @@ "output_type": "stream", "text": [ "Epoch 1/10\n", - "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 681ms/step - answer_output_accuracy: 0.0297 - answer_output_loss: 6.2965 - loss: 13.6985 - question_output_accuracy: 0.0000e+00 - question_output_loss: 6.3022 - question_type_output_accuracy: 0.2625 - question_type_output_loss: 1.0992 - val_answer_output_accuracy: 0.2044 - val_answer_output_loss: 6.2629 - val_loss: 13.6638 - val_question_output_accuracy: 0.0069 - val_question_output_loss: 6.2961 - val_question_type_output_accuracy: 0.2500 - val_question_type_output_loss: 1.1048\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 1s/step - answer_output_accuracy: 0.0344 - answer_output_loss: 6.2090 - loss: 13.5239 - question_output_accuracy: 0.0000e+00 - question_output_loss: 6.2154 - question_type_output_accuracy: 0.3004 - question_type_output_loss: 1.0991 - val_answer_output_accuracy: 0.2287 - val_answer_output_loss: 6.1669 - val_loss: 13.4815 - val_question_output_accuracy: 0.0050 - val_question_output_loss: 6.2101 - val_question_type_output_accuracy: 0.3125 - val_question_type_output_loss: 1.1046\n", "Epoch 2/10\n", - "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 305ms/step - answer_output_accuracy: 0.2145 - answer_output_loss: 6.2378 - loss: 13.6041 - question_output_accuracy: 0.0127 - question_output_loss: 6.2865 - question_type_output_accuracy: 0.6076 - question_type_output_loss: 1.0785 - val_answer_output_accuracy: 0.9844 - val_answer_output_loss: 6.1644 - val_loss: 13.5630 - val_question_output_accuracy: 0.0100 - val_question_output_loss: 6.2887 - val_question_type_output_accuracy: 0.3750 - val_question_type_output_loss: 1.1100\n", + "\u001b[1m2/2\u001b[0m 
\u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 526ms/step - answer_output_accuracy: 0.2277 - answer_output_loss: 6.1421 - loss: 13.4196 - question_output_accuracy: 0.0113 - question_output_loss: 6.1984 - question_type_output_accuracy: 0.6445 - question_type_output_loss: 1.0780 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 6.0462 - val_loss: 13.3570 - val_question_output_accuracy: 0.0081 - val_question_output_loss: 6.2031 - val_question_type_output_accuracy: 0.3125 - val_question_type_output_loss: 1.1076\n", "Epoch 3/10\n", - "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 297ms/step - answer_output_accuracy: 0.9846 - answer_output_loss: 6.0887 - loss: 13.4154 - question_output_accuracy: 0.0168 - question_output_loss: 6.2692 - question_type_output_accuracy: 0.5542 - question_type_output_loss: 1.0537 - val_answer_output_accuracy: 0.9844 - val_answer_output_loss: 5.8030 - val_loss: 13.2110 - val_question_output_accuracy: 0.0094 - val_question_output_loss: 6.2794 - val_question_type_output_accuracy: 0.4375 - val_question_type_output_loss: 1.1287\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 528ms/step - answer_output_accuracy: 0.9837 - answer_output_loss: 5.9539 - loss: 13.1879 - question_output_accuracy: 0.0171 - question_output_loss: 6.1802 - question_type_output_accuracy: 0.5799 - question_type_output_loss: 1.0503 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 5.5439 - val_loss: 12.8565 - val_question_output_accuracy: 0.0087 - val_question_output_loss: 6.1941 - val_question_type_output_accuracy: 0.5000 - val_question_type_output_loss: 1.1185\n", "Epoch 4/10\n", - "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 300ms/step - answer_output_accuracy: 0.9853 - answer_output_loss: 5.4983 - loss: 12.7772 - question_output_accuracy: 0.0143 
- question_output_loss: 6.2471 - question_type_output_accuracy: 0.5111 - question_type_output_loss: 1.0153 - val_answer_output_accuracy: 0.9844 - val_answer_output_loss: 3.9937 - val_loss: 11.4697 - val_question_output_accuracy: 0.0050 - val_question_output_loss: 6.2620 - val_question_type_output_accuracy: 0.4375 - val_question_type_output_loss: 1.2140\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 533ms/step - answer_output_accuracy: 0.9839 - answer_output_loss: 5.1228 - loss: 12.2985 - question_output_accuracy: 0.0137 - question_output_loss: 6.1532 - question_type_output_accuracy: 0.5164 - question_type_output_loss: 1.0060 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 3.2875 - val_loss: 10.6708 - val_question_output_accuracy: 0.0050 - val_question_output_loss: 6.1772 - val_question_type_output_accuracy: 0.5000 - val_question_type_output_loss: 1.2060\n", "Epoch 5/10\n", - "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 308ms/step - answer_output_accuracy: 0.9855 - answer_output_loss: 3.4938 - loss: 10.7212 - question_output_accuracy: 0.0046 - question_output_loss: 6.1959 - question_type_output_accuracy: 0.4903 - question_type_output_loss: 1.0066 - val_answer_output_accuracy: 0.9844 - val_answer_output_loss: 1.4529 - val_loss: 9.4832 - val_question_output_accuracy: 0.0037 - val_question_output_loss: 6.2063 - val_question_type_output_accuracy: 0.4375 - val_question_type_output_loss: 1.8240\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 520ms/step - answer_output_accuracy: 0.9835 - answer_output_loss: 2.7939 - loss: 9.9397 - question_output_accuracy: 0.0056 - question_output_loss: 6.0862 - question_type_output_accuracy: 0.5263 - question_type_output_loss: 1.0473 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 1.1028 - val_loss: 9.0601 - 
val_question_output_accuracy: 0.0012 - val_question_output_loss: 6.1277 - val_question_type_output_accuracy: 0.5000 - val_question_type_output_loss: 1.8296\n", "Epoch 6/10\n", - "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 296ms/step - answer_output_accuracy: 0.9853 - answer_output_loss: 1.4168 - loss: 8.7444 - question_output_accuracy: 0.0029 - question_output_loss: 6.0149 - question_type_output_accuracy: 0.5007 - question_type_output_loss: 1.3046 - val_answer_output_accuracy: 0.9844 - val_answer_output_loss: 0.7223 - val_loss: 8.8975 - val_question_output_accuracy: 0.0019 - val_question_output_loss: 6.1260 - val_question_type_output_accuracy: 0.2500 - val_question_type_output_loss: 2.0492\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 541ms/step - answer_output_accuracy: 0.9828 - answer_output_loss: 1.2315 - loss: 8.3718 - question_output_accuracy: 0.0016 - question_output_loss: 5.8773 - question_type_output_accuracy: 0.5055 - question_type_output_loss: 1.2478 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 0.6227 - val_loss: 8.6339 - val_question_output_accuracy: 0.0012 - val_question_output_loss: 6.0831 - val_question_type_output_accuracy: 0.1250 - val_question_type_output_loss: 1.9281\n", "Epoch 7/10\n", - "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 316ms/step - answer_output_accuracy: 0.9855 - answer_output_loss: 0.7297 - loss: 7.7458 - question_output_accuracy: 0.0020 - question_output_loss: 5.7405 - question_type_output_accuracy: 0.5653 - question_type_output_loss: 1.2641 - val_answer_output_accuracy: 0.9844 - val_answer_output_loss: 0.6319 - val_loss: 8.0634 - val_question_output_accuracy: 0.0019 - val_question_output_loss: 6.0919 - val_question_type_output_accuracy: 0.3125 - val_question_type_output_loss: 1.3396\n", + "\u001b[1m2/2\u001b[0m 
\u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 492ms/step - answer_output_accuracy: 0.9842 - answer_output_loss: 0.7375 - loss: 7.4714 - question_output_accuracy: 9.6824e-04 - question_output_loss: 5.5770 - question_type_output_accuracy: 0.4612 - question_type_output_loss: 1.1578 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 0.5788 - val_loss: 7.9850 - val_question_output_accuracy: 0.0012 - val_question_output_loss: 6.1148 - val_question_type_output_accuracy: 0.1250 - val_question_type_output_loss: 1.2913\n", "Epoch 8/10\n", - "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 300ms/step - answer_output_accuracy: 0.9852 - answer_output_loss: 0.6516 - loss: 7.1078 - question_output_accuracy: 0.0025 - question_output_loss: 5.5006 - question_type_output_accuracy: 0.5424 - question_type_output_loss: 0.9515 - val_answer_output_accuracy: 0.9844 - val_answer_output_loss: 0.6445 - val_loss: 7.8441 - val_question_output_accuracy: 0.0031 - val_question_output_loss: 6.1414 - val_question_type_output_accuracy: 0.3125 - val_question_type_output_loss: 1.0582\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 455ms/step - answer_output_accuracy: 0.9847 - answer_output_loss: 0.6731 - loss: 6.9870 - question_output_accuracy: 0.0011 - question_output_loss: 5.3263 - question_type_output_accuracy: 0.5596 - question_type_output_loss: 0.9895 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 0.6030 - val_loss: 7.8753 - val_question_output_accuracy: 0.0012 - val_question_output_loss: 6.2693 - val_question_type_output_accuracy: 0.5000 - val_question_type_output_loss: 1.0031\n", "Epoch 9/10\n", - "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 301ms/step - answer_output_accuracy: 0.9855 - answer_output_loss: 0.5892 - loss: 6.9181 - question_output_accuracy: 0.0027 - 
question_output_loss: 5.2758 - question_type_output_accuracy: 0.4993 - question_type_output_loss: 1.0541 - val_answer_output_accuracy: 0.9844 - val_answer_output_loss: 0.6490 - val_loss: 7.9863 - val_question_output_accuracy: 0.0031 - val_question_output_loss: 6.2535 - val_question_type_output_accuracy: 0.4375 - val_question_type_output_loss: 1.0839\n", + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 458ms/step - answer_output_accuracy: 0.9836 - answer_output_loss: 0.7391 - loss: 6.9393 - question_output_accuracy: 0.0017 - question_output_loss: 5.0887 - question_type_output_accuracy: 0.4841 - question_type_output_loss: 1.1123 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 0.6056 - val_loss: 8.1353 - val_question_output_accuracy: 0.0019 - val_question_output_loss: 6.4616 - val_question_type_output_accuracy: 0.5000 - val_question_type_output_loss: 1.0682\n", "Epoch 10/10\n", - "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 296ms/step - answer_output_accuracy: 0.9854 - answer_output_loss: 0.5727 - loss: 6.6346 - question_output_accuracy: 0.0027 - question_output_loss: 5.0879 - question_type_output_accuracy: 0.5319 - question_type_output_loss: 0.9754 - val_answer_output_accuracy: 0.9844 - val_answer_output_loss: 0.6354 - val_loss: 8.3545 - val_question_output_accuracy: 0.0037 - val_question_output_loss: 6.4062 - val_question_type_output_accuracy: 0.2500 - val_question_type_output_loss: 1.3129\n", - "✅ Model training complete and saved!\n" + "\u001b[1m2/2\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 454ms/step - answer_output_accuracy: 0.9847 - answer_output_loss: 0.6727 - loss: 6.6312 - question_output_accuracy: 0.0018 - question_output_loss: 4.9620 - question_type_output_accuracy: 0.5258 - question_type_output_loss: 1.0078 - val_answer_output_accuracy: 0.9856 - val_answer_output_loss: 0.5869 - 
val_loss: 8.5074 - val_question_output_accuracy: 0.0037 - val_question_output_loss: 6.6207 - val_question_type_output_accuracy: 0.3750 - val_question_type_output_loss: 1.2998\n" ] } ], "source": [ - "# === Model Hyperparameters === #\n", + "\n", "VOCAB_SIZE = len(tokenizer.word_index) + 1\n", "EMBEDDING_DIM = 300\n", "LSTM_UNITS = 256\n", "BATCH_SIZE = 32\n", "EPOCHS = 10\n", "\n", - "# === Build Model === #\n", - "# Encoder for Context\n", "context_input = Input(shape=(MAX_LENGTH,), name=\"context_input\")\n", "context_embedding = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, mask_zero=True, name=\"context_embedding\")(context_input)\n", "encoder_lstm = LSTM(LSTM_UNITS, return_state=True, name=\"encoder_lstm\")\n", "encoder_output, state_h, state_c = encoder_lstm(context_embedding)\n", "\n", - "# Decoder for Question (Teacher Forcing)\n", + "# Question Decoder\n", "question_decoder_input = Input(shape=(MAX_LENGTH,), name=\"question_decoder_input\")\n", "question_embedding = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, mask_zero=True, name=\"question_embedding\")(question_decoder_input)\n", "question_lstm = LSTM(LSTM_UNITS, return_sequences=True, return_state=True, name=\"question_lstm\")\n", "question_output, _, _ = question_lstm(question_embedding, initial_state=[state_h, state_c])\n", "question_dense = Dense(VOCAB_SIZE, activation=\"softmax\", name=\"question_output\")(question_output)\n", "\n", - "# Decoder for Answer\n", + "# Answer Decoder\n", "answer_lstm = LSTM(LSTM_UNITS, return_sequences=True, return_state=True, name=\"answer_lstm\")\n", "answer_output, _, _ = answer_lstm(context_embedding, initial_state=[state_h, state_c])\n", "answer_dense = Dense(VOCAB_SIZE, activation=\"softmax\", name=\"answer_output\")(answer_output)\n", "\n", - "# Classification Output for Question Type\n", "type_dense = Dense(128, activation=\"relu\")(encoder_output)\n", "question_type_output = Dense(3, activation=\"softmax\", 
name=\"question_type_output\")(type_dense)\n", "\n", - "# Construct the Model\n", "model = Model(\n", " inputs=[context_input, question_decoder_input],\n", - " outputs=[question_dense, answer_dense, question_type_output],\n", + " outputs=[question_dense, answer_dense, question_type_output]\n", ")\n", "\n", - "# === Compile the Model === #\n", "model.compile(\n", " optimizer=\"adam\",\n", - " loss={\n", - " \"question_output\": \"sparse_categorical_crossentropy\",\n", - " \"answer_output\": \"sparse_categorical_crossentropy\",\n", - " \"question_type_output\": \"sparse_categorical_crossentropy\",\n", - " },\n", - " metrics={\n", - " \"question_output\": [\"accuracy\"],\n", - " \"answer_output\": [\"accuracy\"],\n", - " \"question_type_output\": [\"accuracy\"],\n", - " },\n", + " loss={\"question_output\": \"sparse_categorical_crossentropy\",\n", + " \"answer_output\": \"sparse_categorical_crossentropy\",\n", + " \"question_type_output\": \"sparse_categorical_crossentropy\"},\n", + " metrics={\"question_output\": [\"accuracy\"],\n", + " \"answer_output\": [\"accuracy\"],\n", + " \"question_type_output\": [\"accuracy\"]}\n", ")\n", "\n", - "# === Train the Model === #\n", - "model.fit(\n", + "data_model = model.fit(\n", " [context_train, question_train],\n", - " {\n", - " \"question_output\": question_train,\n", - " \"answer_output\": answer_train,\n", - " \"question_type_output\": qtype_train,\n", - " },\n", + " {\"question_output\": question_train, \"answer_output\": answer_train, \"question_type_output\": qtype_train},\n", " batch_size=BATCH_SIZE,\n", " epochs=EPOCHS,\n", - " validation_split=0.2,\n", + " validation_split=0.2\n", ")\n", "\n", - "# Save the Model\n", "model.save(\"lstm_multi_output_model.keras\")\n", - "print(\"✅ Model training complete and saved!\")" + "with open(\"tokenizer.pkl\", \"wb\") as handle:\n", + " pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)\n" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 
121, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "# data epoch\n", + "\n", + "plt.figure(figsize=(12, 6))\n", + "\n", + "plt.subplot(1, 2, 1)\n", + "plt.plot(data_model.history['question_output_accuracy'], label='Question Train Accuracy')\n", + "plt.plot(data_model.history['val_question_output_accuracy'], label='Question Val Accuracy')\n", + "plt.plot(data_model.history['answer_output_accuracy'], label='Answer Train Accuracy')\n", + "plt.plot(data_model.history['val_answer_output_accuracy'], label='Answer Val Accuracy')\n", + "plt.plot(data_model.history['question_type_output_accuracy'], label='Question Type Train Accuracy')\n", + "plt.plot(data_model.history['val_question_type_output_accuracy'], label='Question Type Val Accuracy')\n", + "plt.title('Model Accuracy')\n", + "plt.xlabel('Epoch')\n", + "plt.ylabel('Accuracy')\n", + "plt.legend()\n", + "\n", + "plt.subplot(1, 2, 2)\n", + "plt.plot(data_model.history['question_output_loss'], label='Question Train Loss')\n", + "plt.plot(data_model.history['val_question_output_loss'], label='Question Val Loss')\n", + "plt.plot(data_model.history['answer_output_loss'], label='Answer Train Loss')\n", + "plt.plot(data_model.history['val_answer_output_loss'], label='Answer Val Loss')\n", + "plt.plot(data_model.history['question_type_output_loss'], label='Question Type Train Loss')\n", + "plt.plot(data_model.history['val_question_type_output_loss'], label='Question Type Val Loss')\n", + "plt.title('Model Loss')\n", + "plt.xlabel('Epoch')\n", + "plt.ylabel('Loss')\n", + "plt.legend()\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 119, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 264ms/step\n", + "\n", "=== Evaluation on Test Data ===\n", - "Classification Report for Question 
Type:\n", + "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 389ms/step\n", + "Classification Report for Question Type (Test Set):\n", " precision recall f1-score support\n", "\n", - " 0 0.00 0.00 0.00 8\n", - " 1 0.50 0.83 0.62 6\n", - " 2 0.44 0.80 0.57 5\n", + " 0 0.00 0.00 0.00 4\n", + " 1 0.40 0.67 0.50 3\n", + " 2 0.20 0.33 0.25 3\n", "\n", - " accuracy 0.47 19\n", - " macro avg 0.31 0.54 0.40 19\n", - "weighted avg 0.27 0.47 0.35 19\n", + " accuracy 0.30 10\n", + " macro avg 0.20 0.33 0.25 10\n", + "weighted avg 0.18 0.30 0.23 10\n", "\n", - "Accuracy: 0.47368421052631576\n", - "Precision: 0.27485380116959063\n", - "Recall: 0.47368421052631576\n", - "BLEU score for first test sample (question generation): 0\n" + "Test Accuracy: 0.3\n", + "Test Precision: 0.18000000000000002\n", + "Test Recall: 0.3\n", + "BLEU Score for first test sample (question generation): 0.02664466031983166\n", + "BLEU Score for first test sample (answer generation): 0\n", + "\n", + "=== Evaluation on Validation Data ===\n", + "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 92ms/step\n", + "Classification Report for Question Type (Validation Set):\n", + " precision recall f1-score support\n", + "\n", + " 0 0.00 0.00 0.00 4\n", + " 1 0.50 1.00 0.67 3\n", + " 2 0.25 0.33 0.29 3\n", + "\n", + " accuracy 0.40 10\n", + " macro avg 0.25 0.44 0.32 10\n", + "weighted avg 0.23 0.40 0.29 10\n", + "\n", + "Validation Accuracy: 0.4\n", + "Validation Precision: 0.225\n", + "Validation Recall: 0.4\n", + "BLEU Score for first validation sample (question generation): 0.008991061769415444\n", + "BLEU Score for first validation sample (answer generation): 0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ + "/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and 
being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", + "/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", + "/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", + "/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", "/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n", "/mnt/disc1/code/thesis_quiz_project/lstm-quiz/myenv/lib64/python3.10/site-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. 
def sequence_to_text(sequence, tokenizer):
    """Convert a sequence of token ids back into space-separated text.

    Id 0 (padding) is skipped, and so are ids that have no entry in
    ``tokenizer.index_word``, so the result never contains empty tokens or
    doubled spaces.
    """
    words = (tokenizer.index_word.get(idx) for idx in sequence if idx != 0)
    return " ".join(word for word in words if word)


def first_sample_bleu(reference_sequence, predicted_scores, tokenizer):
    """Return the word-level BLEU score of one predicted sequence.

    ``predicted_scores`` holds one score vector per position; the arg-max of
    each position is taken as the predicted token id.

    ``sentence_bleu`` expects a list of reference *token lists* and a
    hypothesis *token list*.  Passing plain strings makes it iterate over
    characters and report a character n-gram score, so both texts are split
    into words first.
    """
    reference_tokens = sequence_to_text(reference_sequence, tokenizer).split()
    candidate_tokens = sequence_to_text(
        np.argmax(predicted_scores, axis=-1), tokenizer
    ).split()
    return nltk.translate.bleu_score.sentence_bleu(
        [reference_tokens], candidate_tokens
    )


def evaluate_split(split_name, contexts, questions, answers, qtypes):
    """Print question-type metrics and first-sample BLEU scores for one split.

    Args:
        split_name: Label used in the printed headings ("Test", "Validation").
        contexts: Context input passed to ``model.predict``.
        questions: Question input passed to ``model.predict``; its first row
            is also the BLEU reference for question generation.
        answers: Reference answers; the first row is the BLEU reference for
            answer generation.
        qtypes: True question-type labels.

    Returns:
        A dict with the raw predictions, the predicted question-type labels
        and the two BLEU scores.
    """
    print(f"\n=== Evaluation on {split_name} Data ===")
    pred_question, pred_answer, pred_qtype = model.predict([contexts, questions])
    pred_qtype_labels = np.argmax(pred_qtype, axis=1)

    # zero_division=0 yields the same numbers as the default but without an
    # UndefinedMetricWarning for every class the model never predicts.
    print(f"Classification Report for Question Type ({split_name} Set):")
    print(classification_report(qtypes, pred_qtype_labels, zero_division=0))
    print(f"{split_name} Accuracy:", accuracy_score(qtypes, pred_qtype_labels))
    print(
        f"{split_name} Precision:",
        precision_score(
            qtypes, pred_qtype_labels, average="weighted", zero_division=0
        ),
    )
    print(
        f"{split_name} Recall:",
        recall_score(
            qtypes, pred_qtype_labels, average="weighted", zero_division=0
        ),
    )

    sample_label = split_name.lower()
    bleu_question = first_sample_bleu(questions[0], pred_question[0], tokenizer)
    print(
        f"BLEU Score for first {sample_label} sample (question generation):",
        bleu_question,
    )
    bleu_answer = first_sample_bleu(answers[0], pred_answer[0], tokenizer)
    print(
        f"BLEU Score for first {sample_label} sample (answer generation):",
        bleu_answer,
    )

    return {
        "pred_question": pred_question,
        "pred_answer": pred_answer,
        "pred_qtype": pred_qtype,
        "pred_qtype_labels": pred_qtype_labels,
        "bleu_question": bleu_question,
        "bleu_answer": bleu_answer,
    }


test_results = evaluate_split(
    "Test", context_test, question_test, answer_test, qtype_test
)
val_results = evaluate_split(
    "Validation", context_val, question_val, answer_val, qtype_val
)
import pickle
import random
import re
import string

import numpy as np


def _ensure_nltk_resources():
    """Download the NLTK data used for tokenising and stop-word removal.

    Called when a generator is constructed rather than at import time, so
    importing this module performs no network access.
    """
    import nltk

    nltk.download("punkt")
    nltk.download("stopwords")


class QuestionGenerator:
    """Generate a question, an answer and a question type from a paragraph.

    Wraps a trained multi-output Keras model together with the pickled
    tokenizer used to encode its inputs and decode its outputs.
    """

    def __init__(
        self,
        model_path="lstm_multi_output_model.keras",
        tokenizer_path="tokenizer.pkl",
        max_len=100,
    ):
        """
        Initializes the QuestionGenerator by loading the trained model and tokenizer.

        Args:
            model_path: Path of the saved Keras model.
            tokenizer_path: Path of the pickled tokenizer.
            max_len: Length every input sequence is padded/truncated to.
        """
        # Heavy third-party imports are deferred so that importing the module
        # (for example to reuse the text helpers) does not load TensorFlow.
        import tensorflow as tf
        from nltk.corpus import stopwords

        _ensure_nltk_resources()

        # Load trained model
        self.model = tf.keras.models.load_model(model_path)

        # Load tokenizer
        # NOTE(security): unpickling executes arbitrary code; only load
        # tokenizer files that come from a trusted source.
        with open(tokenizer_path, "rb") as handle:
            self.tokenizer = pickle.load(handle)

        self.max_len = max_len

        # Maps the arg-max index of the third model output to its label.
        self.question_type_dict = {
            0: "fill_in_the_blank",
            1: "true_false",
            2: "multiple_choice",
        }

        # Load Indonesian stopwords
        self.stop_words = set(stopwords.words("indonesian"))

        # Custom word normalization dictionary (informal spelling -> standard)
        self.normalization_dict = {
            "yg": "yang",
            "gokil": "kocak",
            "kalo": "kalau",
            "gue": "saya",
            "elo": "kamu",
            "nih": "ini",
            "trs": "terus",
            "tdk": "tidak",
            "gmna": "bagaimana",
            "tp": "tapi",
            "jd": "jadi",
            "aja": "saja",
            "krn": "karena",
            "blm": "belum",
            "dgn": "dengan",
            "skrg": "sekarang",
            "msh": "masih",
            "lg": "lagi",
            "sy": "saya",
            "sm": "sama",
            "bgt": "banget",
            "dr": "dari",
            "kpn": "kapan",
            "hrs": "harus",
            "cm": "cuma",
            "sbnrnya": "sebenarnya",
        }

    def preprocess_text(self, text):
        """
        Preprocesses the input text by:
        - Converting to lowercase
        - Removing punctuation
        - Tokenizing
        - Normalizing words
        - Removing stopwords

        Returns the remaining tokens joined by single spaces.
        """
        from nltk.tokenize import word_tokenize

        text = text.lower()
        text = text.translate(str.maketrans("", "", string.punctuation))
        text = re.sub(r"\s+", " ", text).strip()  # Collapse extra whitespace
        tokens = (
            self.normalization_dict.get(word, word) for word in word_tokenize(text)
        )
        return " ".join(word for word in tokens if word not in self.stop_words)

    def sequence_to_text(self, sequence):
        """
        Converts a tokenized sequence back into readable text.

        Id 0 (padding) and ids without an entry in the tokenizer's
        ``index_word`` mapping are skipped, so the result never contains
        empty tokens or doubled spaces.
        """
        words = (self.tokenizer.index_word.get(idx) for idx in sequence if idx != 0)
        return " ".join(word for word in words if word)

    @staticmethod
    def _build_options(answer, source_text, num_distractors=3):
        """Return the answer plus up to ``num_distractors`` distractors, shuffled.

        Distractors are distinct words of ``source_text`` (compared
        case-insensitively) that are neither the answer nor one of the
        answer's own words, so no option is duplicated and no distractor is
        a fragment of the correct answer.
        """
        answer_key = answer.casefold()
        seen = {answer_key, *answer_key.split()}
        candidates = []
        for word in source_text.split():
            key = word.casefold()
            if key not in seen:
                seen.add(key)
                candidates.append(word)

        distractors = random.sample(
            candidates, k=min(num_distractors, len(candidates))
        )
        options = [answer, *distractors]
        random.shuffle(options)
        return options

    def generate_qa_from_paragraph(self, paragraph):
        """
        Generates a question, answer, and question type from the given paragraph.
        If it's a multiple-choice question, it also returns answer options.

        Returns:
            A dict with the keys ``generated_question``, ``generated_answer``,
            ``question_type`` and ``options`` (``None`` unless the predicted
            type is ``multiple_choice``).
        """
        from tensorflow.keras.preprocessing.sequence import pad_sequences

        # Preprocess the input paragraph
        processed_paragraph = self.preprocess_text(paragraph)

        # Convert text to a padded id sequence
        input_seq = self.tokenizer.texts_to_sequences([processed_paragraph])
        input_seq = pad_sequences(input_seq, maxlen=self.max_len, padding="post")

        # Predict question, answer, and type.
        # NOTE(review): the encoded paragraph is fed to both model inputs;
        # this assumes the second input accepts the same sequence length --
        # confirm against the training setup.
        pred_question, pred_answer, pred_qtype = self.model.predict(
            [input_seq, input_seq]
        )

        # Decode predictions: arg-max per position gives the token ids
        generated_question = self.sequence_to_text(np.argmax(pred_question[0], axis=-1))
        generated_answer = self.sequence_to_text(np.argmax(pred_answer[0], axis=-1))
        question_type_index = int(np.argmax(pred_qtype[0]))
        generated_qtype = self.question_type_dict[question_type_index]

        # Only multiple-choice questions carry answer options
        options = None
        if generated_qtype == "multiple_choice":
            options = self._build_options(generated_answer, processed_paragraph)

        return {
            "generated_question": generated_question,
            "generated_answer": generated_answer,
            "question_type": generated_qtype,
            "options": options,
        }


def main():
    """Run the generator on a sample paragraph and print the result."""
    # Initialize the question generator
    qg = QuestionGenerator()

    # Example input paragraph
    sample_paragraph = (
        "Samudra Pasifik adalah yang terbesar dan terdalam di antara divisi "
        "samudra di Bumi. Samudra ini membentang dari Samudra Arktik di utara "
        "hingga Samudra Selatan di selatan dan berbatasan dengan Asia dan "
        "Australia di barat serta Amerika di timur."
    )

    # Generate question, answer, and type
    generated_result = qg.generate_qa_from_paragraph(sample_paragraph)

    # Print output
    print("Generated Question:", generated_result["generated_question"])
    print("Generated Answer:", generated_result["generated_answer"])
    print("Question Type:", generated_result["question_type"])
    if generated_result["options"]:
        print("Options:", generated_result["options"])


if __name__ == "__main__":
    main()