{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "fb283f23",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total flattened samples: 1425\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from pathlib import Path\n",
    "\n",
    "file_path = \"../dataset/stable_qg_qa_train_dataset.json\"\n",
    "\n",
    "# Decode explicitly as UTF-8 (the JSON interchange encoding) instead of\n",
    "# relying on the platform default, which varies between operating systems.\n",
    "raw_content = Path(file_path).read_text(encoding=\"utf-8\")\n",
    "RAW = json.loads(raw_content)\n",
    "\n",
    "# Flatten the dataset: one sample per (sentence item, question/answer pair).\n",
    "samples = []\n",
    "for idx, item in enumerate(RAW):\n",
    "    if not isinstance(item, dict):\n",
    "        print(\n",
    "            f\"[TypeError] RAW[{idx}]:  Expected dict, got {type(item)} with value: {item}\"\n",
    "        )\n",
    "        continue\n",
    "\n",
    "    try:\n",
    "        for qp in item[\"qas\"]:\n",
    "            question = qp[\"question\"].split()\n",
    "            answer = qp[\"answer\"]\n",
    "            # Branch on the answer type *before* calling any str method, so a\n",
    "            # list-valued answer is tokenized instead of raising AttributeError.\n",
    "            if isinstance(answer, list):\n",
    "                a_toks = [tok.lower() for tok in answer] + [\"<eos>\"]\n",
    "            else:\n",
    "                # A string answer is kept whole as a single token.\n",
    "                a_toks = [answer.lower(), \"<eos>\"]\n",
    "            samples.append(\n",
    "                {\n",
    "                    \"tokens\": [tok.lower() for tok in item[\"tokens\"]],\n",
    "                    \"ner\": item[\"ner\"],\n",
    "                    \"srl\": item[\"srl\"],\n",
    "                    \"q_type\": qp[\"type\"],\n",
    "                    \"q_toks\": [tok.lower() for tok in question] + [\"<eos>\"],\n",
    "                    \"a_toks\": a_toks,\n",
    "                }\n",
    "            )\n",
    "    # item.get() keeps the report itself from raising when \"tokens\" is the\n",
    "    # missing key; item is known to be a dict at this point.\n",
    "    except KeyError as e:\n",
    "        print(f\"[KeyError] RAW[{idx}]: Missing key {e}. TOKENS: {item.get('tokens')}\")\n",
    "    except Exception as e:\n",
    "        print(f\"[Unexpected Error] RAW[{idx}]: {e}. TOKENS: {item.get('tokens')}\")\n",
    "\n",
    "print(\"Total flattened samples:\", len(samples))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "fa4f979d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3, 'dimana': 4, 'kartini': 5, 'lahir': 6, '___': 7, 'pada': 8, 'tanggal': 9, '21': 10, 'mei': 11, '1879': 12, 'kerajaan': 13, 'majapahit': 14, 'berdiri': 15, 'tahun': 16, '1300': 17, 'berapa': 18, 'kemerdekaan': 19, 'indonesia': 20, 'diproklamasikan': 21, 'siapa': 22, 'yang': 23, 'memproklamasikan': 24, 'lama': 25, 'bumi': 26, 'mengelilingi': 27, 'matahari': 28, 'presiden': 29, 'pertama': 30, 'planet': 31, 'apa': 32, 'paling': 33, 'dekat': 34, 'dengan': 35, 'venus': 36, 'memiliki': 37, 'suhu': 38, 'permukaan': 39, 'tinggi': 40, 'dikenal': 41, 'sebagai': 42, 'merah': 43, 'terbesar': 44, 'di': 45, 'tata': 46, 'surya': 47, 'terkenal': 48, 'cincin': 49, 'indah': 50, 'berwarna': 51, 'biru': 52, 'jauh': 53, 'dari': 54, 'apakah': 55, 'pluto': 56, 'masih': 57, 'dianggap': 58, 'soekarno': 59, 'membacakan': 60, 'teks': 61, 'proklamasi': 62, 'kapan': 63, 'sebutkan': 64, 'dibacakan': 65, 'andi': 66, 'melakukan': 67, 'pergi': 68, 'ke': 69, 'jakarta': 70, 'siti': 71, 'berangkat': 72, 'bandung': 73, 'budi': 74, 'pindah': 75, 'bali': 76, 'lina': 77, 'dan': 78, 'wati': 79, 'liburan': 80, 'medan': 81, 'agus': 82, 'wijaya': 83, 'melanjutkan': 84, 'surabaya': 85, 'nurul': 86, 'yogyakarta': 87, 'dedi': 88, 'makassar': 89, 'maya': 90, 'lestari': 91, 'roni': 92, 'tiara': 93, 'setiawan': 94, 'santoso': 95, 'saputra': 96, 'aktivitas': 97, 'dilakukan': 98, 'oleh': 99, 'maharani': 100, 'firmansyah': 101, 'gunung': 102, 'tertinggi': 103, 'dunia?': 104, 'sungai': 105, 'terpanjang': 106, 'bangunan': 107, 'dibangun': 108, 'sekitar': 109, '2560': 110, 'sm?': 111, 'benua': 112, 'keajaiban': 113, 'dunia': 114, 'berada': 115, 'italia?': 116, 'negara': 117, 'mana': 118, 'terletak': 119, 'colosseum?': 120, 'nama': 121, 'letaknya': 122, 'taj': 123, 'mahal?': 124, 'india?': 125, 'petra?': 126, 'ada': 127, 'yordania?': 128, 'china?': 129, 'meksiko?': 130, 'chichen': 131, 'itza?': 132, 'patung': 133, 'yesus': 134, 'penebus?': 135, 'brasil?': 136, 
'peru?': 137, 'inggris?': 138, 'stonehenge?': 139, 'menara': 140, 'pisa?': 141, 'angkot': 142, 'wat?': 143, 'kamodja?': 144, 'ketinggian': 145, 'everest': 146, '9000': 147, 'meter': 148, 'merdeka': 149, 'merumuskan': 150, 'teori': 151, 'relativitas': 152, 'albert': 153, 'einstein': 154, '1910': 155, 'organ': 156, 'memompa': 157, 'darah': 158, 'seluruh': 159, 'tubuh': 160, 'fungsi': 161, 'jantung': 162, 'manusia': 163, 'ibukota': 164, 'jepang': 165, 'kota': 166, 'menjadi': 167, 'air': 168, 'mendidih': 169, '90': 170, 'derajat': 171, 'celsius': 172, 'penemu': 173, 'bola': 174, 'lampu': 175, 'thomas': 176, 'alva': 177, 'edison': 178, 'menemukan': 179, 'urutan': 180, 'adalah': 181, 'keempat': 182, 'pelukis': 183, 'mona': 184, 'lisa': 185, 'lukisan': 186, 'dibuat': 187, 'jarak': 188, 'dalam': 189, 'satu': 190, 'cahaya': 191, 'setara': 192, '10': 193, 'triliun': 194, 'kilometer': 195, 'pemimpin': 196, 'gerakan': 197, 'india': 198, 'mahatma': 199, 'gandhi': 200, 'memimpin': 201, 'pakistan': 202, 'nasa': 203, 'didirikan': 204, 'bagian': 205, 'terluar': 206, 'mata': 207, 'retina': 208, 'napoleon': 209, 'bonaparte': 210, 'dikalahkan': 211, 'pertempuran': 212, 'waterloo': 213, 'terjadi': 214, 'komodo': 215, 'ditemukan': 216, 'dapat': 217, 'australia': 218, 'pemenang': 219, 'nobel': 220, 'bidang': 221, 'fisika': 222, 'kimia': 223, 'marie': 224, 'curie': 225, 'memenangkan': 226, 'machu': 227, 'picchu': 228, 'situs': 229, 'peradaban': 230, 'meksiko': 231, 'dihasilkan': 232, 'fotosintesis': 233, 'selain': 234, 'glukosa': 235, 'bahan': 236, 'saja': 237, 'dibutuhkan': 238, 'seniman': 239, 'memotong': 240, 'telinganya': 241, 'sendiri': 242, 'vincent': 243, 'van': 244, 'gogh': 245, '1890': 246, 'bagaimana': 247, 'bentuk': 248, 'molekul': 249, 'dna': 250, 'struktur': 251, 'berbentuk': 252, 'penisilin': 253, 'secara': 254, 'sengaja': 255, 'setelah': 256, 'penelitian': 257, 'bertahun-tahun': 258, 'buah': 259, 'mengandung': 260, 'banyak': 261, 'vitamin': 262, 'c': 263, 'terkandung': 264, 
'jeruk': 265, 'mengembangkan': 266, 'sistem': 267, 'arus': 268, 'listrik': 269, 'bolak-balik': 270, 'panjang': 271, 'nil': 272, 'hewan': 273, 'pernah': 274, 'hidup': 275, 'gajah': 276, 'memproduksi': 277, 'insulin': 278, 'mengemukakan': 279, 'evolusi': 280, 'dikemukakan': 281, 'isaac': 282, 'newton': 283, 'membangun': 284, 'mahal': 285, 'untuk': 286, 'asia': 287, 'luas': 288, '44.58': 289, 'juta': 290, 'km²': 291, 'afrika': 292, 'utara': 293, 'laut': 294, 'mediterania': 295, 'manakah': 296, 'terkecil': 297, 'kedua': 298, 'eropa': 299, 'lebih': 300, 'besar': 301, 'samudera': 302, 'berbatasan': 303, 'amerika': 304, 'timur': 305, 'hindia': 306, 'barat': 307, 'selatan': 308, 'terdiri': 309, 'hutan': 310, 'amazon': 311, 'belahan': 312, 'kutub': 313, 'antartika': 314, 'hampir': 315, 'seluruhnya': 316, 'tertutup': 317, 'es': 318, 'populasi': 319, '4.7': 320, 'miliar': 321, 'penduduk': 322, 'jumlah': 323, 'kilimanjaro': 324, '5,895': 325, 'gurun': 326, 'ketiga': 327, 'sahara': 328, 'merupakan': 329, 'peringkat': 330, '6,650': 331, 'km': 332, 'pegunungan': 333, 'alpen': 334, 'membentang': 335, '8': 336, 'danau': 337, 'superior': 338, 'tawar': 339, 'menghadiri': 340, 'turnamen': 341, 'catur': 342, 'ali': 343, '15': 344, 'juli': 345, '2023': 346, 'rapat': 347, 'organisasi': 348, 'nina': 349, '25': 350, 'desember': 351, 'farhan': 352, 'workshop': 353, 'fotografi': 354, 'pameran': 355, 'teknologi': 356, '5': 357, 'malang': 358, 'iqbal': 359, 'perlombaan': 360, 'renang': 361, 'padang': 362, 'konser': 363, 'musik': 364, 'agustus': 365, 'fajar': 366, 'dina': 367, '1': 368, 'januari': 369, '2024': 370, 'festival': 371, 'kuliner': 372, 'rian': 373, 'bazar': 374, 'amal': 375, 'tari': 376, 'seminar': 377, 'pendidikan': 378, 'kompetisi': 379, 'robotik': 380, 'rudi': 381, 'semarang': 382, 'putri': 383, 'hana': 384, 'raka': 385, 'dewi': 386, 'pahlawan': 387, 'jawa': 388, 'pelajar': 389, 'kaya': 390, 'akan': 391, 'budaya': 392, 'pusat': 393, 'pemerintahan': 394, 'kembang': 395, 'fashion': 
396, 'sejuk': 397, 'destinasi': 398, 'wisata': 399, 'alam': 400, 'pulau': 401, 'dewata': 402, 'masakan': 403, 'rendang': 404, 'mendunia': 405, 'pelabuhan': 406, 'utama': 407, 'sumatra': 408, 'khas': 409, 'kemerdekaannya': 410, 'sumpah': 411, 'pemuda': 412, 'diikrarkan': 413, 'isi': 414, 'proses': 415, 'berlangsung': 416, 'diubah': 417, 'pembelahan': 418, 'mitosis': 419, 'sel': 420, 'menghasilkan': 421, 'gamet': 422, 'benda': 423, 'ditarik': 424, 'magnet': 425, 'dilaksanakan': 426, 'ginjal': 427, 'paru-paru': 428, 'ekskresi': 429, 'hati': 430, 'itu': 431, 'ramah': 432, 'lingkungan': 433, 'dampak': 434, 'negatif': 435, 'penerapan': 436, 'bioteknologi': 437, 'pembentukan': 438, 'urine': 439, 'peredaran': 440, 'berperan': 441, 'penting': 442, 'masa': 443, 'reformasi': 444, 'dimulai': 445, 'peristiwa': 446, 'politik': 447, 'pencernaan': 448, 'penyerapan': 449, 'zat': 450, 'makanan': 451, 'pernapasan': 452, 'contoh': 453, 'alat': 454, 'optik': 455, 'kehidupan': 456, 'sehari-hari': 457, 'kacamata': 458, 'pembuluh': 459, 'xilem': 460, 'tumbuhan': 461, 'floem': 462, 'pemanfaatan': 463, 'getaran': 464, 'gelombang': 465, 'hasil': 466, 'gangguan': 467, 'penyebab': 468, 'penyakit': 469, 'asma': 470, 'mohammad': 471, 'hatta': 472, 'bung': 473, 'tomo': 474, 'i': 475, 'gusti': 476, 'ngurah': 477, 'rai': 478, 'gugur': 479, 'cut': 480, 'nyak': 481, 'dien': 482, 'wafat': 483, 'teuku': 484, 'umar': 485, 'wahidin': 486, 'sudirohusodo': 487, 'sultan': 488, 'mahmud': 489, 'badaruddin': 490, 'ii': 491, 'kh': 492, 'ahmad': 493, 'dahlan': 494, 'hasyim': 495, \"asy'ari\": 496, 'ageng': 497, 'tirtayasa': 498, 'hasanuddin': 499, 'pattimura': 500, 'pangeran': 501, 'diponegoro': 502, 'sentot': 503, 'alibasya': 504, 'prawirodirjo': 505, 'cipto': 506, 'mangunkusumo': 507, 'ernest': 508, 'douwes': 509, 'dekker': 510, 'dr.': 511, 'mas': 512, 'mansur': 513, 'sutan': 514, 'sjahrir': 515, 'abdul': 516, 'muis': 517, 'otto': 518, 'iskandardinata': 519, 'abikusno': 520, 'tjokrosujoso': 521, 'wahid': 522, 
'bpupki': 523, 'ketua': 524, 'ppki': 525, 'pendiri': 526, 'nahdlatul': 527, 'ulama': 528, 'jong': 529, 'islamieten': 530, 'bond': 531, 'muhammadiyah': 532, 'muda': 533, 'perhimpunan': 534, 'partai': 535, 'nasional': 536, 'utomo': 537, 'tokoh': 538, 'sarekat': 539, 'islam': 540, 'voc': 541, 'dibubarkan': 542, 'komunis': 543, 'fonds': 544, 'mardika': 545, 'kutai': 546, 'peranan': 547, 'mahakam': 548, 'bagi': 549, 'perekonomian': 550, 'sumber': 551, 'sejarah': 552, 'raja': 553, 'memerintah': 554, 'saat': 555, 'yupa': 556, 'dikeluarkan': 557, 'ditulis': 558, 'huruf': 559, 'bahasa': 560, 'prasasti': 561, 'diperkirakan': 562, 'kakek': 563, 'mulawarman': 564, 'dinasti': 565, 'lembu': 566, 'dikorbankan': 567, 'zaman': 568, 'keemasan': 569, 'melalui': 570, 'ekonomi': 571, 'berkembang': 572, 'pesat': 573, 'jalur': 574, 'perdagangan': 575, 'internasional': 576, 'hingga': 577, 'memberi': 578, 'sedekah': 579, 'sapi': 580, 'diberikan': 581, 'kepada': 582, 'mengalami': 583, 'digunakan': 584, 'tarumanegara': 585, 'abad': 586, 'memerintahkan': 587, 'penggalian': 588, 'candrabaga': 589, 'ekor': 590, 'dipersembahkan': 591, 'ditonjolkan': 592, 'cidanghiang': 593, 'dilambangkan': 594, 'gambar': 595, 'telapak': 596, 'kaki': 597, 'kebon': 598, 'kopi': 599, 'saturnus': 600, 'mars': 601, 'bintang': 602, 'dihuni': 603, 'makhluk': 604, 'sekarang': 605, 'dikategorikan': 606, 'satelit': 607, 'dimiliki': 608, 'jupiter': 609, 'bernama': 610, 'alami': 611, 'berputar': 612, 'miring': 613, 'terhadap': 614, 'porosnya': 615, 'disebut': 616, 'panas': 617, 'tersusun': 618, 'dilalui': 619, 'tidak': 620, 'siang': 621, 'malam': 622, 'karena': 623, 'habibie': 624, 'megawati': 625, 'yudhoyono': 626, 'widodo': 627, 'sri': 628, 'mulyani': 629, 'shihab': 630, 'agnez': 631, 'mo': 632, 'zain': 633, 'dian': 634, 'lahir?': 635, '___?': 636, 'kalingga': 637, 'berasal': 638, 'sebuah': 639, 'arti': 640, 'kudungga?': 641, 'kudungga': 642, 'berubah': 643, 'buleleng': 644, 'terletak?': 645, 'sengit': 646, 'antara': 647, 
'para': 648, 'pejuang': 649, 'dibantu': 650, '___.': 651, 'melawan': 652, 'berganti': 653, 'putra-putri': 654, 'raja?': 655, 'letak': 656, 'kompleks': 657, 'trowulan': 658, 'inilah': 659, 'majapahit?': 660, 'samudra': 661, 'pasai': 662, 'pantai': 663, 'malaka': 664, 'malaka?': 665, 'demak': 666, 'demak?': 667, 'aceh?': 668, 'aceh': 669, 'banten?': 670, 'makasar': 671, 'sangat': 672, 'strategis': 673, 'pelayaran': 674, 'mataram': 675, 'mulai': 676, 'pemindahan': 677, 'hayam': 678, 'wuruk': 679, 'perkembangan': 680, 'memperkuat': 681, 'kedudukan': 682, 'sriwijaya?': 683, 'sriwijaya': 684, '1293?': 685, '650?': 686, '1267?': 687, '1400?': 688, '1500?': 689, '1514?': 690, 'banten': 691, '1527?': 692, '1605?': 693, '1586?': 694, '400?': 695, 'tarumanagara': 696, '358?': 697, '580?': 698, 'sunda': 699, '669?': 700, 'galuh': 701, '612?': 702, 'kediri': 703, '1042?': 704, 'singhasari': 705, '1222?': 706, 'kartanegara': 707, '1296?': 708, 'pajajaran': 709, '1482?': 710, 'blambangan': 711, '1630?': 712, 'warmadewa': 713, '915?': 714, 'mengucapkan': 715, 'palapa?': 716, 'palapa': 717, 'diucapkan?': 718, 'membawa': 719, 'kuno': 720, 'mencapai': 721, 'puncak': 722, 'kejayaan?': 723, 'balitung': 724, 'iskandar': 725, 'memerintah?': 726, 'canggal': 727, 'berangka?': 728, 'awal': 729, 'berdirinya?': 730, 'singhasari?': 731, 'ken': 732, 'arok': 733, 'mengalahkan': 734, 'kediri?': 735, 'berkuasa': 736, 'ke-___': 737, 'ke-___?': 738, 'bukti': 739, 'keberadaan': 740, 'diketahui': 741, 'apa?': 742, 'wilayah': 743, 'ditemukan?': 744, 'penyerangan': 745, 'batavia?': 746, 'tujuan': 747, 'tertua': 748, 'indonesia?': 749, 'kutai?': 750, 'ke-9': 751, 'm?': 752, 'balaputradewa': 753, 'menjalin': 754, 'hubungan': 755, 'diplomatik': 756, 'kejayaan': 757, 'siapa?': 758, 'patih': 759, 'berapa?': 760, 'membagi': 761, 'kahuripan': 762, 'dua?': 763, 'dibagi': 764, 'janggala': 765, '1350': 766, '1389': 767, 'dipimpin': 768, 'tersebut': 769, 'maritim?': 770, 'kedukan': 771, 'bukit': 772, 'tepi': 773, 
'talang': 774, 'tuo': 775, 'sebelah': 776, 'daerah': 777, 'respirasi': 778, 'pertukaran': 779, 'gas': 780, 'gas?': 781, 'atas': 782, '___,': 783, 'faring,': 784, 'laring,': 785, 'trakea,': 786, 'bronkus,': 787, 'paru-paru?': 788, 'hidung,': 789, 'kurang': 790, 'daratan?': 791, 'lautan?': 792, 'perubahan': 793, 'muka': 794, 'dipengaruhi': 795, 'tenaga': 796, 'endogen?': 797, 'eksogen': 798, 'tanah': 799, 'komponen': 800, 'keberlangsungan': 801, 'bumi?': 802, 'keturunan': 803, 'putra': 804, 'memengaruhi': 805, 'kondisi': 806, 'ekosistem': 807, 'suatu': 808, 'tempat?': 809, 'iklim': 810, 'cuaca': 811, 'atau': 812, 'tertinggi?': 813, 'dataran': 814, 'gunung-gunung': 815, 'datar': 816, '400': 817, 'mdpal?': 818, 'dibandingkan': 819, 'sekitarnya?': 820, 'oetomo': 821, 'berdiri?': 822, 'pelopor': 823, 'kebangsaan': 824, 'kepanjangan': 825, 'bpupki?': 826, 'dibentuk?': 827, 'dibentuk': 828, 'selanjutnya?': 829, 'singkatan': 830, 'panitia': 831, 'persiapan': 832, 'sdi?': 833, 'dagang': 834, 'islam?': 835, 'bu?': 836, 'didirikan?': 837, 'si?': 838, 'sdi': 839, 'diganti': 840, '1912?': 841, 'bercorak': 842, '1912': 843, 'keagamaan': 844, 'bersifat': 845, 'modern': 846, '18': 847, 'november': 848, 'muhammadiyah?': 849, '31': 850, '1926?': 851, 'nu': 852, 'pgri?': 853, 'himpunan': 854, 'pengusaha': 855, 'revolusioner': 856, 'sebelum': 857, 'pni': 858, 'trikoro': 859, 'darmo': 860, 'kepanduan': 861, 'hizbul': 862, 'wathan': 863, 'perempuan': 864, 'puteri': 865, '4': 866, '1927': 867, '1927?': 868, 'musi': 869, 'ini': 870, 'berangka': 871, '605': 872, 'saka': 873, '683': 874, 'm': 875, 'mengadakan': 876, 'perjalanan': 877, 'suci': 878, 'siddhayatra': 879, 'dapunta': 880, 'hyang': 881, 'menggunakan': 882, 'kereta': 883, 'ia': 884, 'minangatamwan': 885, '20.000': 886, 'personel': 887, 'lepas': 888, 'sosok': 889, 'dipandang': 890, 'hindu-buddha': 891, 'mahakam,': 892, 'kalimantan': 893, 'cukup': 894, 'beberapa': 895, 'anak': 896, 'pertemuan': 897, 'sungainya': 898, 'dahulu': 899, 
'muarakaman': 900, 'dilayari': 901, 'sampai': 902, 'masuk': 903, 'muara': 904, 'posisi': 905, 'menguntungkan': 906, 'meningkatkan': 907, 'masyarakat': 908, 'kesejahteraan': 909, 'berupa': 910, 'batu': 911, 'bertulis.': 912, 'juga': 913, 'tugu': 914, 'peringatan': 915, 'upacara': 916, 'kematian.': 917, 'candrabhaga.': 918, 'hanya': 919, 'kuno.': 920, 'ahli': 921, 'berpendapat': 922, 'bahwa': 923, 'ke-5': 924, 'melihat': 925, 'm.': 926, 'disebutkan?': 927, 'hal': 928, 'menarik': 929, 'mulawarman,': 930, 'yaitu': 931, 'kudungga.': 932, 'pengaruh': 933, 'menyebabkan': 934, 'kerajaan.': 935, 'putera': 936, 'cucu': 937, 'aswawarman.': 938, 'wangsakerta?': 939, 'wangsakerta': 940, 'mulawarman.': 941, 'informasi': 942, 'tentang': 943, 'silsilah': 944, 'mempunyai': 945, 'aswawarman': 946, 'dikatakan': 947, 'seperti': 948, 'ansuman': 949, 'dewa': 950, 'matahari.': 951, 'empat': 952, 'anak.': 953, 'kutai.': 954, 'pemeluk': 955, 'agama': 956, 'memeluk': 957, 'hindu-siwa.': 958, 'tempat': 959, 'dinamakan': 960, 'wapraksetra.': 961, 'kaum': 962, 'rakyat.': 963, 'dermawan.': 964, 'kurban': 965, 'emas': 966, '10.000': 967, 'lembu.': 968, 'brahmana': 969, 'mendirikan': 970, 'candi': 971, 'kurban.': 972, '____': 973, 'keemasan.': 974, 'pun': 975, '____.': 976, 'perkembangan.': 977, 'sungai.': 978, 'pertanian.': 979, 'mereka': 980, 'perdagangan.': 981, 'bahkan': 982, 'sudah': 983, 'dagang.': 984, 'luar.': 985, 'melewati': 986, 'makassar.': 987, 'selat': 988, 'sunda.': 989, 'pedagang': 990, 'pelayarannya': 991, 'singgah': 992, 'terlebih': 993, 'pelayarannya,': 994, 'sapi?': 995, 'disedekahkan?': 996, 'sang': 997, 'brahmana.': 998, 'ditempatkan': 999, 'berkaitan': 1000, 'pengendalian': 1001, 'banjir': 1002, 'pengairan': 1003, 'tarumanegara.': 1004, 'menggali': 1005, 'candrabaga?': 1006, 'digali': 1007, 'purnawarman?': 1008, 'purnawarman': 1009, 'mengendalikan': 1010, 'banjir.': 1011, 'dipersembahkan?': 1012, 'mempersembahkan': 1013, '1.000': 1014, 'berkat': 1015, 'makmur?': 1016, 
'makmur': 1017, 'tanpa': 1018, 'bantuan': 1019, 'timur,': 1020, 'muncul': 1021, 'timur.': 1022, 'barat.': 1023, 'cisadane': 1024, 'citarum': 1025, 'musi.': 1026, 'kata': 1027, 'taruma': 1028, 'mungkin': 1029, 'tarum': 1030, 'artinya': 1031, 'emas.': 1032, 'dipakai': 1033, 'aliran': 1034, 'citarum.': 1035, 'berdasarkan': 1036, 'tugu,': 1037, 'purbacaraka': 1038, 'memperkirakan': 1039, 'bogor.': 1040, 'telah': 1041, 'prasasti-prasasti': 1042, 'ditemukan.': 1043, 'terkait': 1044, 'lima': 1045, 'berhuruf': 1046, 'berbahasa': 1047, 'sanskerta': 1048, 'pallawa': 1049, 'melayu.': 1050, 'mengeluarkan': 1051, 'inskripsi': 1052, 'ini?': 1053, 'kampung': 1054, 'tumbuh,': 1055, 'desa': 1056, 'tanjung': 1057, 'priok,': 1058, 'jakarta.': 1059, 'dituliskan': 1060, 'baris': 1061, 'tulisan?': 1062, 'aksara': 1063, 'ditulis?': 1064, 'latin': 1065, 'kali': 1066, 'candrabhaga?': 1067, 'candrabhaga': 1068, 'mengalirkannya': 1069, 'laut.': 1070, 'gomati?': 1071, 'gomati': 1072, 'mengalir': 1073, 'tengah-tengah': 1074, 'kediaman': 1075, 'pandeta': 1076, 'nenekda.': 1077, 'hari': 1078, 'pekerjaan': 1079, 'selesai?': 1080, 'galian': 1081, 'dibuat?': 1082, 'selesai': 1083, 'hari.': 1084, 'panjangnya': 1085, 'km.': 1086, 'selamatan': 1087, 'selamatan?': 1088, 'disertai': 1089, 'persembahan': 1090, 'sapi.': 1091, 'milik': 1092, 'menyatakan': 1093, 'bekas': 1094, 'dua': 1095, 'wisnu.': 1096, 'membaca': 1097, 'b': 1098, 'cri': 1099, 'tji': 1100, 'aroe': 1101, 'eun': 1102, 'waca?': 1103, 'bacaan': 1104, 'purnavarmma': 1105, 'padam?': 1106, 'h': 1107, 'kern': 1108, 'waca.': 1109, 'muara,': 1110, 'ciaruetun': 1111, 'hilir,': 1112, 'cibungbulang,': 1113, 'dipahat': 1114, 'prasastinya?': 1115, 'prasastinya': 1116, 'dipahatkan': 1117, 'baris.': 1118, 'sepasang': 1119, 'digambarkan': 1120, 'penguasa': 1121, 'taruma.': 1122, 'cianten': 1123, 'ciaruteun': 1124, 'jambu': 1125, '(pasir': 1126, 'koleangkak)': 1127, 'inskripsinya': 1128, 'tulisan.': 1129, 'tiada': 1130, 'taranya?': 1131, 'sekali': 1132, 
'waktu?': 1133, 'baju': 1134, 'zirah': 1135, 'ditembus': 1136, 'senjata': 1137, 'musuh.': 1138, 'senantiasa': 1139, 'berhasil': 1140, 'menggempur': 1141, 'musuh?': 1142, 'sepatu': 1143, 'duri': 1144, 'daging': 1145, 'musuh-musuhnya.': 1146, 'tengah.': 1147, 'baris?': 1148, 'ditandai': 1149, 'menyebut': 1150, 'panji': 1151, 'sekalian': 1152, 'raja-raja.': 1153, 'siapakah': 1154, 'kalingga?': 1155, 'ratu': 1156, 'sima': 1157, 'kalingga.': 1158, 'digambarkan?': 1159, 'wanita': 1160, 'taat': 1161, 'peraturan.': 1162, 'kalinga': 1163, 'selatan.': 1164, 'menurut': 1165, 'berita': 1166, 'lokasi': 1167, 'poli?': 1168, 'poli': 1169, 'bali.': 1170, 'prasasti?': 1171, 'berbatasan?': 1172, 'diperkirakan?': 1173, 'muria.': 1174, 'mengenai': 1175, 't’ang': 1176, 'termasuk': 1177, 'lain': 1178, 'tuk': 1179, 'merapi.': 1180, 'cina': 1181, 'membantu': 1182, 'kita': 1183, 'mengetahui': 1184, 'kira-kira': 1185, 'berkembang?': 1186, '674': 1187, 'dikenal?': 1188, 'lemah': 1189, 'bijaksana.': 1190, 'hukum': 1191, 'dilaksanakan?': 1192, 'tegas.': 1193, 'patuh': 1194, 'peraturan?': 1195, 'rakyat': 1196, 'meletakkan': 1197, 'pundi-pundi': 1198, 'tengah': 1199, 'jalan?': 1200, 'kolam.': 1201, 'mengusik?': 1202, 'orang': 1203, 'mengusik': 1204, 'itu.': 1205, 'menyentuh': 1206, 'kantong': 1207, 'kakinya?': 1208, 'anggota': 1209, 'keluarga': 1210, 'istana': 1211, 'tangannya.': 1212, 'hukuman': 1213, 'harus': 1214, 'diterima': 1215, 'itu?': 1216, 'dinilai': 1217, 'bersalah': 1218, 'diberi': 1219, 'potong': 1220, 'tangan.': 1221, 'akhirnya': 1222, 'dijatuhkan': 1223, 'usul': 1224, 'menteri?': 1225, 'menteri': 1226, 'membuat': 1227, 'ditunjukkan': 1228, 'kisah': 1229, 'sima?': 1230, 'menunjukkan': 1231, 'kelemahan': 1232, 'sima.': 1233, 'dibedakan': 1234, 'membedakan': 1235, 'kerabatnya': 1236, 'sendiri.': 1237, 'dianut': 1238, 'umumnya?': 1239, 'menganut': 1240, 'hindu': 1241, 'umumnya.': 1242, 'pesat?': 1243, 'buddha': 1244, 'pesat.': 1245, 'pendeta': 1246, 'datang': 1247, 'hwi-ning': 1248, 
'tinggal': 1249, 'selama': 1250, 'tahun.': 1251, 'kitab': 1252, 'diterjemahkan': 1253, 'hwi-ning?': 1254, 'diterjemahkan?': 1255, 'menerjemahkan': 1256, 'pun.': 1257, 'kepemimpinan': 1258, 'adil': 1259, 'menjadikan': 1260, 'kacau': 1261, 'aman': 1262, 'bawah': 1263, 'adil.': 1264, 'pencaharian': 1265, 'umumnya': 1266, 'berdagang.': 1267, 'bertani?': 1268, 'dialami': 1269, 'kemunduran': 1270, 'kemungkinan': 1271, 'akibat': 1272, 'serangan': 1273, 'sriwijaya.': 1274, 'menyingkir': 1275, 'timur?': 1276, 'kijen': 1277, 'sejak': 1278, 'permulaan': 1279, 'tarikh': 1280, 'ramai?': 1281, 'kepulauan': 1282, 'ramai': 1283, 'masehi.': 1284, 'perdagangan?': 1285, 'dikunjungi': 1286, 'pedagang.': 1287, 'kemudian?': 1288, 'pusat-pusat': 1289, 'kecil': 1290, 'ke-7.': 1291, 'kejayaannya?': 1292, 'melayu': 1293, 'kejayaannya.': 1294, 'sempat': 1295, 'berada?': 1296, 'pusatnya': 1297, 'palembang.': 1298, 'bukanlah': 1299, 'pallawa.': 1300, 'sanskerta.': 1301, '(683': 1302, 'm).': 1303, 'siddhayatra?': 1304, 'kuda.': 1305, 'berangkat?': 1306, 'personel.': 1307, '606': 1308, '(684': 1309, 'disebutkan': 1310, 'menyebutkan': 1311, 'pembangunan': 1312, 'sriksetra.': 1313, 'taman': 1314, 'tersebut?': 1315, 'telaga': 1316, 'isinya': 1317, 'terutama': 1318, 'taman.': 1319, 'kapur': 1320, 'permintaan': 1321, 'meminta': 1322, 'menghukum': 1323, 'setia.': 1324, 'karang': 1325, 'berahi': 1326, '608': 1327, '(686': 1328, 'sama': 1329, 'berbeda': 1330, 'kapur.': 1331, 'ligor': 1332, 'nalanda': 1333, 'bangka.': 1334, 'penting?': 1335, 'bukan': 1336, 'penting.': 1337, 'berita?': 1338, 'i-tsing': 1339, 'mendorong': 1340, 'dijalankan': 1341, 'musi?': 1342, 'kegiatan': 1343, 'keadaan': 1344, 'tepat?': 1345, 'menghubungkan': 1346, 'mana?': 1347, 'berbakat': 1348, 'geografis': 1349, 'palembang': 1350, 'runtuh': 1351, 'vietnam?': 1352, 'funan': 1353, 'kamboja.': 1354, 'kesempatan': 1355, 'cepat?': 1356, 'ke-8': 1357, 'gelar': 1358, 'awalnya': 1359, 'maharaja': 1360, 'sebutan': 1361, 'tertulis': 1362, 
'tuo.': 1363, 'ke-7?': 1364, 'ke-7,': 1365, 'tulang-bawang': 1366, 'lampung.': 1367, 'kedah': 1368, 'semenanjung': 1369, 'usaha': 1370, 'melaporkan': 1371, 'penaklukan': 1372, 'kedah?': 1373, '682-685': 1374, 'bangka': 1375, 'pedalaman': 1376, 'jawa.': 1377, 'menguasai': 1378, '686': 1379, 'penguasaan': 1380, 'didasarkan': 1381, 'berusaha': 1382, 'menaklukkan': 1383, 'bhumi': 1384, 'java': 1385, 'setia': 1386, 'kepadanya.': 1387, 'dimaksud': 1388, 'mencakup': 1389, 'jambi': 1390, 'genting': 1391, 'kra': 1392, 'kra?': 1393, 'berlabuh': 1394, 'dahulu?': 1395, 'berlayar?': 1396, '775': 1397, 'berasal?': 1398, 'diduga': 1399, 'serangan?': 1400, 'ingin': 1401, 'dikuasai': 1402, 'mengapa': 1403, 'terus': 1404, 'perluasan': 1405, 'sehingga': 1406, 'besar.': 1407, 'pangkalan': 1408, 'dibangun?': 1409, 'waktu': 1410, 'darmasetra': 1411, 'balaputradewa.': 1412, 'dicapai': 1413, 'pemerintahannya?': 1414, 'ibu': 1415, 'balaputradewa?': 1416, 'diterangkan?': 1417, 'erat?': 1418, 'benggala': 1419, 'diberikan?': 1420, 'arsitektur': 1421, 'asrama': 1422, 'kesamaan?': 1423, 'diperhatikan': 1424, 'generasi': 1425, 'mudanya?': 1426, '990': 1427, 'sudamaniwarmadewa': 1428, 'darmawangsa': 1429, 'menggagalkan': 1430, 'digagalkan': 1431, 'tentara': 1432, 'menggantikan': 1433, 'sudamaniwarmadewa?': 1434, 'marawijayottunggawarman': 1435, 'raja.': 1436, 'membina': 1437, 'marawijayottunggawarman?': 1438, 'mempertahankan': 1439, 'kebesarannya': 1440, 'kekuasaan': 1441, 'salah': 1442, 'luar': 1443, 'sumatra.': 1444, 'perairan': 1445, 'menguasai?': 1446, 'pertama?': 1447, 'muhammad': 1448, 'yamin.': 1449, 'mulanya?': 1450, 'mulanya': 1451, 'bertani.': 1452, 'lambat': 1453, 'berkembang.': 1454, 'pokok?': 1455, 'kemudian': 1456, 'pokok.': 1457, 'mendukung': 1458, 'maupun': 1459, 'internasional?': 1460, 'memberikan': 1461, 'kemakmuran': 1462, 'membayar': 1463, 'pajak?': 1464, 'kapal-kapal': 1465, 'bongkar': 1466, 'muat': 1467, 'perlu': 1468, 'pajak.': 1469, 'diekspor': 1470, 'barang': 1471, 
'impor': 1472, 'sriwijaya!': 1473, 'mengimpor': 1474, 'kayu': 1475, 'manis': 1476, 'kemenyan.': 1477, 'maritim.': 1478, 'maritim': 1479, 'mengandalkan?': 1480, 'mengandalkan': 1481, 'perekonomiannya': 1482, 'kedudukannya?': 1483, 'membentuk': 1484, 'angkatan': 1485, 'darat': 1486, 'kedudukannya.': 1487, 'mampu': 1488, 'armadanya?': 1489, 'armadanya': 1490, 'mengawasi': 1491, 'nusantara.': 1492, 'jaminan': 1493, 'keamanan': 1494, 'beragama': 1495, 'semarak?': 1496, 'semarak.': 1497, 'tenggara?': 1498, 'tenggara.': 1499, 'menceritakan': 1500, 'i-tsing?': 1501, 'terkenal?': 1502, 'seorang': 1503, 'sakyakirti.': 1504, 'asing': 1505, 'belajar': 1506, 'buddha?': 1507, 'tibet': 1508, '1011-1023?': 1509, 'kebudayaan': 1510, 'kampar?': 1511, 'siguntang?': 1512, 'wihara': 1513, 'nagipattana?': 1514, 'erat.': 1515, 'terdapat': 1516, 'bagaimana?': 1517, 'lumpur?': 1518, 'akibatnya': 1519, 'baik': 1520, 'pengawasan': 1521, 'sulit?': 1522, 'segi': 1523, 'mendapat': 1524, 'menyerang': 1525, '1017': 1526, 'ditahan': 1527, 'pihak': 1528, 'colamandala?': 1529, 'ekspedisi': 1530, 'kertanegara?': 1531, '1377?': 1532, 'diakhiri': 1533, 'baru': 1534, 'pertengahan': 1535, 'ke-8.': 1536, 'belum': 1537, 'dipastikan': 1538, 'kuno?': 1539, 'pasti': 1540, 'diketahui.': 1541, 'poh': 1542, 'pitu.': 1543, 'kejelasan': 1544, 'pitu': 1545, 'sekarang?': 1546, 'diterangkan': 1547, 'utara.': 1548, 'ada.': 1549, 'kedu': 1550, 'prambanan.': 1551, 'sanjaya': 1552, 'sanna': 1553, 'sanjaya.': 1554, 'sanna?': 1555, 'sanjaya?': 1556, 'sanaha': 1557, 'saudara': 1558, 'sanna.': 1559, 'sojomerto': 1560, 'syailendra': 1561, 'buddha.': 1562, 'menurunkan': 1563, 'jawa?': 1564, 'tampil': 1565, '717-780': 1566, 'kekuasaannya': 1567, 'dilanjutkan?': 1568, 'raja-raja': 1569, 'bawahan': 1570, 'suci?': 1571, 'lingga': 1572, 'lambang': 1573, 'sikap': 1574, 'pujangga': 1575, 'hormat?': 1576, 'rakyat?': 1577, 'didukung': 1578, 'panangkaran?': 1579, 'kalasan': 1580, 'lengkap': 1581, 'panangkaran': 1582, 'kalasan?': 1583, 
'dipindahkan?': 1584, 'penakluk': 1585, 'gagah': 1586, 'berani?': 1587, 'berani.': 1588, 'bertambah': 1589, 'luas?': 1590, 'luas.': 1591, 'syailendra?': 1592, 'permata': 1593, 'syailendra.': 1594, 'mahayana': 1595, 'diperintahkan': 1596, 'suci!': 1597, 'masalah': 1598, 'timbul': 1599, 'perpecahan': 1600, 'terjadi?': 1601, 'kelompok': 1602, 'utara?': 1603, 'selatan?': 1604, 'hindu?': 1605, 'utara!': 1606, 'wayang': 1607, 'dipakai!': 1608, 'meninggalkan': 1609, 'candi-candi': 1610, 'ditinggalkan!': 1611, 'borobudur?': 1612, 'borobudur': 1613, 'lama?': 1614, 'bersatu': 1615, 'kembali?': 1616, 'ditandai?': 1617, 'perkawinan': 1618, 'periode': 1619, 'sering': 1620, 'membicarakan': 1621, 'lokal?': 1622, 'globalisasi': 1623, 'lokal.': 1624, 'bangsa': 1625, 'nyata': 1626, 'dikukuhkan': 1627, 'unesco': 1628, '1991?': 1629, 'warisan': 1630, '1991.': 1631, 'tepatnya?': 1632, 'garis': 1633, 'lurus?': 1634, 'sesuai?': 1635, 'pedoman': 1636, 'agama?': 1637, 'menjelaskan': 1638, 'aturan-aturan': 1639, 'desain?': 1640, 'karya': 1641, 'unik.': 1642, 'susunan': 1643, 'fondasi': 1644, 'berdenah': 1645, 'jari-jari': 1646, 'titik': 1647, 'fondasi?': 1648, 'stupa': 1649, 'perwujudan': 1650, 'kamadatu?': 1651, 'arupadatu?': 1652, 'tingkat': 1653, 'arah': 1654, 'relief': 1655, 'dinding': 1656, 'barobudur?': 1657, 'hubungan?': 1658, 'mula-mula': 1659, 'ritual': 1660, 'dilakukan?': 1661, 'berciri': 1662, 'mahayana?': 1663, 'dilihat': 1664, 'arca': 1665, 'tergambar': 1666, 'menyebarkan': 1667, 'ajaran': 1668, 'ke-10?': 1669, 'disebarkan?': 1670, 'mewujudkan': 1671, 'pengetahuan': 1672, 'candi?': 1673, 'digunakan?': 1674, 'lainnya': 1675}\n"
     ]
    }
   ],
   "source": [
    "from itertools import chain\n",
    "\n",
    "\n",
    "def build_vocab(seq_iter, reserved=[\"<pad>\", \"<unk>\", \"<sos>\", \"<eos>\"]):\n",
    "    vocab = {tok: idx for idx, tok in enumerate(reserved)}\n",
    "    for tok in chain.from_iterable(seq_iter):\n",
    "        if tok not in vocab:\n",
    "            vocab[tok] = len(vocab)\n",
    "    return vocab\n",
    "\n",
    "\n",
    "# One vocabulary per stream. Sentence, question and answer tokens use\n",
    "# build_vocab's default reserved set; the NER / SRL tag vocabularies reserve\n",
    "# only the padding and unknown entries.\n",
    "vocab_tok = build_vocab([sample[\"tokens\"] for sample in samples])\n",
    "vocab_q = build_vocab([sample[\"q_toks\"] for sample in samples])\n",
    "vocab_a = build_vocab([sample[\"a_toks\"] for sample in samples])\n",
    "\n",
    "vocab_ner = build_vocab(\n",
    "    [sample[\"ner\"] for sample in samples], reserved=[\"<pad>\", \"<unk>\"]\n",
    ")\n",
    "vocab_srl = build_vocab(\n",
    "    [sample[\"srl\"] for sample in samples], reserved=[\"<pad>\", \"<unk>\"]\n",
    ")\n",
    "\n",
    "# Fixed ids for question types (presumably the values stored under \"q_type\";\n",
    "# confirm against the dataset).\n",
    "vocab_typ = {\n",
    "    \"isian\": 0,\n",
    "    \"opsi\": 1,\n",
    "    \"true_false\": 2,\n",
    "}\n",
    "\n",
    "print(vocab_q)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "d1a5b324",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025-05-21 19:23:30.462039: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
      "2025-05-21 19:23:30.468869: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
      "2025-05-21 19:23:30.530246: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
      "2025-05-21 19:23:30.576136: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
      "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
      "E0000 00:00:1747830210.626662   68495 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
      "E0000 00:00:1747830210.640357   68495 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
      "W0000 00:00:1747830210.732105   68495 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
      "W0000 00:00:1747830210.732132   68495 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
      "W0000 00:00:1747830210.732134   68495 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
      "W0000 00:00:1747830210.732136   68495 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
      "2025-05-21 19:23:30.744139: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "\n",
    "\n",
    "def encode(seq, vmap):  # token → id\n",
    "    return [vmap.get(t, vmap[\"<unk>\"]) for t in seq]\n",
    "\n",
    "\n",
    "# Longest sequence of each kind; every array below is post-padded to these widths.\n",
    "MAX_SENT = max(len(s[\"tokens\"]) for s in samples)\n",
    "MAX_Q = max(len(s[\"q_toks\"]) for s in samples)\n",
    "MAX_A = max(len(s[\"a_toks\"]) for s in samples)\n",
    "\n",
    "# Encoder inputs: token / NER / SRL ids, all padded to MAX_SENT.\n",
    "# NOTE(review): assumes each sample's ner/srl lists are as long as its tokens - confirm;\n",
    "# a longer tag list would be truncated by pad_sequences.\n",
    "X_tok = pad_sequences(\n",
    "    [encode(s[\"tokens\"], vocab_tok) for s in samples], maxlen=MAX_SENT, padding=\"post\"\n",
    ")\n",
    "X_ner = pad_sequences(\n",
    "    [encode(s[\"ner\"], vocab_ner) for s in samples], maxlen=MAX_SENT, padding=\"post\"\n",
    ")\n",
    "X_srl = pad_sequences(\n",
    "    [encode(s[\"srl\"], vocab_srl) for s in samples], maxlen=MAX_SENT, padding=\"post\"\n",
    ")\n",
    "\n",
    "# Decoder input = <sos> + target[:-1]\n",
    "dec_q_in = pad_sequences(\n",
    "    [[vocab_q[\"<sos>\"], *encode(s[\"q_toks\"][:-1], vocab_q)] for s in samples],\n",
    "    maxlen=MAX_Q,\n",
    "    padding=\"post\",\n",
    ")\n",
    "# Decoder target = the full question, including its trailing <eos>.\n",
    "dec_q_out = pad_sequences(\n",
    "    [encode(s[\"q_toks\"], vocab_q) for s in samples], maxlen=MAX_Q, padding=\"post\"\n",
    ")\n",
    "\n",
    "# Same input/target construction for the answer decoder.\n",
    "dec_a_in = pad_sequences(\n",
    "    [[vocab_a[\"<sos>\"], *encode(s[\"a_toks\"][:-1], vocab_a)] for s in samples],\n",
    "    maxlen=MAX_A,\n",
    "    padding=\"post\",\n",
    ")\n",
    "dec_a_out = pad_sequences(\n",
    "    [encode(s[\"a_toks\"], vocab_a) for s in samples], maxlen=MAX_A, padding=\"post\"\n",
    ")\n",
    "\n",
    "# Question-type class id per sample; raises KeyError on a type missing from vocab_typ.\n",
    "y_type = np.array([vocab_typ[s[\"q_type\"]] for s in samples])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "ff5bd85f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025-05-21 19:23:32.948920: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">Model: \"functional\"</span>\n",
       "</pre>\n"
      ],
      "text/plain": [
       "\u001b[1mModel: \"functional\"\u001b[0m\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
       "┃<span style=\"font-weight: bold\"> Layer (type)        </span>┃<span style=\"font-weight: bold\"> Output Shape      </span>┃<span style=\"font-weight: bold\">    Param # </span>┃<span style=\"font-weight: bold\"> Connected to      </span>┃\n",
       "┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
       "│ tok_in (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">InputLayer</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">61</span>)        │          <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ -                 │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ ner_in (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">InputLayer</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">61</span>)        │          <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ -                 │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ srl_in (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">InputLayer</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">61</span>)        │          <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ -                 │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ embedding_tok       │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">61</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">32</span>)    │     <span style=\"color: #00af00; text-decoration-color: #00af00\">65,248</span> │ tok_in[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>]      │\n",
       "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>)         │                   │            │                   │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ embedding_ner       │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">61</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">16</span>)    │        <span style=\"color: #00af00; text-decoration-color: #00af00\">560</span> │ ner_in[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>]      │\n",
       "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>)         │                   │            │                   │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ embedding_srl       │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">61</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">16</span>)    │        <span style=\"color: #00af00; text-decoration-color: #00af00\">496</span> │ srl_in[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>]      │\n",
       "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>)         │                   │            │                   │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ dec_q_in            │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">15</span>)        │          <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ -                 │\n",
       "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">InputLayer</span>)        │                   │            │                   │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ concatenate         │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">61</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">64</span>)    │          <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ embedding_tok[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>]… │\n",
       "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Concatenate</span>)       │                   │            │ embedding_ner[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>]… │\n",
       "│                     │                   │            │ embedding_srl[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>]… │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ dec_a_in            │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">2</span>)         │          <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ -                 │\n",
       "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">InputLayer</span>)        │                   │            │                   │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ embedding_q_decoder │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">15</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">32</span>)    │     <span style=\"color: #00af00; text-decoration-color: #00af00\">53,632</span> │ dec_q_in[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>]    │\n",
       "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>)         │                   │            │                   │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ encoder_lstm (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">LSTM</span>) │ [(<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">64</span>),      │     <span style=\"color: #00af00; text-decoration-color: #00af00\">33,024</span> │ concatenate[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
       "│                     │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">64</span>),       │            │                   │\n",
       "│                     │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">64</span>)]       │            │                   │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ embedding_a_decoder │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">2</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">32</span>)     │     <span style=\"color: #00af00; text-decoration-color: #00af00\">21,632</span> │ dec_a_in[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>]    │\n",
       "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>)         │                   │            │                   │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ lstm_q_decoder      │ [(<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">15</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">64</span>),  │     <span style=\"color: #00af00; text-decoration-color: #00af00\">24,832</span> │ embedding_q_deco… │\n",
       "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">LSTM</span>)              │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">64</span>),       │            │ encoder_lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">…</span> │\n",
       "│                     │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">64</span>)]       │            │ encoder_lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">…</span> │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ not_equal           │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">15</span>)        │          <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ dec_q_in[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>]    │\n",
       "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">NotEqual</span>)          │                   │            │                   │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ lstm_a_decoder      │ [(<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">2</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">64</span>),   │     <span style=\"color: #00af00; text-decoration-color: #00af00\">24,832</span> │ embedding_a_deco… │\n",
       "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">LSTM</span>)              │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">64</span>),       │            │ encoder_lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">…</span> │\n",
       "│                     │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">64</span>)]       │            │ encoder_lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">…</span> │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ not_equal_1         │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">2</span>)         │          <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │ dec_a_in[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>]    │\n",
       "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">NotEqual</span>)          │                   │            │                   │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ q_output            │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">15</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">1676</span>)  │    <span style=\"color: #00af00; text-decoration-color: #00af00\">108,940</span> │ lstm_q_decoder[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>… │\n",
       "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">TimeDistributed</span>)   │                   │            │ not_equal[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>]   │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ a_output            │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">2</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">676</span>)    │     <span style=\"color: #00af00; text-decoration-color: #00af00\">43,940</span> │ lstm_a_decoder[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>… │\n",
       "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">TimeDistributed</span>)   │                   │            │ not_equal_1[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>] │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ type_output (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">3</span>)         │        <span style=\"color: #00af00; text-decoration-color: #00af00\">195</span> │ encoder_lstm[<span style=\"color: #00af00; text-decoration-color: #00af00\">0</span>][<span style=\"color: #00af00; text-decoration-color: #00af00\">…</span> │\n",
       "└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n",
       "</pre>\n"
      ],
      "text/plain": [
       "┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
       "┃\u001b[1m \u001b[0m\u001b[1mLayer (type)       \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape     \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m   Param #\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mConnected to     \u001b[0m\u001b[1m \u001b[0m┃\n",
       "┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
       "│ tok_in (\u001b[38;5;33mInputLayer\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m61\u001b[0m)        │          \u001b[38;5;34m0\u001b[0m │ -                 │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ ner_in (\u001b[38;5;33mInputLayer\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m61\u001b[0m)        │          \u001b[38;5;34m0\u001b[0m │ -                 │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ srl_in (\u001b[38;5;33mInputLayer\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m61\u001b[0m)        │          \u001b[38;5;34m0\u001b[0m │ -                 │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ embedding_tok       │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m61\u001b[0m, \u001b[38;5;34m32\u001b[0m)    │     \u001b[38;5;34m65,248\u001b[0m │ tok_in[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m]      │\n",
       "│ (\u001b[38;5;33mEmbedding\u001b[0m)         │                   │            │                   │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ embedding_ner       │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m61\u001b[0m, \u001b[38;5;34m16\u001b[0m)    │        \u001b[38;5;34m560\u001b[0m │ ner_in[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m]      │\n",
       "│ (\u001b[38;5;33mEmbedding\u001b[0m)         │                   │            │                   │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ embedding_srl       │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m61\u001b[0m, \u001b[38;5;34m16\u001b[0m)    │        \u001b[38;5;34m496\u001b[0m │ srl_in[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m]      │\n",
       "│ (\u001b[38;5;33mEmbedding\u001b[0m)         │                   │            │                   │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ dec_q_in            │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m15\u001b[0m)        │          \u001b[38;5;34m0\u001b[0m │ -                 │\n",
       "│ (\u001b[38;5;33mInputLayer\u001b[0m)        │                   │            │                   │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ concatenate         │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m61\u001b[0m, \u001b[38;5;34m64\u001b[0m)    │          \u001b[38;5;34m0\u001b[0m │ embedding_tok[\u001b[38;5;34m0\u001b[0m]… │\n",
       "│ (\u001b[38;5;33mConcatenate\u001b[0m)       │                   │            │ embedding_ner[\u001b[38;5;34m0\u001b[0m]… │\n",
       "│                     │                   │            │ embedding_srl[\u001b[38;5;34m0\u001b[0m]… │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ dec_a_in            │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m2\u001b[0m)         │          \u001b[38;5;34m0\u001b[0m │ -                 │\n",
       "│ (\u001b[38;5;33mInputLayer\u001b[0m)        │                   │            │                   │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ embedding_q_decoder │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m15\u001b[0m, \u001b[38;5;34m32\u001b[0m)    │     \u001b[38;5;34m53,632\u001b[0m │ dec_q_in[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m]    │\n",
       "│ (\u001b[38;5;33mEmbedding\u001b[0m)         │                   │            │                   │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ encoder_lstm (\u001b[38;5;33mLSTM\u001b[0m) │ [(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m),      │     \u001b[38;5;34m33,024\u001b[0m │ concatenate[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
       "│                     │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m),       │            │                   │\n",
       "│                     │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m)]       │            │                   │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ embedding_a_decoder │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m2\u001b[0m, \u001b[38;5;34m32\u001b[0m)     │     \u001b[38;5;34m21,632\u001b[0m │ dec_a_in[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m]    │\n",
       "│ (\u001b[38;5;33mEmbedding\u001b[0m)         │                   │            │                   │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ lstm_q_decoder      │ [(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m15\u001b[0m, \u001b[38;5;34m64\u001b[0m),  │     \u001b[38;5;34m24,832\u001b[0m │ embedding_q_deco… │\n",
       "│ (\u001b[38;5;33mLSTM\u001b[0m)              │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m),       │            │ encoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n",
       "│                     │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m)]       │            │ encoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ not_equal           │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m15\u001b[0m)        │          \u001b[38;5;34m0\u001b[0m │ dec_q_in[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m]    │\n",
       "│ (\u001b[38;5;33mNotEqual\u001b[0m)          │                   │            │                   │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ lstm_a_decoder      │ [(\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m2\u001b[0m, \u001b[38;5;34m64\u001b[0m),   │     \u001b[38;5;34m24,832\u001b[0m │ embedding_a_deco… │\n",
       "│ (\u001b[38;5;33mLSTM\u001b[0m)              │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m),       │            │ encoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n",
       "│                     │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m)]       │            │ encoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ not_equal_1         │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m2\u001b[0m)         │          \u001b[38;5;34m0\u001b[0m │ dec_a_in[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m]    │\n",
       "│ (\u001b[38;5;33mNotEqual\u001b[0m)          │                   │            │                   │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ q_output            │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m15\u001b[0m, \u001b[38;5;34m1676\u001b[0m)  │    \u001b[38;5;34m108,940\u001b[0m │ lstm_q_decoder[\u001b[38;5;34m0\u001b[0m… │\n",
       "│ (\u001b[38;5;33mTimeDistributed\u001b[0m)   │                   │            │ not_equal[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m]   │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ a_output            │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m2\u001b[0m, \u001b[38;5;34m676\u001b[0m)    │     \u001b[38;5;34m43,940\u001b[0m │ lstm_a_decoder[\u001b[38;5;34m0\u001b[0m… │\n",
       "│ (\u001b[38;5;33mTimeDistributed\u001b[0m)   │                   │            │ not_equal_1[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
       "├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
       "│ type_output (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m3\u001b[0m)         │        \u001b[38;5;34m195\u001b[0m │ encoder_lstm[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n",
       "└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Total params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">377,331</span> (1.44 MB)\n",
       "</pre>\n"
      ],
      "text/plain": [
       "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m377,331\u001b[0m (1.44 MB)\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">377,331</span> (1.44 MB)\n",
       "</pre>\n"
      ],
      "text/plain": [
       "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m377,331\u001b[0m (1.44 MB)\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Non-trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> (0.00 B)\n",
       "</pre>\n"
      ],
      "text/plain": [
       "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "from tensorflow.keras.layers import (\n",
    "    Input,\n",
    "    Embedding,\n",
    "    LSTM,\n",
    "    Concatenate,\n",
    "    Dense,\n",
    "    TimeDistributed,\n",
    ")\n",
    "from tensorflow.keras.models import Model\n",
    "\n",
    "# ---- constants ---------------------------------------------------\n",
    "d_tok = 32  # token embedding dim\n",
    "d_tag = 16  # NER / SRL embedding dim\n",
    "units = 64  # hidden size shared by the encoder LSTM and both decoder LSTMs\n",
    "\n",
    "# ---- encoder -----------------------------------------------------\n",
    "# Three parallel id inputs of length MAX_SENT: tokens, NER tags, SRL tags.\n",
    "inp_tok = Input((MAX_SENT,), name=\"tok_in\")\n",
    "inp_ner = Input((MAX_SENT,), name=\"ner_in\")\n",
    "inp_srl = Input((MAX_SENT,), name=\"srl_in\")\n",
    "\n",
    "# All three embeddings use mask_zero=False so the streams are treated the same way\n",
    "# when concatenated. NOTE: with masking off, the encoder LSTM does NOT skip the zero\n",
    "# padding; padded positions are processed as ordinary timesteps.\n",
    "emb_tok = Embedding(len(vocab_tok), d_tok, mask_zero=False, name=\"embedding_tok\")(\n",
    "    inp_tok\n",
    ")\n",
    "emb_ner = Embedding(len(vocab_ner), d_tag, mask_zero=False, name=\"embedding_ner\")(\n",
    "    inp_ner\n",
    ")\n",
    "emb_srl = Embedding(len(vocab_srl), d_tag, mask_zero=False, name=\"embedding_srl\")(\n",
    "    inp_srl\n",
    ")\n",
    "\n",
    "# Per-timestep features are the concatenation of the three embeddings\n",
    "# (d_tok + 2 * d_tag channels).\n",
    "enc_concat = Concatenate()([emb_tok, emb_ner, emb_srl])\n",
    "# return_state=True also yields the final hidden and cell states, which seed both decoders.\n",
    "enc_out, state_h, state_c = LSTM(units, return_state=True, name=\"encoder_lstm\")(\n",
    "    enc_concat\n",
    ")\n",
    "\n",
    "\n",
    "# ---------- DECODER : Question ----------\n",
    "# Reads question ids of length MAX_Q; mask_zero=True makes id 0 act as masked padding.\n",
    "dec_q_inp = Input(shape=(MAX_Q,), name=\"dec_q_in\")\n",
    "dec_emb_q = Embedding(len(vocab_q), d_tok, mask_zero=True, name=\"embedding_q_decoder\")(\n",
    "    dec_q_inp\n",
    ")\n",
    "# The decoder LSTM starts from the encoder's final (h, c); its own states are discarded.\n",
    "dec_q, _, _ = LSTM(\n",
    "    units, return_state=True, return_sequences=True, name=\"lstm_q_decoder\"\n",
    ")(dec_emb_q, initial_state=[state_h, state_c])\n",
    "# Softmax over the question vocabulary at every timestep.\n",
    "q_out = TimeDistributed(\n",
    "    Dense(len(vocab_q), activation=\"softmax\", name=\"dense_q_output\"), name=\"q_output\"\n",
    ")(dec_q)\n",
    "\n",
    "# ---------- DECODER : Answer ----------\n",
    "# Same structure as the question decoder, with its own embedding, LSTM and output layer,\n",
    "# and seeded from the same encoder states.\n",
    "dec_a_inp = Input(shape=(MAX_A,), name=\"dec_a_in\")\n",
    "dec_emb_a = Embedding(len(vocab_a), d_tok, mask_zero=True, name=\"embedding_a_decoder\")(\n",
    "    dec_a_inp\n",
    ")\n",
    "dec_a, _, _ = LSTM(\n",
    "    units, return_state=True, return_sequences=True, name=\"lstm_a_decoder\"\n",
    ")(dec_emb_a, initial_state=[state_h, state_c])\n",
    "a_out = TimeDistributed(\n",
    "    Dense(len(vocab_a), activation=\"softmax\", name=\"dense_a_output\"), name=\"a_output\"\n",
    ")(dec_a)\n",
    "\n",
    "# ---------- CLASSIFIER : Question Type ----------\n",
    "# Softmax over the len(vocab_typ) question types, computed from the encoder output.\n",
    "type_out = Dense(len(vocab_typ), activation=\"softmax\", name=\"type_output\")(enc_out)\n",
    "\n",
    "# Five inputs (three encoder streams + two decoder inputs) -> three outputs.\n",
    "model = Model(\n",
    "    inputs=[inp_tok, inp_ner, inp_srl, dec_q_inp, dec_a_inp],\n",
    "    outputs=[q_out, a_out, type_out],\n",
    ")\n",
    "\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "fece1ae9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/30\n",
      "\u001b[1m21/21\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 60ms/step - a_output_loss: 6.3756 - a_output_sparse_categorical_accuracy: 0.2884 - loss: 14.0442 - q_output_loss: 7.3841 - q_output_sparse_categorical_accuracy: 0.0147 - type_output_accuracy: 0.6155 - type_output_loss: 0.9312 - val_a_output_loss: 5.5567 - val_a_output_sparse_categorical_accuracy: 0.5000 - val_loss: 12.7159 - val_q_output_loss: 7.0687 - val_q_output_sparse_categorical_accuracy: 0.0466 - val_type_output_accuracy: 0.8531 - val_type_output_loss: 0.3711\n",
      "Epoch 2/30\n",
      "\u001b[1m21/21\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 37ms/step - a_output_loss: 5.0151 - a_output_sparse_categorical_accuracy: 0.5000 - loss: 11.9122 - q_output_loss: 6.6580 - q_output_sparse_categorical_accuracy: 0.0642 - type_output_accuracy: 0.7221 - type_output_loss: 0.7869 - val_a_output_loss: 4.4353 - val_a_output_sparse_categorical_accuracy: 0.5000 - val_loss: 10.9637 - val_q_output_loss: 6.5242 - val_q_output_sparse_categorical_accuracy: 0.0746 - val_type_output_accuracy: 0.8531 - val_type_output_loss: 0.4294\n",
      "Epoch 3/30\n",
      "\u001b[1m21/21\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 39ms/step - a_output_loss: 3.5820 - a_output_sparse_categorical_accuracy: 0.5000 - loss: 9.5962 - q_output_loss: 5.7778 - q_output_sparse_categorical_accuracy: 0.0729 - type_output_accuracy: 0.6948 - type_output_loss: 0.7931 - val_a_output_loss: 4.1995 - val_a_output_sparse_categorical_accuracy: 0.5000 - val_loss: 10.5530 - val_q_output_loss: 6.4072 - val_q_output_sparse_categorical_accuracy: 0.0611 - val_type_output_accuracy: 0.8531 - val_type_output_loss: 0.4629\n",
      "Epoch 4/30\n",
      "\u001b[1m21/21\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 36ms/step - a_output_loss: 3.2052 - a_output_sparse_categorical_accuracy: 0.5000 - loss: 8.8977 - q_output_loss: 5.4687 - q_output_sparse_categorical_accuracy: 0.0686 - type_output_accuracy: 0.7130 - type_output_loss: 0.7553 - val_a_output_loss: 4.1926 - val_a_output_sparse_categorical_accuracy: 0.5000 - val_loss: 10.5574 - val_q_output_loss: 6.4193 - val_q_output_sparse_categorical_accuracy: 0.0667 - val_type_output_accuracy: 0.8531 - val_type_output_loss: 0.5336\n",
      "Epoch 5/30\n",
      "\u001b[1m21/21\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 38ms/step - a_output_loss: 3.0916 - a_output_sparse_categorical_accuracy: 0.5000 - loss: 8.7251 - q_output_loss: 5.4010 - q_output_sparse_categorical_accuracy: 0.0668 - type_output_accuracy: 0.7096 - type_output_loss: 0.7808 - val_a_output_loss: 4.1580 - val_a_output_sparse_categorical_accuracy: 0.5000 - val_loss: 10.5627 - val_q_output_loss: 6.4720 - val_q_output_sparse_categorical_accuracy: 0.0727 - val_type_output_accuracy: 0.8531 - val_type_output_loss: 0.5060\n",
      "Epoch 6/30\n",
      "\u001b[1m21/21\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 39ms/step - a_output_loss: 2.9724 - a_output_sparse_categorical_accuracy: 0.5000 - loss: 8.5485 - q_output_loss: 5.3620 - q_output_sparse_categorical_accuracy: 0.0727 - type_output_accuracy: 0.7280 - type_output_loss: 0.7255 - val_a_output_loss: 4.1215 - val_a_output_sparse_categorical_accuracy: 0.5000 - val_loss: 10.5385 - val_q_output_loss: 6.4986 - val_q_output_sparse_categorical_accuracy: 0.0755 - val_type_output_accuracy: 0.8531 - val_type_output_loss: 0.4595\n",
      "Epoch 7/30\n",
      "\u001b[1m21/21\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 42ms/step - a_output_loss: 2.9038 - a_output_sparse_categorical_accuracy: 0.5003 - loss: 8.4598 - q_output_loss: 5.3291 - q_output_sparse_categorical_accuracy: 0.0749 - type_output_accuracy: 0.7129 - type_output_loss: 0.7501 - val_a_output_loss: 4.0746 - val_a_output_sparse_categorical_accuracy: 0.5490 - val_loss: 10.5159 - val_q_output_loss: 6.5121 - val_q_output_sparse_categorical_accuracy: 0.0755 - val_type_output_accuracy: 0.8531 - val_type_output_loss: 0.4771\n",
      "Epoch 8/30\n",
      "\u001b[1m21/21\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 36ms/step - a_output_loss: 2.8508 - a_output_sparse_categorical_accuracy: 0.5607 - loss: 8.3884 - q_output_loss: 5.3055 - q_output_sparse_categorical_accuracy: 0.0746 - type_output_accuracy: 0.6980 - type_output_loss: 0.7779 - val_a_output_loss: 4.0781 - val_a_output_sparse_categorical_accuracy: 0.5490 - val_loss: 10.4898 - val_q_output_loss: 6.4878 - val_q_output_sparse_categorical_accuracy: 0.0755 - val_type_output_accuracy: 0.8531 - val_type_output_loss: 0.4642\n",
      "Epoch 9/30\n",
      "\u001b[1m21/21\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 36ms/step - a_output_loss: 2.8178 - a_output_sparse_categorical_accuracy: 0.5604 - loss: 8.3014 - q_output_loss: 5.2522 - q_output_sparse_categorical_accuracy: 0.0745 - type_output_accuracy: 0.7005 - type_output_loss: 0.7710 - val_a_output_loss: 4.0977 - val_a_output_sparse_categorical_accuracy: 0.5490 - val_loss: 10.4146 - val_q_output_loss: 6.3866 - val_q_output_sparse_categorical_accuracy: 0.0760 - val_type_output_accuracy: 0.8531 - val_type_output_loss: 0.4621\n",
      "Epoch 10/30\n",
      "\u001b[1m21/21\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 36ms/step - a_output_loss: 2.7852 - a_output_sparse_categorical_accuracy: 0.5605 - loss: 8.2036 - q_output_loss: 5.1914 - q_output_sparse_categorical_accuracy: 0.0788 - type_output_accuracy: 0.7009 - type_output_loss: 0.7644 - val_a_output_loss: 4.1185 - val_a_output_sparse_categorical_accuracy: 0.5490 - val_loss: 10.3418 - val_q_output_loss: 6.2919 - val_q_output_sparse_categorical_accuracy: 0.0788 - val_type_output_accuracy: 0.8531 - val_type_output_loss: 0.4529\n",
      "Epoch 11/30\n",
      "\u001b[1m21/21\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 36ms/step - a_output_loss: 2.7896 - a_output_sparse_categorical_accuracy: 0.5600 - loss: 8.1496 - q_output_loss: 5.1346 - q_output_sparse_categorical_accuracy: 0.0787 - type_output_accuracy: 0.7092 - type_output_loss: 0.7535 - val_a_output_loss: 4.1562 - val_a_output_sparse_categorical_accuracy: 0.5490 - val_loss: 10.3775 - val_q_output_loss: 6.2831 - val_q_output_sparse_categorical_accuracy: 0.0746 - val_type_output_accuracy: 0.8531 - val_type_output_loss: 0.4971\n",
      "Epoch 12/30\n",
      "\u001b[1m21/21\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 37ms/step - a_output_loss: 2.7965 - a_output_sparse_categorical_accuracy: 0.5548 - loss: 8.0476 - q_output_loss: 5.0257 - q_output_sparse_categorical_accuracy: 0.0787 - type_output_accuracy: 0.7174 - type_output_loss: 0.7501 - val_a_output_loss: 4.1949 - val_a_output_sparse_categorical_accuracy: 0.5490 - val_loss: 10.3898 - val_q_output_loss: 6.2698 - val_q_output_sparse_categorical_accuracy: 0.0890 - val_type_output_accuracy: 0.8531 - val_type_output_loss: 0.4722\n",
      "Epoch 13/30\n",
      "\u001b[1m21/21\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 39ms/step - a_output_loss: 2.7583 - a_output_sparse_categorical_accuracy: 0.5531 - loss: 7.9879 - q_output_loss: 5.0045 - q_output_sparse_categorical_accuracy: 0.0809 - type_output_accuracy: 0.7135 - type_output_loss: 0.7392 - val_a_output_loss: 4.2232 - val_a_output_sparse_categorical_accuracy: 0.5245 - val_loss: 10.4226 - val_q_output_loss: 6.2835 - val_q_output_sparse_categorical_accuracy: 0.0876 - val_type_output_accuracy: 0.8531 - val_type_output_loss: 0.4389\n",
      "Epoch 14/30\n",
      "\u001b[1m21/21\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 40ms/step - a_output_loss: 2.7988 - a_output_sparse_categorical_accuracy: 0.5453 - loss: 7.9487 - q_output_loss: 4.9337 - q_output_sparse_categorical_accuracy: 0.0834 - type_output_accuracy: 0.7349 - type_output_loss: 0.7180 - val_a_output_loss: 4.2406 - val_a_output_sparse_categorical_accuracy: 0.5490 - val_loss: 10.4701 - val_q_output_loss: 6.2967 - val_q_output_sparse_categorical_accuracy: 0.0858 - val_type_output_accuracy: 0.8531 - val_type_output_loss: 0.5188\n"
     ]
    }
   ],
   "source": [
    "losses = {\n",
    "    \"q_output\": \"sparse_categorical_crossentropy\",\n",
    "    \"a_output\": \"sparse_categorical_crossentropy\",\n",
    "    \"type_output\": \"sparse_categorical_crossentropy\",\n",
    "}\n",
    "loss_weights = {\"q_output\": 1.0, \"a_output\": 1.0, \"type_output\": 0.3}\n",
    "\n",
    "model.compile(\n",
    "    optimizer=\"adam\",\n",
    "    loss=losses,\n",
    "    loss_weights=loss_weights,\n",
    "    metrics={\n",
    "        \"q_output\": \"sparse_categorical_accuracy\",\n",
    "        \"a_output\": \"sparse_categorical_accuracy\",\n",
    "        \"type_output\": \"accuracy\",\n",
    "    },\n",
    ")\n",
    "\n",
    "history = model.fit(\n",
    "    [X_tok, X_ner, X_srl, dec_q_in, dec_a_in],\n",
    "    [dec_q_out, dec_a_out, y_type],\n",
    "    validation_split=0.1,\n",
    "    epochs=30,\n",
    "    batch_size=64,\n",
    "    callbacks=[tf.keras.callbacks.EarlyStopping(patience=4, restore_best_weights=True)],\n",
    "    verbose=1,\n",
    ")\n",
    "\n",
    "model.save(\"full_seq2seq.keras\")\n",
    "\n",
    "import json\n",
    "import pickle\n",
    "\n",
    "# def save_vocab(vocab, path):\n",
    "#     with open(path, \"w\", encoding=\"utf-8\") as f:\n",
    "#         json.dump(vocab, f, ensure_ascii=False, indent=2)\n",
    "\n",
    "# # Simpan semua vocab\n",
    "# save_vocab(vocab_tok, \"vocab_tok.json\")\n",
    "# save_vocab(vocab_ner, \"vocab_ner.json\")\n",
    "# save_vocab(vocab_srl, \"vocab_srl.json\")\n",
    "# save_vocab(vocab_q,   \"vocab_q.json\")\n",
    "# save_vocab(vocab_a,   \"vocab_a.json\")\n",
    "# save_vocab(vocab_typ, \"vocab_typ.json\")\n",
    "\n",
    "\n",
    "def save_vocab_pkl(vocab, path):\n",
    "    with open(path, \"wb\") as f:\n",
    "        pickle.dump(vocab, f)\n",
    "\n",
    "\n",
    "# Simpan semua vocab\n",
    "save_vocab_pkl(vocab_tok, \"vocab_tok.pkl\")\n",
    "save_vocab_pkl(vocab_ner, \"vocab_ner.pkl\")\n",
    "save_vocab_pkl(vocab_srl, \"vocab_srl.pkl\")\n",
    "save_vocab_pkl(vocab_q, \"vocab_q.pkl\")\n",
    "save_vocab_pkl(vocab_a, \"vocab_a.pkl\")\n",
    "save_vocab_pkl(vocab_typ, \"vocab_typ.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "3355c0c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "import numpy as np\n",
    "import pickle\n",
    "from tensorflow.keras.models import load_model, Model\n",
    "from tensorflow.keras.layers import Input, Concatenate\n",
    "\n",
    "# === Load Model Utama ===\n",
    "model = load_model(\"full_seq2seq.keras\")\n",
    "\n",
    "\n",
    "# === Load Vocabulary dari .pkl ===\n",
    "def load_vocab(path):\n",
    "    with open(path, \"rb\") as f:\n",
    "        return pickle.load(f)\n",
    "\n",
    "\n",
    "vocab_tok = load_vocab(\"vocab_tok.pkl\")\n",
    "vocab_ner = load_vocab(\"vocab_ner.pkl\")\n",
    "vocab_srl = load_vocab(\"vocab_srl.pkl\")\n",
    "vocab_q = load_vocab(\"vocab_q.pkl\")\n",
    "vocab_a = load_vocab(\"vocab_a.pkl\")\n",
    "vocab_typ = load_vocab(\"vocab_typ.pkl\")\n",
    "\n",
    "inv_vocab_q = {v: k for k, v in vocab_q.items()}\n",
    "inv_vocab_a = {v: k for k, v in vocab_a.items()}\n",
    "\n",
    "# === Build Encoder Model ===\n",
    "MAX_SENT = model.input_shape[0][1]  # Ambil shape dari model yang diload\n",
    "MAX_Q = model.input_shape[3][1]  # Max length for question\n",
    "MAX_A = model.input_shape[4][1]  # Max length for answer\n",
    "\n",
    "inp_tok_g = Input(shape=(MAX_SENT,), name=\"tok_in_g\")\n",
    "inp_ner_g = Input(shape=(MAX_SENT,), name=\"ner_in_g\")\n",
    "inp_srl_g = Input(shape=(MAX_SENT,), name=\"srl_in_g\")\n",
    "\n",
    "emb_tok = model.get_layer(\"embedding_tok\").call(inp_tok_g)\n",
    "emb_ner = model.get_layer(\"embedding_ner\").call(inp_ner_g)\n",
    "emb_srl = model.get_layer(\"embedding_srl\").call(inp_srl_g)\n",
    "\n",
    "enc_concat = Concatenate(name=\"concat_encoder\")([emb_tok, emb_ner, emb_srl])\n",
    "\n",
    "encoder_lstm = model.get_layer(\"encoder_lstm\")\n",
    "enc_out, state_h, state_c = encoder_lstm(enc_concat)\n",
    "\n",
    "# Create encoder model with full output including enc_out\n",
    "encoder_model = Model(\n",
    "    inputs=[inp_tok_g, inp_ner_g, inp_srl_g],\n",
    "    outputs=[enc_out, state_h, state_c],\n",
    "    name=\"encoder_model\",\n",
    ")\n",
    "\n",
    "# === Build Decoder for Question ===\n",
    "dec_q_inp = Input(shape=(1,), name=\"dec_q_in\")\n",
    "dec_emb_q = model.get_layer(\"embedding_q_decoder\").call(dec_q_inp)\n",
    "\n",
    "state_h_dec = Input(shape=(units,), name=\"state_h_dec\")\n",
    "state_c_dec = Input(shape=(units,), name=\"state_c_dec\")\n",
    "\n",
    "lstm_decoder_q = model.get_layer(\"lstm_q_decoder\")\n",
    "\n",
    "dec_out_q, state_h_q, state_c_q = lstm_decoder_q(\n",
    "    dec_emb_q, initial_state=[state_h_dec, state_c_dec]\n",
    ")\n",
    "\n",
    "q_time_dist_layer = model.get_layer(\"q_output\")\n",
    "dense_q = q_time_dist_layer.layer\n",
    "q_output = dense_q(dec_out_q)\n",
    "\n",
    "decoder_q = Model(\n",
    "    inputs=[dec_q_inp, state_h_dec, state_c_dec],\n",
    "    outputs=[q_output, state_h_q, state_c_q],\n",
    "    name=\"decoder_question_model\",\n",
    ")\n",
    "\n",
    "# === Build Decoder for Answer ===\n",
    "dec_a_inp = Input(shape=(1,), name=\"dec_a_in\")\n",
    "dec_emb_a = model.get_layer(\"embedding_a_decoder\").call(dec_a_inp)\n",
    "\n",
    "state_h_a = Input(shape=(units,), name=\"state_h_a\")\n",
    "state_c_a = Input(shape=(units,), name=\"state_c_a\")\n",
    "\n",
    "lstm_decoder_a = model.get_layer(\"lstm_a_decoder\")\n",
    "\n",
    "dec_out_a, state_h_a_out, state_c_a_out = lstm_decoder_a(\n",
    "    dec_emb_a, initial_state=[state_h_a, state_c_a]\n",
    ")\n",
    "\n",
    "a_time_dist_layer = model.get_layer(\"a_output\")\n",
    "dense_a = a_time_dist_layer.layer\n",
    "a_output = dense_a(dec_out_a)\n",
    "\n",
    "decoder_a = Model(\n",
    "    inputs=[dec_a_inp, state_h_a, state_c_a],\n",
    "    outputs=[a_output, state_h_a_out, state_c_a_out],\n",
    "    name=\"decoder_answer_model\",\n",
    ")\n",
    "\n",
    "# === Build Classifier for Question Type ===\n",
    "type_dense = model.get_layer(\"type_output\")\n",
    "type_out = type_dense(enc_out)\n",
    "\n",
    "classifier_model = Model(\n",
    "    inputs=[inp_tok_g, inp_ner_g, inp_srl_g], outputs=type_out, name=\"classifier_model\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "d406e6ff",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generated Question: apa apa yang di\n",
      "Generated Answer  : true\n",
      "Question Type     : isian\n"
     ]
    }
   ],
   "source": [
    "def encode(seq, vmap):\n",
    "    return [vmap.get(tok, vmap[\"<unk>\"]) for tok in seq]\n",
    "\n",
    "\n",
    "def encode_and_pad(seq, vmap, max_len=MAX_SENT):\n",
    "    encoded = [vmap.get(tok, vmap[\"<unk>\"]) for tok in seq]\n",
    "    # Pad with vocab[\"<pad>\"] to the right if sequence is shorter than max_len\n",
    "    padded = encoded + [vmap[\"<pad>\"]] * (max_len - len(encoded))\n",
    "    return padded[:max_len]  # Ensure it doesn't exceed max_len\n",
    "\n",
    "\n",
    "def greedy_decode(tokens, ner, srl, max_q=20, max_a=10):\n",
    "    # --- encode encoder inputs -------------------------------------------\n",
    "    if isinstance(tokens, np.ndarray):\n",
    "        enc_tok = tokens\n",
    "        enc_ner = ner\n",
    "        enc_srl = srl\n",
    "    else:\n",
    "        enc_tok = np.array([encode_and_pad(tokens, vocab_tok, MAX_SENT)])\n",
    "        enc_ner = np.array([encode_and_pad(ner, vocab_ner, MAX_SENT)])\n",
    "        enc_srl = np.array([encode_and_pad(srl, vocab_srl, MAX_SENT)])\n",
    "\n",
    "    # --- Get encoder outputs ---\n",
    "    enc_out, h, c = encoder_model.predict([enc_tok, enc_ner, enc_srl], verbose=0)\n",
    "\n",
    "    # QUESTION Decoding\n",
    "    tgt = np.array([[vocab_q[\"<sos>\"]]])\n",
    "    question_ids = []\n",
    "    for _ in range(max_q):\n",
    "        logits, h, c = decoder_q.predict([tgt, h, c], verbose=0)\n",
    "        next_id = int(logits[0, 0].argmax())  # Get the predicted token ID\n",
    "        if next_id == vocab_q[\"<eos>\"]:\n",
    "            break\n",
    "        question_ids.append(next_id)\n",
    "        tgt = np.array([[next_id]])  # Feed the predicted token back as input\n",
    "\n",
    "    # ANSWER Decoding - use encoder outputs again for fresh state\n",
    "    _, h, c = encoder_model.predict([enc_tok, enc_ner, enc_srl], verbose=0)\n",
    "    tgt = np.array([[vocab_a[\"<sos>\"]]])\n",
    "    answer_ids = []\n",
    "    for _ in range(max_a):\n",
    "        logits, h, c = decoder_a.predict([tgt, h, c], verbose=0)\n",
    "        next_id = int(logits[0, 0].argmax())\n",
    "        if next_id == vocab_a[\"<eos>\"]:\n",
    "            break\n",
    "        answer_ids.append(next_id)\n",
    "        tgt = np.array([[next_id]])\n",
    "\n",
    "    # Question Type\n",
    "    qtype_logits = classifier_model.predict([enc_tok, enc_ner, enc_srl], verbose=0)\n",
    "    qtype_id = int(qtype_logits.argmax())\n",
    "\n",
    "    # Final output\n",
    "    question = [inv_vocab_q.get(i, \"<unk>\") for i in question_ids]\n",
    "    answer = [inv_vocab_a.get(i, \"<unk>\") for i in answer_ids]\n",
    "    q_type = [k for k, v in vocab_typ.items() if v == qtype_id][0]\n",
    "\n",
    "    return question, answer, q_type\n",
    "\n",
    "\n",
    "def test_model():\n",
    "    test_data = {\n",
    "        \"tokens\": [\n",
    "            \"joko\",\n",
    "            \"opik\",\n",
    "            \"widodo\",\n",
    "            \"lahir\",\n",
    "            \"pada\",\n",
    "            \"27\",\n",
    "            \"maret\",\n",
    "            \"1992\",\n",
    "            \"di\",\n",
    "            \"solo\",\n",
    "        ],\n",
    "        \"ner\": [\n",
    "            \"B-PER\",\n",
    "            \"I-PER\",\n",
    "            \"I-PER\",\n",
    "            \"V\",\n",
    "            \"O\",\n",
    "            \"B-DATE\",\n",
    "            \"I-DATE\",\n",
    "            \"I-DATE\",\n",
    "            \"O\",\n",
    "            \"B-LOC\",\n",
    "        ],\n",
    "        \"srl\": [\n",
    "            \"ARG0\",\n",
    "            \"ARG0\",\n",
    "            \"ARG0\",\n",
    "            \"V\",\n",
    "            \"O\",\n",
    "            \"ARGM-TMP\",\n",
    "            \"ARGM-TMP\",\n",
    "            \"ARGM-TMP\",\n",
    "            \"O\",\n",
    "            \"ARGM-LOC\",\n",
    "        ],\n",
    "    }\n",
    "    # tokens = [\n",
    "    #     \"soekarno\",\n",
    "    #     \"membacakan\",\n",
    "    #     \"teks\",\n",
    "    #     \"proklamasi\",\n",
    "    #     \"pada\",\n",
    "    #     \"17\",\n",
    "    #     \"agustus\",\n",
    "    #     \"1945\",\n",
    "    # ]\n",
    "    # ner_tags = [\"B-PER\", \"O\", \"O\", \"O\", \"O\", \"B-DATE\", \"I-DATE\", \"I-DATE\"]\n",
    "    # srl_tags = [\"ARG0\", \"V\", \"ARG1\", \"ARG1\", \"O\", \"ARGM-TMP\", \"ARGM-TMP\", \"ARGM-TMP\"]\n",
    "\n",
    "    question, answer, q_type = greedy_decode(\n",
    "        test_data[\"tokens\"], test_data[\"ner\"], test_data[\"srl\"]\n",
    "    )\n",
    "    print(f\"Generated Question: {' '.join(question)}\")\n",
    "    print(f\"Generated Answer  : {' '.join(answer)}\")\n",
    "    print(f\"Question Type     : {q_type}\")\n",
    "\n",
    "\n",
    "test_model()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "5adde3c3",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:5 out of the last 11 calls to <function TensorFlowTrainer.make_predict_function.<locals>.one_step_on_data_distributed at 0x7f33997cab00> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has reduce_retracing=True option that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for  more details.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BLEU (Question) : 1.91%\n",
      "BLEU (Answer)   : 11.00%\n",
      "ROUGE-1         : 12.38%\n",
      "ROUGE-L         : 12.38%\n"
     ]
    }
   ],
   "source": [
    "from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction\n",
    "from rouge_score import rouge_scorer\n",
    "\n",
    "smoothie = SmoothingFunction().method4\n",
    "scorer = rouge_scorer.RougeScorer([\"rouge1\", \"rougeL\"], use_stemmer=True)\n",
    "\n",
    "\n",
    "# Helper to strip special ids\n",
    "def strip_special(ids, vocab):\n",
    "    pad = vocab[\"<pad>\"] if \"<pad>\" in vocab else None\n",
    "    eos = vocab[\"<eos>\"]\n",
    "    return [i for i in ids if i not in (pad, eos)]\n",
    "\n",
    "\n",
    "def ids_to_text(ids, inv_vocab):\n",
    "    return \" \".join(inv_vocab[i] for i in ids)\n",
    "\n",
    "\n",
    "# ---- evaluation over a set of indices ----\n",
    "import random\n",
    "\n",
    "\n",
    "def evaluate(indices=None):\n",
    "    if indices is None:\n",
    "        indices = random.sample(range(len(X_tok)), k=min(100, len(X_tok)))\n",
    "\n",
    "    bleu_scores_q, bleu_scores_a = [], [], \n",
    "    rou1, rouL = [], []\n",
    "\n",
    "    for idx in indices:\n",
    "        # Ground truth\n",
    "        gt_q = strip_special(dec_q_out[idx], vocab_q)\n",
    "        gt_a = strip_special(dec_a_out[idx], vocab_a)\n",
    "        \n",
    "        # Prediction\n",
    "        q_pred, a_pred, _ = greedy_decode(\n",
    "            X_tok[idx : idx + 1], X_ner[idx : idx + 1], X_srl[idx : idx + 1]\n",
    "        )\n",
    "\n",
    "        # BLEU on question tokens\n",
    "        bleu_scores_q.append(\n",
    "            sentence_bleu(\n",
    "                [[inv_vocab_q[i] for i in gt_q]], q_pred, smoothing_function=smoothie\n",
    "            )\n",
    "        )\n",
    "\n",
    "        # BLEU on answer tokens\n",
    "        bleu_scores_a.append(\n",
    "            sentence_bleu(\n",
    "                [[inv_vocab_a[i] for i in gt_a]], a_pred, smoothing_function=smoothie\n",
    "            )\n",
    "        )\n",
    "\n",
    "        # ROUGE on question strings\n",
    "        r = scorer.score(ids_to_text(gt_q, inv_vocab_q), \" \".join(q_pred))\n",
    "        rou1.append(r[\"rouge1\"].fmeasure)\n",
    "        rouL.append(r[\"rougeL\"].fmeasure)\n",
    "\n",
    "    print(f\"BLEU (Question) : {np.mean(bleu_scores_q) * 100:.2f}%\")\n",
    "    print(f\"BLEU (Answer)   : {np.mean(bleu_scores_a) * 100:.2f}%\")\n",
    "    print(f\"ROUGE-1         : {np.mean(rou1) * 100:.2f}%\")\n",
    "    print(f\"ROUGE-L         : {np.mean(rouL) * 100:.2f}%\")\n",
    "    \n",
    "evaluate()\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "myenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}