feat: adding question generation

This commit is contained in:
akhdanre 2025-04-27 10:31:11 +07:00
parent 1c270d4e75
commit 763879142e
22 changed files with 1166110 additions and 310 deletions

File diff suppressed because one or more lines are too long

Binary file not shown.

54
QC/convert_dts.py Normal file
View File

@ -0,0 +1,54 @@
import json
import csv
from pathlib import Path
def json_to_tsv(json_path: str | Path, tsv_path: str | Path) -> None:
"""
Konversi data JSON (field: tokens, ner, srl, ) TSV token\tNER\tSRL.
Kalimat duplikat (urutan tokens persis sama) otomatis dilewati.
"""
# ---------------------------------------------------------------------
# 1. Baca semua record dari JSON
# ---------------------------------------------------------------------
with open(json_path, encoding="utf-8") as f:
records = json.load(f)
# ---------------------------------------------------------------------
# 2. Tulis ke TSV, sambil mendeteksi duplikat
# ---------------------------------------------------------------------
seen_sentences: set[tuple[str, ...]] = set() # simpan tuple tokens unik
with open(tsv_path, "w", encoding="utf-8", newline="") as f_out:
writer = csv.writer(f_out, delimiter="\t", lineterminator="\n")
for rec in records:
tokens = rec["tokens"]
ner_tags = rec["ner"]
srl_tags = rec["srl"]
# -- cek panjang sama
if not (len(tokens) == len(ner_tags) == len(srl_tags)):
raise ValueError("tokens, ner, dan srl harus punya panjang sama")
# -- cek duplikat kalimat
key = tuple(tokens) # tuple hash-able
if key in seen_sentences: # sudah pernah ditulis → skip
continue
seen_sentences.add(key)
# -- tulis baris token, NER, SRL
for tok, ner, srl in zip(tokens, ner_tags, srl_tags):
writer.writerow([tok, ner, srl])
# -- baris kosong pemisah antar-kalimat
writer.writerow([])
print(f"✔️ TSV selesai, simpan di: {tsv_path}")
# ---------------------------------------------------------------------------
# USAGE EXAMPLE
# ---------------------------------------------------------------------------
# When run as a script, convert the combined dataset into QC/output.tsv.
# Both paths are relative to the current working directory.
if __name__ == "__main__":
    json_to_tsv("QC/dataset_combination.json", "QC/output.tsv")

445862
QC/dataset_combination.json Normal file

File diff suppressed because it is too large Load Diff

914
QC/dataset_qc.json Normal file
View File

@ -0,0 +1,914 @@
[
{
"tokens": [
"R.",
"Soewardi",
"Soerjaningrat",
"adalah",
"putra",
"GPH",
"Soerjaningrat",
"dan",
"cucu",
"Pakualam",
"III",
"."
],
"ner": [
"B-PER",
"I-PER",
"I-PER",
"O",
"O",
"B-PER",
"I-PER",
"O",
"O",
"B-PER",
"I-PER",
"O"
],
"srl": [
"ARG0",
"ARG0",
"ARG0",
"V",
"ARG1",
"ARG1",
"ARG1",
"ARG1",
"ARG1",
"ARG1",
"ARG1",
"O"
],
"question": "___ adalah putra GPH Soerjaningrat dan cucu Pakualam III.",
"answer": "R. Soewardi Soerjaningrat",
"type": "isian"
},
{
"tokens": ["Ia", "lantas", "diterima", "belajar", "di", "STOVIA", "."],
"ner": ["O", "O", "O", "O", "O", "B-ORG", "O"],
"srl": ["ARG0", "O", "V", "ARG1", "O", "ARGM-LOC", "O"],
"question": "Ia diterima belajar di ___.",
"answer": "STOVIA",
"type": "isian"
},
{
"tokens": [
"Ia",
"bersama",
"Douwes",
"Dekker",
"dan",
"dr.",
"Cipto",
"Mangoenkoesoemo",
"lantas",
"mendirikan",
"Indische",
"Partij",
"pada",
"25",
"Desember",
"1912",
"."
],
"ner": [
"O",
"O",
"B-PER",
"I-PER",
"O",
"B-PER",
"I-PER",
"I-PER",
"O",
"O",
"B-ORG",
"I-ORG",
"O",
"B-DATE",
"I-DATE",
"I-DATE",
"O"
],
"srl": [
"ARG0",
"ARG0",
"ARG0",
"ARG0",
"ARG0",
"ARG0",
"ARG0",
"ARG0",
"O",
"V",
"ARG1",
"ARG1",
"O",
"ARGM-TMP",
"ARGM-TMP",
"ARGM-TMP",
"O"
],
"question": "Ia bersama Douwes Dekker dan dr. Cipto Mangoenkoesoemo lantas mendirikan ___ pada 25 Desember 1912.",
"answer": "Indische Partij",
"type": "isian"
},
{
"tokens": [
"Indische",
"Partij",
"didirikan",
"pada",
"25",
"Desember",
"1912",
"."
],
"ner": ["B-ORG", "I-ORG", "O", "O", "B-DATE", "I-DATE", "I-DATE", "O"],
"srl": ["ARG1", "ARG1", "V", "O", "ARGM-TMP", "ARGM-TMP", "ARGM-TMP", "O"],
"question": "Indische Partij didirikan pada tanggal ___.",
"answer": "25 Desember 1912",
"type": "isian"
},
{
"tokens": [
"Joko",
"Widodo",
"dilantik",
"sebagai",
"Presiden",
"Republik",
"Indonesia",
"pada",
"20",
"Oktober",
"2014",
"."
],
"ner": [
"B-PER",
"I-PER",
"O",
"O",
"B-TITLE",
"B-ORG",
"I-ORG",
"O",
"B-DATE",
"I-DATE",
"I-DATE",
"O"
],
"srl": [
"ARG0",
"ARG0",
"V",
"O",
"ARG1",
"ARG1",
"ARG1",
"ARGM-TMP",
"ARGM-TMP",
"ARGM-TMP",
"ARGM-TMP",
"O"
],
"question": "Kapan Joko Widodo dilantik sebagai Presiden Republik Indonesia?",
"answer": "20 Oktober 2014",
"type": "isian"
},
{
"tokens": [
"Soekarno",
"adalah",
"presiden",
"pertama",
"di",
"Indonesia",
"."
],
"ner": ["B-PER", "O", "O", "O", "O", "B-LOC", "O"],
"srl": ["ARG0", "V", "ARG1", "ARG1", "ARGM-LOC", "ARGM-LOC", "O"],
"question": "Siapa presiden pertama di Indonesia?",
"answer": "Soekarno",
"type": "isian"
},
{
"tokens": [
"Mohammad",
"Hatta",
"menjabat",
"sebagai",
"wakil",
"presiden",
"pertama",
"Indonesia",
"."
],
"ner": ["B-PER", "I-PER", "O", "O", "O", "O", "O", "B-LOC", "O"],
"srl": ["ARG0", "ARG0", "V", "O", "ARG1", "ARG1", "ARG1", "ARGM-LOC", "O"],
"question": "Siapa wakil presiden pertama Indonesia?",
"answer": "Mohammad Hatta",
"type": "isian"
},
{
"tokens": [
"Maruf",
"Amin",
"dilantik",
"sebagai",
"wakil",
"presiden",
"pada",
"20",
"Oktober",
"2019",
"."
],
"ner": [
"B-PER",
"I-PER",
"O",
"O",
"O",
"O",
"O",
"B-DATE",
"I-DATE",
"I-DATE",
"O"
],
"srl": [
"ARG0",
"ARG0",
"V",
"O",
"ARG1",
"ARG1",
"ARGM-TMP",
"ARGM-TMP",
"ARGM-TMP",
"ARGM-TMP",
"O"
],
"question": "Kapan Maruf Amin dilantik sebagai wakil presiden?",
"answer": "20 Oktober 2019",
"type": "isian"
},
{
"tokens": [
"Soeharto",
"adalah",
"presiden",
"kedua",
"di",
"Indonesia",
"."
],
"ner": ["B-PER", "O", "O", "O", "O", "B-LOC", "O"],
"srl": ["ARG0", "V", "ARG1", "ARG1", "ARGM-LOC", "ARGM-LOC", "O"],
"question": "Siapa presiden kedua di Indonesia?",
"answer": "Soeharto",
"type": "isian"
},
{
"tokens": [
"B.J.",
"Habibie",
"adalah",
"presiden",
"ketiga",
"di",
"Indonesia",
"."
],
"ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC", "O"],
"srl": ["ARG0", "ARG0", "V", "ARG1", "ARG1", "ARGM-LOC", "ARGM-LOC", "O"],
"question": "Siapa presiden ketiga di Indonesia?",
"answer": "B.J. Habibie",
"type": "isian"
},
{
"tokens": [
"Abdurrahman",
"Wahid",
"adalah",
"presiden",
"keempat",
"di",
"Indonesia",
"."
],
"ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC", "O"],
"srl": ["ARG0", "ARG0", "V", "ARG1", "ARG1", "ARGM-LOC", "ARGM-LOC", "O"],
"question": "Siapa presiden keempat di Indonesia?",
"answer": "Abdurrahman Wahid",
"type": "isian"
},
{
"tokens": [
"Megawati",
"Soekarnoputri",
"adalah",
"presiden",
"kelima",
"di",
"Indonesia",
"."
],
"ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC", "O"],
"srl": ["ARG0", "ARG0", "V", "ARG1", "ARG1", "ARGM-LOC", "ARGM-LOC", "O"],
"question": "Siapa presiden kelima di Indonesia?",
"answer": "Megawati Soekarnoputri",
"type": "isian"
},
{
"tokens": [
"Susilo",
"Bambang",
"Yudhoyono",
"adalah",
"presiden",
"keenam",
"di",
"Indonesia",
"."
],
"ner": ["B-PER", "I-PER", "I-PER", "O", "O", "O", "O", "B-LOC", "O"],
"srl": [
"ARG0",
"ARG0",
"ARG0",
"V",
"ARG1",
"ARG1",
"ARGM-LOC",
"ARGM-LOC",
"O"
],
"question": "Siapa presiden keenam di Indonesia?",
"answer": "Susilo Bambang Yudhoyono",
"type": "isian"
},
{
"tokens": [
"Joko",
"Widodo",
"adalah",
"presiden",
"ketujuh",
"di",
"Indonesia",
"."
],
"ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC", "O"],
"srl": ["ARG0", "ARG0", "V", "ARG1", "ARG1", "ARGM-LOC", "ARGM-LOC", "O"],
"question": "Siapa presiden ketujuh di Indonesia?",
"answer": "Joko Widodo",
"type": "isian"
},
{
"tokens": [
"Prabowo",
"Subianto",
"adalah",
"presiden",
"kedelapan",
"di",
"Indonesia",
"."
],
"ner": ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC", "O"],
"srl": ["ARG0", "ARG0", "V", "ARG1", "ARG1", "ARGM-LOC", "ARGM-LOC", "O"],
"question": "Siapa presiden kedelapan di Indonesia?",
"answer": "Prabowo Subianto",
"type": "isian"
},
{
"tokens": [
"Mohammad",
"Hatta",
"adalah",
"wakil",
"presiden",
"pertama",
"di",
"Indonesia",
"."
],
"ner": ["B-PER", "I-PER", "O", "O", "O", "O", "O", "B-LOC", "O"],
"srl": [
"ARG0",
"ARG0",
"V",
"ARG1",
"ARG1",
"ARG1",
"ARGM-LOC",
"ARGM-LOC",
"O"
],
"question": "Siapa wakil presiden pertama di Indonesia?",
"answer": "Mohammad Hatta",
"type": "isian"
},
{
"tokens": [
"Sri",
"Sultan",
"Hamengkubuwono",
"IX",
"adalah",
"wakil",
"presiden",
"kedua",
"di",
"Indonesia",
"."
],
"ner": [
"B-PER",
"I-PER",
"I-PER",
"I-PER",
"O",
"O",
"O",
"O",
"O",
"B-LOC",
"O"
],
"srl": [
"ARG0",
"ARG0",
"ARG0",
"ARG0",
"V",
"ARG1",
"ARG1",
"ARG1",
"ARGM-LOC",
"ARGM-LOC",
"O"
],
"question": "Siapa wakil presiden kedua di Indonesia?",
"answer": "Sri Sultan Hamengkubuwono IX",
"type": "isian"
},
{
"tokens": [
"Adam",
"Malik",
"adalah",
"wakil",
"presiden",
"ketiga",
"di",
"Indonesia",
"."
],
"ner": ["B-PER", "I-PER", "O", "O", "O", "O", "O", "B-LOC", "O"],
"srl": [
"ARG0",
"ARG0",
"V",
"ARG1",
"ARG1",
"ARG1",
"ARGM-LOC",
"ARGM-LOC",
"O"
],
"question": "Siapa wakil presiden ketiga di Indonesia?",
"answer": "Adam Malik",
"type": "isian"
},
{
"tokens": [
"Umar",
"Wirahadikusumah",
"adalah",
"wakil",
"presiden",
"keempat",
"di",
"Indonesia",
"."
],
"ner": ["B-PER", "I-PER", "O", "O", "O", "O", "O", "B-LOC", "O"],
"srl": [
"ARG0",
"ARG0",
"V",
"ARG1",
"ARG1",
"ARG1",
"ARGM-LOC",
"ARGM-LOC",
"O"
],
"question": "Siapa wakil presiden keempat di Indonesia?",
"answer": "Umar Wirahadikusumah",
"type": "isian"
},
{
"tokens": [
"Sudharmono",
"adalah",
"wakil",
"presiden",
"kelima",
"di",
"Indonesia",
"."
],
"ner": ["B-PER", "O", "O", "O", "O", "O", "B-LOC", "O"],
"srl": ["ARG0", "V", "ARG1", "ARG1", "ARG1", "ARGM-LOC", "ARGM-LOC", "O"],
"question": "Siapa wakil presiden kelima di Indonesia?",
"answer": "Sudharmono",
"type": "isian"
},
{
"tokens": [
"Try",
"Sutrisno",
"adalah",
"wakil",
"presiden",
"keenam",
"di",
"Indonesia",
"."
],
"ner": ["B-PER", "I-PER", "O", "O", "O", "O", "O", "B-LOC", "O"],
"srl": [
"ARG0",
"ARG0",
"V",
"ARG1",
"ARG1",
"ARG1",
"ARGM-LOC",
"ARGM-LOC",
"O"
],
"question": "Siapa wakil presiden keenam di Indonesia?",
"answer": "Try Sutrisno",
"type": "isian"
},
{
"tokens": [
"B.J.",
"Habibie",
"adalah",
"wakil",
"presiden",
"ketujuh",
"di",
"Indonesia",
"."
],
"ner": ["B-PER", "I-PER", "O", "O", "O", "O", "O", "B-LOC", "O"],
"srl": [
"ARG0",
"ARG0",
"V",
"ARG1",
"ARG1",
"ARG1",
"ARGM-LOC",
"ARGM-LOC",
"O"
],
"question": "Siapa wakil presiden ketujuh di Indonesia?",
"answer": "B.J. Habibie",
"type": "isian"
},
{
"tokens": [
"Megawati",
"Soekarnoputri",
"adalah",
"wakil",
"presiden",
"kedelapan",
"di",
"Indonesia",
"."
],
"ner": ["B-PER", "I-PER", "O", "O", "O", "O", "O", "B-LOC", "O"],
"srl": [
"ARG0",
"ARG0",
"V",
"ARG1",
"ARG1",
"ARG1",
"ARGM-LOC",
"ARGM-LOC",
"O"
],
"question": "Siapa wakil presiden kedelapan di Indonesia?",
"answer": "Megawati Soekarnoputri",
"type": "isian"
},
{
"tokens": [
"B.J.",
"Habibie",
"lahir",
"di",
"Parepare",
"pada",
"25",
"Juni",
"1936",
"."
],
"ner": [
"B-PER",
"I-PER",
"O",
"O",
"B-LOC",
"O",
"B-DATE",
"I-DATE",
"I-DATE",
"O"
],
"srl": [
"ARG0",
"ARG0",
"V",
"ARGM-LOC",
"ARGM-LOC",
"ARGM-TMP",
"ARGM-TMP",
"ARGM-TMP",
"ARGM-TMP",
"O"
],
"question": "Di mana B.J. Habibie lahir?",
"answer": "Parepare",
"type": "isian"
},
{
"tokens": ["Indonesia", "merdeka", "pada", "17", "Agustus", "1945", "."],
"ner": ["B-LOC", "O", "O", "B-DATE", "I-DATE", "I-DATE", "O"],
"srl": ["ARG0", "V", "ARGM-TMP", "ARGM-TMP", "ARGM-TMP", "ARGM-TMP", "O"],
"question": "Kapan Indonesia merdeka?",
"answer": "17 Agustus 1945",
"type": "isian"
},
{
"tokens": ["R.A.", "Kartini", "berasal", "dari", "Jepara", "."],
"ner": ["B-PER", "I-PER", "O", "O", "B-LOC", "O"],
"srl": ["ARG0", "ARG0", "V", "ARGM-LOC", "ARGM-LOC", "O"],
"question": "Dari mana R.A. Kartini berasal?",
"answer": "Jepara",
"type": "isian"
},
{
"tokens": [
"Candi",
"Borobudur",
"terletak",
"di",
"Magelang",
",",
"Jawa",
"Tengah",
"."
],
"ner": ["B-LOC", "I-LOC", "O", "O", "B-LOC", "O", "B-LOC", "I-LOC", "O"],
"srl": [
"ARG1",
"ARG1",
"V",
"ARGM-LOC",
"ARGM-LOC",
"O",
"ARGM-LOC",
"ARGM-LOC",
"O"
],
"question": "Di mana letak Candi Borobudur?",
"answer": "Magelang, Jawa Tengah",
"type": "isian"
},
{
"tokens": [
"Sumpah",
"Pemuda",
"dideklarasikan",
"pada",
"28",
"Oktober",
"1928",
"."
],
"ner": ["B-EVENT", "I-EVENT", "O", "O", "B-DATE", "I-DATE", "I-DATE", "O"],
"srl": [
"ARG1",
"ARG1",
"V",
"ARGM-TMP",
"ARGM-TMP",
"ARGM-TMP",
"ARGM-TMP",
"O"
],
"question": "Kapan Sumpah Pemuda dideklarasikan?",
"answer": "28 Oktober 1928",
"type": "isian"
},
{
"tokens": [
"Ir.",
"Soekarno",
"belajar",
"di",
"Technische",
"Hogeschool",
"di",
"Bandung",
"."
],
"ner": ["B-PER", "I-PER", "O", "O", "B-ORG", "I-ORG", "O", "B-LOC", "O"],
"srl": [
"ARG0",
"ARG0",
"V",
"O",
"ARGM-LOC",
"ARGM-LOC",
"O",
"ARGM-LOC",
"O"
],
"question": "Di mana Ir. Soekarno belajar?",
"answer": "Technische Hogeschool di Bandung",
"type": "isian"
},
{
"tokens": [
"BMKG",
"merupakan",
"badan",
"yang",
"mengawasi",
"cuaca",
"dan",
"iklim",
"di",
"Indonesia",
"."
],
"ner": ["B-ORG", "O", "O", "O", "O", "O", "O", "O", "O", "B-LOC", "O"],
"srl": [
"ARG0",
"V",
"ARG1",
"R-ARG1",
"R-ARG1",
"ARG1",
"ARG1",
"ARG1",
"ARGM-LOC",
"ARGM-LOC",
"O"
],
"question": "Apa fungsi BMKG?",
"answer": "Mengawasi cuaca dan iklim di Indonesia",
"type": "isian"
},
{
"tokens": [
"UNESCO",
"menetapkan",
"batik",
"sebagai",
"warisan",
"budaya",
"takbenda",
"pada",
"2009",
"."
],
"ner": ["B-ORG", "O", "O", "O", "O", "O", "O", "O", "B-DATE", "O"],
"srl": [
"ARG0",
"V",
"ARG1",
"ARG2",
"ARG2",
"ARG2",
"ARG2",
"ARGM-TMP",
"ARGM-TMP",
"O"
],
"question": "Kapan batik ditetapkan sebagai warisan budaya oleh UNESCO?",
"answer": "2009",
"type": "isian"
},
{
"tokens": [
"Pendidikan",
"Taman",
"Siswa",
"didirikan",
"oleh",
"Ki",
"Hajar",
"Dewantara",
"pada",
"3",
"Juli",
"1922",
"."
],
"ner": [
"B-ORG",
"I-ORG",
"I-ORG",
"O",
"O",
"B-PER",
"I-PER",
"I-PER",
"O",
"B-DATE",
"I-DATE",
"I-DATE",
"O"
],
"srl": [
"ARG1",
"ARG1",
"ARG1",
"V",
"ARG0",
"ARG0",
"ARG0",
"ARG0",
"ARGM-TMP",
"ARGM-TMP",
"ARGM-TMP",
"ARGM-TMP",
"O"
],
"question": "Kapan Taman Siswa didirikan?",
"answer": "3 Juli 1922",
"type": "isian"
},
{
"tokens": [
"Gunung",
"Bromo",
"merupakan",
"salah",
"satu",
"destinasi",
"wisata",
"di",
"Jawa",
"Timur",
"."
],
"ner": [
"B-LOC",
"I-LOC",
"O",
"O",
"O",
"O",
"O",
"O",
"B-LOC",
"I-LOC",
"O"
],
"srl": [
"ARG1",
"ARG1",
"V",
"ARG1",
"ARG1",
"ARG1",
"ARG1",
"ARGM-LOC",
"ARGM-LOC",
"ARGM-LOC",
"O"
],
"question": "Gunung Bromo terletak di mana?",
"answer": "Jawa Timur",
"type": "isian"
}
]

2862
QC/dataset_qc_tokenized.json Normal file

File diff suppressed because it is too large Load Diff

BIN
QC/lstm_qg.keras Normal file

Binary file not shown.

272418
QC/new_dataset.json Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

247
QC/output.tsv Normal file
View File

@ -0,0 +1,247 @@
Kerajaan O O
Singhasari B-ORG ARG1
didirikan O V
oleh O O
Ken B-PER ARG0
Arok I-PER ARG0
pada O O
1222 B-DATE ARGM-TMP
di O O
Tumapel B-LOC ARGM-LOC
. O O
Kerajaan O O
Srivijaya B-ORG ARG1
didirikan O V
oleh O O
Dapunta B-PER ARG0
Hyang I-PER ARG0
Sri I-PER ARG0
Jayanasa I-PER ARG0
pada O O
683 B-DATE ARGM-TMP
di O O
Palembang B-LOC ARGM-LOC
. O O
Kerajaan O O
Kutai B-ORG ARG1
Martadipura I-ORG ARG1
didirikan O V
oleh O O
Kudungga B-PER ARG0
pada O O
350 B-DATE ARGM-TMP
di O O
Kutai B-LOC ARGM-LOC
Kartanegara I-LOC ARGM-LOC
. O O
Kerajaan O O
Tarumanagara B-ORG ARG1
didirikan O V
oleh O O
Jayasingawarman B-PER ARG0
pada O O
358 B-DATE ARGM-TMP
di O O
Bogor B-LOC ARGM-LOC
. O O
Kerajaan O O
Majapahit B-ORG ARG1
didirikan O V
oleh O O
Raden B-PER ARG0
Wijaya I-PER ARG0
pada O O
1293 B-DATE ARGM-TMP
di O O
Trowulan B-LOC ARGM-LOC
. O O
Boedi B-ORG ARG1
Oetomo I-ORG ARG1
didirikan O V
oleh O O
Dr. B-PER ARG0
Soetomo I-PER ARG0
pada O O
20 B-DATE ARGM-TMP
Mei I-DATE ARGM-TMP
1908 I-DATE ARGM-TMP
di O O
Batavia B-LOC ARGM-LOC
. O O
Reformasi B-EVENT ARG1
1998 I-EVENT ARG1
dimulai O V
pada O O
21 B-DATE ARGM-TMP
Mei I-DATE ARGM-TMP
1998 I-DATE ARGM-TMP
di O O
Jakarta B-LOC ARGM-LOC
. O O
Soekarno B-PER ARG0
lahir O V
pada O O
6 B-DATE ARGM-TMP
Juni I-DATE ARGM-TMP
1901 I-DATE ARGM-TMP
di O O
Surabaya B-LOC ARGM-LOC
. O O
Tan B-PER ARG0
Malaka I-PER ARG0
lahir O V
pada O O
2 B-DATE ARGM-TMP
Juni I-DATE ARGM-TMP
1897 I-DATE ARGM-TMP
di O O
Lima B-LOC ARGM-LOC
Puluh I-LOC ARGM-LOC
Kota I-LOC ARGM-LOC
. O O
Nahdlatul B-ORG ARG1
Ulama I-ORG ARG1
didirikan O V
oleh O O
Hasyim B-PER ARG0
Asy'ari I-PER ARG0
pada O O
31 B-DATE ARGM-TMP
Januari I-DATE ARGM-TMP
1926 I-DATE ARGM-TMP
di O O
Surabaya B-LOC ARGM-LOC
. O O
Sumpah B-EVENT ARG1
Pemuda I-EVENT ARG1
diikrarkan O V
pada O O
28 B-DATE ARGM-TMP
Oktober I-DATE ARGM-TMP
1928 I-DATE ARGM-TMP
di O O
Jakarta B-LOC ARGM-LOC
. O O
Ki B-PER ARG0
Hajar I-PER ARG0
Dewantara I-PER ARG0
lahir O V
pada O O
2 B-DATE ARGM-TMP
Mei I-DATE ARGM-TMP
1889 I-DATE ARGM-TMP
di O O
Yogyakarta B-LOC ARGM-LOC
. O O
Sarekat B-ORG ARG1
Islam I-ORG ARG1
didirikan O V
oleh O O
Haji B-PER ARG0
Samanhudi I-PER ARG0
pada O O
16 B-DATE ARGM-TMP
Oktober I-DATE ARGM-TMP
1905 I-DATE ARGM-TMP
di O O
Surakarta B-LOC ARGM-LOC
. O O
Partai B-ORG ARG1
Nasional I-ORG ARG1
Indonesia I-ORG ARG1
didirikan O V
oleh O O
Soekarno B-PER ARG0
pada O O
4 B-DATE ARGM-TMP
Juli I-DATE ARGM-TMP
1927 I-DATE ARGM-TMP
di O O
Bandung B-LOC ARGM-LOC
. O O
Muhammadiyah B-ORG ARG1
didirikan O V
oleh O O
Ahmad B-PER ARG0
Dahlan I-PER ARG0
pada O O
18 B-DATE ARGM-TMP
November I-DATE ARGM-TMP
1912 I-DATE ARGM-TMP
di O O
Yogyakarta B-LOC ARGM-LOC
. O O
Konferensi B-EVENT ARG1
Asia I-EVENT ARG1
- I-EVENT ARG1
Afrika I-EVENT ARG1
berlangsung O V
pada O O
18 B-DATE ARGM-TMP
April I-DATE ARGM-TMP
1955 I-DATE ARGM-TMP
di O O
Bandung B-LOC ARGM-LOC
. O O
Raden B-PER ARG0
Ajeng I-PER ARG0
Kartini I-PER ARG0
lahir O V
pada O O
21 B-DATE ARGM-TMP
April I-DATE ARGM-TMP
1879 I-DATE ARGM-TMP
di O O
Jepara B-LOC ARGM-LOC
. O O
Mohammad B-PER ARG0
Hatta I-PER ARG0
lahir O V
pada O O
12 B-DATE ARGM-TMP
Agustus I-DATE ARGM-TMP
1902 I-DATE ARGM-TMP
di O O
Bukittinggi B-LOC ARGM-LOC
. O O
Proklamasi B-EVENT ARG1
Kemerdekaan I-EVENT ARG1
Indonesia I-EVENT ARG1
diproklamasikan O V
pada O O
17 B-DATE ARGM-TMP
Agustus I-DATE ARGM-TMP
1945 I-DATE ARGM-TMP
di O O
Jakarta B-LOC ARGM-LOC
. O O
Pertempuran B-EVENT ARG1
Surabaya I-EVENT ARG1
terjadi O V
pada O O
10 B-DATE ARGM-TMP
November I-DATE ARGM-TMP
1945 I-DATE ARGM-TMP
di O O
Surabaya B-LOC ARGM-LOC
. O O
1 Kerajaan O O
2 Singhasari B-ORG ARG1
3 didirikan O V
4 oleh O O
5 Ken B-PER ARG0
6 Arok I-PER ARG0
7 pada O O
8 1222 B-DATE ARGM-TMP
9 di O O
10 Tumapel B-LOC ARGM-LOC
11 . O O
12 Kerajaan O O
13 Srivijaya B-ORG ARG1
14 didirikan O V
15 oleh O O
16 Dapunta B-PER ARG0
17 Hyang I-PER ARG0
18 Sri I-PER ARG0
19 Jayanasa I-PER ARG0
20 pada O O
21 683 B-DATE ARGM-TMP
22 di O O
23 Palembang B-LOC ARGM-LOC
24 . O O
25 Kerajaan O O
26 Kutai B-ORG ARG1
27 Martadipura I-ORG ARG1
28 didirikan O V
29 oleh O O
30 Kudungga B-PER ARG0
31 pada O O
32 350 B-DATE ARGM-TMP
33 di O O
34 Kutai B-LOC ARGM-LOC
35 Kartanegara I-LOC ARGM-LOC
36 . O O
37 Kerajaan O O
38 Tarumanagara B-ORG ARG1
39 didirikan O V
40 oleh O O
41 Jayasingawarman B-PER ARG0
42 pada O O
43 358 B-DATE ARGM-TMP
44 di O O
45 Bogor B-LOC ARGM-LOC
46 . O O
47 Kerajaan O O
48 Majapahit B-ORG ARG1
49 didirikan O V
50 oleh O O
51 Raden B-PER ARG0
52 Wijaya I-PER ARG0
53 pada O O
54 1293 B-DATE ARGM-TMP
55 di O O
56 Trowulan B-LOC ARGM-LOC
57 . O O
58 Boedi B-ORG ARG1
59 Oetomo I-ORG ARG1
60 didirikan O V
61 oleh O O
62 Dr. B-PER ARG0
63 Soetomo I-PER ARG0
64 pada O O
65 20 B-DATE ARGM-TMP
66 Mei I-DATE ARGM-TMP
67 1908 I-DATE ARGM-TMP
68 di O O
69 Batavia B-LOC ARGM-LOC
70 . O O
71 Reformasi B-EVENT ARG1
72 1998 I-EVENT ARG1
73 dimulai O V
74 pada O O
75 21 B-DATE ARGM-TMP
76 Mei I-DATE ARGM-TMP
77 1998 I-DATE ARGM-TMP
78 di O O
79 Jakarta B-LOC ARGM-LOC
80 . O O
81 Soekarno B-PER ARG0
82 lahir O V
83 pada O O
84 6 B-DATE ARGM-TMP
85 Juni I-DATE ARGM-TMP
86 1901 I-DATE ARGM-TMP
87 di O O
88 Surabaya B-LOC ARGM-LOC
89 . O O
90 Tan B-PER ARG0
91 Malaka I-PER ARG0
92 lahir O V
93 pada O O
94 2 B-DATE ARGM-TMP
95 Juni I-DATE ARGM-TMP
96 1897 I-DATE ARGM-TMP
97 di O O
98 Lima B-LOC ARGM-LOC
99 Puluh I-LOC ARGM-LOC
100 Kota I-LOC ARGM-LOC
101 . O O
102 Nahdlatul B-ORG ARG1
103 Ulama I-ORG ARG1
104 didirikan O V
105 oleh O O
106 Hasyim B-PER ARG0
107 Asy'ari I-PER ARG0
108 pada O O
109 31 B-DATE ARGM-TMP
110 Januari I-DATE ARGM-TMP
111 1926 I-DATE ARGM-TMP
112 di O O
113 Surabaya B-LOC ARGM-LOC
114 . O O
115 Sumpah B-EVENT ARG1
116 Pemuda I-EVENT ARG1
117 diikrarkan O V
118 pada O O
119 28 B-DATE ARGM-TMP
120 Oktober I-DATE ARGM-TMP
121 1928 I-DATE ARGM-TMP
122 di O O
123 Jakarta B-LOC ARGM-LOC
124 . O O
125 Ki B-PER ARG0
126 Hajar I-PER ARG0
127 Dewantara I-PER ARG0
128 lahir O V
129 pada O O
130 2 B-DATE ARGM-TMP
131 Mei I-DATE ARGM-TMP
132 1889 I-DATE ARGM-TMP
133 di O O
134 Yogyakarta B-LOC ARGM-LOC
135 . O O
136 Sarekat B-ORG ARG1
137 Islam I-ORG ARG1
138 didirikan O V
139 oleh O O
140 Haji B-PER ARG0
141 Samanhudi I-PER ARG0
142 pada O O
143 16 B-DATE ARGM-TMP
144 Oktober I-DATE ARGM-TMP
145 1905 I-DATE ARGM-TMP
146 di O O
147 Surakarta B-LOC ARGM-LOC
148 . O O
149 Partai B-ORG ARG1
150 Nasional I-ORG ARG1
151 Indonesia I-ORG ARG1
152 didirikan O V
153 oleh O O
154 Soekarno B-PER ARG0
155 pada O O
156 4 B-DATE ARGM-TMP
157 Juli I-DATE ARGM-TMP
158 1927 I-DATE ARGM-TMP
159 di O O
160 Bandung B-LOC ARGM-LOC
161 . O O
162 Muhammadiyah B-ORG ARG1
163 didirikan O V
164 oleh O O
165 Ahmad B-PER ARG0
166 Dahlan I-PER ARG0
167 pada O O
168 18 B-DATE ARGM-TMP
169 November I-DATE ARGM-TMP
170 1912 I-DATE ARGM-TMP
171 di O O
172 Yogyakarta B-LOC ARGM-LOC
173 . O O
174 Konferensi B-EVENT ARG1
175 Asia I-EVENT ARG1
176 - I-EVENT ARG1
177 Afrika I-EVENT ARG1
178 berlangsung O V
179 pada O O
180 18 B-DATE ARGM-TMP
181 April I-DATE ARGM-TMP
182 1955 I-DATE ARGM-TMP
183 di O O
184 Bandung B-LOC ARGM-LOC
185 . O O
186 Raden B-PER ARG0
187 Ajeng I-PER ARG0
188 Kartini I-PER ARG0
189 lahir O V
190 pada O O
191 21 B-DATE ARGM-TMP
192 April I-DATE ARGM-TMP
193 1879 I-DATE ARGM-TMP
194 di O O
195 Jepara B-LOC ARGM-LOC
196 . O O
197 Mohammad B-PER ARG0
198 Hatta I-PER ARG0
199 lahir O V
200 pada O O
201 12 B-DATE ARGM-TMP
202 Agustus I-DATE ARGM-TMP
203 1902 I-DATE ARGM-TMP
204 di O O
205 Bukittinggi B-LOC ARGM-LOC
206 . O O
207 Proklamasi B-EVENT ARG1
208 Kemerdekaan I-EVENT ARG1
209 Indonesia I-EVENT ARG1
210 diproklamasikan O V
211 pada O O
212 17 B-DATE ARGM-TMP
213 Agustus I-DATE ARGM-TMP
214 1945 I-DATE ARGM-TMP
215 di O O
216 Jakarta B-LOC ARGM-LOC
217 . O O
218 Pertempuran B-EVENT ARG1
219 Surabaya I-EVENT ARG1
220 terjadi O V
221 pada O O
222 10 B-DATE ARGM-TMP
223 November I-DATE ARGM-TMP
224 1945 I-DATE ARGM-TMP
225 di O O
226 Surabaya B-LOC ARGM-LOC
227 . O O

15
QC/qc_v2.py Normal file
View File

@ -0,0 +1,15 @@
import json
# Load the annotated dataset: a JSON list of records, read relative to the
# current working directory.
with open("dataset_qc.json", encoding="utf-8") as f:
    raw_data = json.load(f)


def _normalized(text):
    """Return *text* lower-cased with surrounding whitespace removed."""
    return text.lower().strip()


def _column(field):
    """Return the value of *field* from every record, in dataset order."""
    return [record[field] for record in raw_data]


# Parallel, index-aligned columns. Free text is normalised; tag sequences and
# the question type are kept exactly as stored.
tokens = [
    [_normalized(tok) for tok in record["tokens"]]
    for record in raw_data
]
ner_tags = _column("ner")
srl_tags = _column("srl")
questions = [_normalized(record["question"]) for record in raw_data]
answers = [_normalized(record["answer"]) for record in raw_data]
types = _column("type")

499
QC/qg_train.ipynb Normal file

File diff suppressed because one or more lines are too long

269
QC/qg_v2_train.ipynb Normal file
View File

@ -0,0 +1,269 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "0a2880d7",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-04-23 14:22:17.809700: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
"2025-04-23 14:22:17.810231: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
"2025-04-23 14:22:17.812492: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n",
"2025-04-23 14:22:17.818482: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
"E0000 00:00:1745392937.829027 39341 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"E0000 00:00:1745392937.832239 39341 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"W0000 00:00:1745392937.840149 39341 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1745392937.840163 39341 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1745392937.840164 39341 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"W0000 00:00:1745392937.840165 39341 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
"2025-04-23 14:22:17.843058: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
]
}
],
"source": [
"import json\n",
"import numpy as np\n",
"from pathlib import Path\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from tensorflow.keras.preprocessing.text import Tokenizer\n",
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
"from tensorflow.keras.models import Model\n",
"from tensorflow.keras.layers import Input, Embedding, LSTM, Concatenate, TimeDistributed, Dense\n",
"from tensorflow.keras.callbacks import EarlyStopping"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "bd82907a",
"metadata": {},
"outputs": [],
"source": [
"with open(\"dataset_qc.json\", encoding=\"utf-8\") as f:\n",
" raw_data = json.load(f)\n",
"\n",
"tokens = [[t.lower().strip() for t in item[\"tokens\"]] for item in raw_data]\n",
"ner_tags = [item[\"ner\"] for item in raw_data]\n",
"srl_tags = [item[\"srl\"] for item in raw_data]\n",
"questions = [item[\"question\"].lower().strip() for item in raw_data]\n",
"answers = [item[\"answer\"].lower().strip() for item in raw_data]\n",
"types = [item[\"type\"] for item in raw_data]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "946713ee",
"metadata": {},
"outputs": [],
"source": [
"token_tokenizer = Tokenizer(lower=False, oov_token=\"<OOV>\")\n",
"token_tokenizer.fit_on_texts(tokens)\n",
"token_sequences = token_tokenizer.texts_to_sequences(tokens)\n",
"\n",
"ner_encoder = LabelEncoder()\n",
"srl_encoder = LabelEncoder()\n",
"\n",
"flat_ner = [tag for seq in ner_tags for tag in seq]\n",
"flat_srl = [tag for seq in srl_tags for tag in seq]\n",
"\n",
"ner_encoder.fit(flat_ner)\n",
"srl_encoder.fit(flat_srl)\n",
"\n",
"ner_sequences = [ner_encoder.transform(seq).tolist() for seq in ner_tags]\n",
"srl_sequences = [srl_encoder.transform(seq).tolist() for seq in srl_tags]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "aff6e7aa",
"metadata": {},
"outputs": [],
"source": [
"MAX_LEN = max(len(seq) for seq in token_sequences)\n",
"\n",
"token_padded = pad_sequences(token_sequences, maxlen=MAX_LEN, padding='post')\n",
"ner_padded = pad_sequences(ner_sequences, maxlen=MAX_LEN, padding='post')\n",
"srl_padded = pad_sequences(srl_sequences, maxlen=MAX_LEN, padding='post')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ea2ab113",
"metadata": {},
"outputs": [],
"source": [
"qa_tokenizer = Tokenizer(oov_token=\"<OOV>\")\n",
"qa_tokenizer.fit_on_texts(questions + answers)\n",
"\n",
"question_sequences = qa_tokenizer.texts_to_sequences(questions)\n",
"answer_sequences = qa_tokenizer.texts_to_sequences(answers)\n",
"\n",
"question_padded = pad_sequences(question_sequences, maxlen=MAX_LEN, padding='post')\n",
"answer_padded = pad_sequences(answer_sequences, maxlen=MAX_LEN, padding='post')\n",
"\n",
"\n",
"type_encoder = LabelEncoder()\n",
"type_labels = type_encoder.fit_transform(types) # bentuk 1D array\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e2becb56",
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "'Tokenizer' object has no attribute 'shape'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[6], line 10\u001b[0m\n\u001b[1;32m 5\u001b[0m y_answer \u001b[38;5;241m=\u001b[39m answer_padded\n\u001b[1;32m 6\u001b[0m y_type \u001b[38;5;241m=\u001b[39m type_labels\n\u001b[0;32m---> 10\u001b[0m MAX_LEN \u001b[38;5;241m=\u001b[39m \u001b[43mX_token\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshape\u001b[49m[\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# ======================\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# 2. Parameter\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;66;03m# ======================\u001b[39;00m\n\u001b[1;32m 15\u001b[0m VOCAB_TOKEN \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mmax(X_token) \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m\n",
"\u001b[0;31mAttributeError\u001b[0m: 'Tokenizer' object has no attribute 'shape'"
]
}
],
"source": [
"X_token = token_tokenizer\n",
"X_ner = ner_encoder\n",
"X_srl = srl_encoder\n",
"y_question = qa_tokenizer\n",
"y_answer = answer_padded\n",
"y_type = type_labels\n",
"\n",
"\n",
"\n",
"MAX_LEN = X_token.shape[1]\n",
"\n",
"# ======================\n",
"# 2. Parameter\n",
"# ======================\n",
"VOCAB_TOKEN = np.max(X_token) + 1\n",
"VOCAB_NER = np.max(X_ner) + 1\n",
"VOCAB_SRL = np.max(X_srl) + 1\n",
"VOCAB_QA = max(np.max(y_question), np.max(y_answer)) + 1\n",
"NUM_TYPES = len(np.unique(y_type))\n",
"\n",
"EMB_TOKEN = 128\n",
"EMB_TAG = 16\n",
"LSTM_UNITS = 256"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "162a155a",
"metadata": {},
"outputs": [],
"source": [
"input_token = Input(shape=(MAX_LEN,), name=\"token_input\")\n",
"input_ner = Input(shape=(MAX_LEN,), name=\"ner_input\")\n",
"input_srl = Input(shape=(MAX_LEN,), name=\"srl_input\")\n",
"\n",
"# ======================\n",
"# 4. Embedding\n",
"# ======================\n",
"embed_token = Embedding(input_dim=VOCAB_TOKEN, output_dim=EMB_TOKEN)(input_token)\n",
"embed_ner = Embedding(input_dim=VOCAB_NER, output_dim=EMB_TAG)(input_ner)\n",
"embed_srl = Embedding(input_dim=VOCAB_SRL, output_dim=EMB_TAG)(input_srl)\n",
"\n",
"# Gabung semua embedding\n",
"merged = Concatenate()([embed_token, embed_ner, embed_srl])\n",
"\n",
"# ======================\n",
"# 5. LSTM\n",
"# ======================\n",
"lstm_out = LSTM(LSTM_UNITS, return_sequences=True)(merged)\n",
"\n",
"# Output: Question\n",
"question_out = TimeDistributed(Dense(VOCAB_QA, activation='softmax'), name=\"question_output\")(lstm_out)\n",
"\n",
"# Output: Answer\n",
"answer_out = TimeDistributed(Dense(VOCAB_QA, activation='softmax'), name=\"answer_output\")(lstm_out)\n",
"\n",
"# Output: Type (klasifikasi)\n",
"type_repr = LSTM(LSTM_UNITS)(merged) # pakai output dari awal sebelum LSTM pertama\n",
"type_out = Dense(NUM_TYPES, activation='softmax', name=\"type_output\")(type_repr)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7cccf561",
"metadata": {},
"outputs": [],
"source": [
"model = Model(inputs=[input_token, input_ner, input_srl],\n",
" outputs=[question_out, answer_out, type_out])\n",
"\n",
"model.compile(\n",
" optimizer='adam',\n",
" loss={\n",
" \"question_output\": \"sparse_categorical_crossentropy\",\n",
" \"answer_output\": \"sparse_categorical_crossentropy\",\n",
" \"type_output\": \"sparse_categorical_crossentropy\",\n",
" },\n",
" metrics={\n",
" \"question_output\": \"accuracy\",\n",
" \"answer_output\": \"accuracy\",\n",
" \"type_output\": \"accuracy\",\n",
" }\n",
")\n",
"\n",
"# ======================\n",
"# 7. Training\n",
"# ======================\n",
"y_question = np.expand_dims(y_question, -1) # untuk sparse categorical loss\n",
"y_answer = np.expand_dims(y_answer, -1)\n",
"\n",
"earlystop = EarlyStopping(patience=4, restore_best_weights=True)\n",
"\n",
"model.fit(\n",
" [X_token, X_ner, X_srl],\n",
" [y_question, y_answer, y_type],\n",
" batch_size=32,\n",
" epochs=30,\n",
" validation_split=0.1,\n",
" callbacks=[earlystop]\n",
")\n",
"\n",
"# ======================\n",
"# 8. Simpan Model\n",
"# ======================\n",
"model.save(\"model_lstm_qg.h5\")\n",
"print(\"✅ Training selesai. Model disimpan.\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "myenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -22,7 +22,7 @@ random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
TRAIN_FILE = "../dataset/dataset_qc.json"
TRAIN_FILE = "QC/dataset_qc.json"
VALID_RATIO = 0.10
MAX_CTX_LEN = 50
MAX_Q_LEN = 30

View File

@ -1,58 +1,150 @@
MAX_CTX_LEN = 50
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
import numpy as np
# -- dummy placeholder untuk model NER/SRL Anda -------------------------------
def predict_ner(tokens):
    """Return a placeholder NER tag sequence for ``tokens``.

    This is a stub standing in for a real NER model; replace it with the
    actual implementation.

    Args:
        tokens: Sequence of context tokens.

    Returns:
        A list containing the tag ``"O"`` once per input token.
    """
    return ["O"] * len(tokens)
def infer_from_input(input_data, maxlen=50):
with open("QC/tokenizers.pkl", "rb") as f:
tokenizers = pickle.load(f)
def predict_srl(tokens):
    """Return a placeholder SRL tag sequence for ``tokens``.

    This is a stub standing in for a real SRL model; replace it with the
    actual implementation.

    Args:
        tokens: Sequence of context tokens.

    Returns:
        A list containing the tag ``"O"`` once per input token.
    """
    return ["O"] * len(tokens)
model = load_model("QC/lstm_qg.keras")
tok_token = tokenizers["token"]
tok_ner = tokenizers["ner"]
tok_srl = tokenizers["srl"]
tok_q = tokenizers["question"]
tok_a = tokenizers["answer"]
tok_type = tokenizers["type"]
# ------------------------------------------------------------------------------
# Prepare input
tokens = input_data["tokens"]
ner = input_data["ner"]
srl = input_data["srl"]
x_tok = pad_sequences(
[tok_token.texts_to_sequences([tokens])[0]], maxlen=maxlen, padding="post"
)
x_ner = pad_sequences(
[tok_ner.texts_to_sequences([ner])[0]], maxlen=maxlen, padding="post"
)
x_srl = pad_sequences(
[tok_srl.texts_to_sequences([srl])[0]], maxlen=maxlen, padding="post"
)
def greedy_decode(context_tokens):
"""Menghasilkan satu pertanyaan (greedy)."""
# 6.1 Tagging
ner_tags = predict_ner(context_tokens)
srl_tags = predict_srl(context_tokens)
# Predict
pred_q, pred_a, pred_type = model.predict([x_tok, x_ner, x_srl])
pred_q_ids = np.argmax(pred_q[0], axis=-1)
pred_a_ids = np.argmax(pred_a[0], axis=-1)
pred_type_id = np.argmax(pred_type[0])
# 6.2 Encode
ctx_ids = encode(context_tokens, w2i_ctx, MAX_CTX_LEN)[None]
ner_ids = encode(ner_tags, t2i_ner, MAX_CTX_LEN)[None]
srl_ids = encode(srl_tags, t2i_srl, MAX_CTX_LEN)[None]
# Decode
index2word_q = {v: k for k, v in tok_q.word_index.items()}
index2word_a = {v: k for k, v in tok_a.word_index.items()}
index2word_q[0] = "<PAD>"
index2word_a[0] = "<PAD>"
dec_seq = [w2i_q["<bos>"]]
for _ in range(MAX_Q_LEN - 1):
dec_pad = dec_seq + [w2i_q["<pad>"]] * (MAX_Q_LEN - len(dec_seq))
pred = model.predict(
[ctx_ids, ner_ids, srl_ids, np.array([dec_pad])], verbose=0
)
next_id = int(pred[0, len(dec_seq) - 1].argmax())
if i2w_q[next_id] == "<eos>":
break
dec_seq.append(next_id)
decoded_q = [index2word_q[i] for i in pred_q_ids if i != 0]
decoded_a = [index2word_a[i] for i in pred_a_ids if i != 0]
tokens_q = [i2w_q[t] for t in dec_seq[1:]]
return " ".join(tokens_q)
index2type = {v - 1: k for k, v in tok_type.word_index.items()}
decoded_type = index2type.get(pred_type_id, "unknown")
return {
"question": " ".join(decoded_q),
"answer": " ".join(decoded_a),
"type": decoded_type,
}
if __name__ == "__main__":
sample = [
"Keberagaman",
"potensi",
"sumber",
"daya",
"alam",
"Indonesia",
"tidak",
"lepas",
"dari",
"proses",
"geografis",
".",
]
print("\n[CTX]", " ".join(sample))
print("[Q] ", greedy_decode(sample))
# Example input
input_data = {
"tokens": [
"Ki",
"Hajar",
"Dewantara",
"lahir",
"pada",
"2",
"Mei",
"1889",
"di",
"Yogyakarta",
".",
],
"ner": [
"B-PER",
"I-PER",
"I-PER",
"O",
"O",
"B-DATE",
"I-DATE",
"I-DATE",
"O",
"B-LOC",
"O",
],
"srl": [
"ARG0",
"ARG0",
"ARG0",
"V",
"O",
"ARGM-TMP",
"ARGM-TMP",
"ARGM-TMP",
"O",
"ARGM-LOC",
"O",
],
}
# input_data = {
# "tokens": [
# "Proklamasi",
# "Kemerdekaan",
# "Indonesia",
# "diproklamasikan",
# "pada",
# "17",
# "Agustus",
# "1945",
# "di",
# "Jakarta",
# ".",
# ],
# "ner": [
# "B-EVENT",
# "I-EVENT",
# "I-EVENT",
# "O",
# "O",
# "B-DATE",
# "I-DATE",
# "I-DATE",
# "O",
# "B-LOC",
# "O",
# ],
# "srl": [
# "ARG1",
# "ARG1",
# "ARG1",
# "V",
# "O",
# "ARGM-TMP",
# "ARGM-TMP",
# "ARGM-TMP",
# "O",
# "ARGM-LOC",
# "O",
# ],
# }
# Predict
result = infer_from_input(input_data)
print(result)

BIN
QC/tokenizers.pkl Normal file

Binary file not shown.

View File

@ -1,136 +0,0 @@
[
{
"tokens": [
"R.",
"Soewardi",
"Soerjaningrat",
"adalah",
"putra",
"GPH",
"Soerjaningrat",
"dan",
"cucu",
"Pakualam",
"III",
"."
],
"ner": [
"B-PER",
"I-PER",
"I-PER",
"O",
"O",
"B-PER",
"I-PER",
"O",
"O",
"B-PER",
"I-PER",
"O"
],
"srl": [
"ARG0",
"ARG0",
"ARG0",
"V",
"ARG1",
"ARG1",
"ARG1",
"ARG1",
"ARG1",
"ARG1",
"ARG1",
"O"
],
"question": "___ adalah putra GPH Soerjaningrat dan cucu Pakualam III.",
"answer": "R. Soewardi Soerjaningrat",
"type": "isian"
},
{
"tokens": ["Ia", "lantas", "diterima", "belajar", "di", "STOVIA", "."],
"ner": ["O", "O", "O", "O", "O", "B-ORG", "O"],
"srl": ["ARG0", "O", "V", "ARG1", "O", "ARGM-LOC", "O"],
"question": "Ia diterima belajar di ___.",
"answer": "STOVIA",
"type": "isian"
},
{
"tokens": [
"Ia",
"bersama",
"Douwes",
"Dekker",
"dan",
"dr.",
"Cipto",
"Mangoenkoesoemo",
"lantas",
"mendirikan",
"Indische",
"Partij",
"pada",
"25",
"Desember",
"1912",
"."
],
"ner": [
"O",
"O",
"B-PER",
"I-PER",
"O",
"B-PER",
"I-PER",
"I-PER",
"O",
"O",
"B-ORG",
"I-ORG",
"O",
"B-DATE",
"I-DATE",
"I-DATE",
"O"
],
"srl": [
"ARG0",
"ARG0",
"ARG0",
"ARG0",
"ARG0",
"ARG0",
"ARG0",
"ARG0",
"O",
"V",
"ARG1",
"ARG1",
"O",
"ARGM-TMP",
"ARGM-TMP",
"ARGM-TMP",
"O"
],
"question": "Ia bersama Douwes Dekker dan dr. Cipto Mangoenkoesoemo lantas mendirikan ___ pada 25 Desember 1912.",
"answer": "Indische Partij",
"type": "isian"
},
{
"tokens": [
"Indische",
"Partij",
"didirikan",
"pada",
"25",
"Desember",
"1912",
"."
],
"ner": ["B-ORG", "I-ORG", "O", "O", "B-DATE", "I-DATE", "I-DATE", "O"],
"srl": ["ARG1", "ARG1", "V", "O", "ARGM-TMP", "ARGM-TMP", "ARGM-TMP", "O"],
"question": "Indische Partij didirikan pada tanggal ___.",
"answer": "25 Desember 1912",
"type": "isian"
}
]

View File

@ -2004,6 +2004,12 @@ Agustus I-DATE ARGM-TMP
1945 I-DATE ARGM-TMP
. O O
ibu B-PER ARG0
memasak O V
nasi O ARG1
di O O
dapur B-LOC ARGM-LOC
. O O
ibu B-PER ARG0
memasak O V
nasi O ARG1
@ -2011,7 +2017,7 @@ di O O
dapur B-LOC ARGM-LOC
. O O
R. B=PER ARG0
R. B-PER ARG0
Soewardi I-PER ARG0
Soerjaningrat I-PER ARG0
adalah O V
@ -2022,7 +2028,7 @@ dan O ARG1
cucu O ARG1
Pakualam B-PER ARG1
III I-PER ARG1
.
. O O
Ia O ARG0
bersama O ARG0
@ -2034,19 +2040,19 @@ Cipto I-PER ARG0
Mangoenkoesoemo I-PER ARG0
lantas O O
mendirikan O V
Indische B-ORG ARG1
Indische B-ORG ARG1
Partij I-ORG ARG1
pada O O
pada O O
25 B-DATE ARGM-TMP
Desember I-DATE ARGM-TMP
Desember I-DATE ARGM-TMP
1912 I-DATE ARGM-TMP
. O O
. O O
Indische B-ORG ARG1
Indische B-ORG ARG1
Partij I-ORG ARG1
didirikan O V
pada O O
didirikan O V
pada O O
25 B-DATE ARGM-TMP
Desember I-DATE ARGM-TMP
Desember I-DATE ARGM-TMP
1912 I-DATE ARGM-TMP
. O O
. O O

Can't render this file because it has a wrong number of fields in line 2025.