TIF_E41211115_lstm-quiz-gen.../QC/convert_dts.py

55 lines
2.0 KiB
Python

import json
import csv
from pathlib import Path
def json_to_tsv(json_path: str | Path, tsv_path: str | Path) -> None:
"""
Konversi data JSON (field: tokens, ner, srl, …) → TSV token\tNER\tSRL.
Kalimat duplikat (urutan tokens persis sama) otomatis dilewati.
"""
# ---------------------------------------------------------------------
# 1. Baca semua record dari JSON
# ---------------------------------------------------------------------
with open(json_path, encoding="utf-8") as f:
records = json.load(f)
# ---------------------------------------------------------------------
# 2. Tulis ke TSV, sambil mendeteksi duplikat
# ---------------------------------------------------------------------
seen_sentences: set[tuple[str, ...]] = set() # simpan tuple tokens unik
with open(tsv_path, "w", encoding="utf-8", newline="") as f_out:
writer = csv.writer(f_out, delimiter="\t", lineterminator="\n")
for rec in records:
tokens = rec["tokens"]
ner_tags = rec["ner"]
srl_tags = rec["srl"]
# -- cek panjang sama
if not (len(tokens) == len(ner_tags) == len(srl_tags)):
raise ValueError("tokens, ner, dan srl harus punya panjang sama")
# -- cek duplikat kalimat
key = tuple(tokens) # tuple hash-able
if key in seen_sentences: # sudah pernah ditulis → skip
continue
seen_sentences.add(key)
# -- tulis baris token, NER, SRL
for tok, ner, srl in zip(tokens, ner_tags, srl_tags):
writer.writerow([tok, ner, srl])
# -- baris kosong pemisah antar-kalimat
writer.writerow([])
print(f"✔️ TSV selesai, simpan di: {tsv_path}")
# ---------------------------------------------------------------------------
# USAGE EXAMPLE
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Script entry point: convert the combined dataset with fixed paths,
    # which are relative to the current working directory.
    json_to_tsv(
        json_path="QC/dataset_combination.json",
        tsv_path="QC/output.tsv",
    )