import json import csv from pathlib import Path def json_to_tsv(json_path: str | Path, tsv_path: str | Path) -> None: """ Konversi data JSON (field: tokens, ner, srl, …) → TSV token\tNER\tSRL. Kalimat duplikat (urutan tokens persis sama) otomatis dilewati. Jika ada record yang tokens, ner, dan srl tidak sama panjang, akan diberi info error lengkap. """ # --------------------------------------------------------------------- # 1. Baca semua record dari JSON # --------------------------------------------------------------------- with open(json_path, encoding="utf-8") as f: records = json.load(f) # --------------------------------------------------------------------- # 2. Tulis ke TSV, sambil mendeteksi duplikat # --------------------------------------------------------------------- seen_sentences: set[tuple[str, ...]] = set() # simpan tuple tokens unik with open(tsv_path, "w", encoding="utf-8", newline="") as f_out: writer = csv.writer(f_out, delimiter="\t", lineterminator="\n") for idx, rec in enumerate(records): tokens = rec.get("tokens") ner_tags = rec.get("ner") srl_tags = rec.get("srl") # -- cek panjang sama if not (len(tokens) == len(ner_tags) == len(srl_tags)): raise ValueError( f"❌ Panjang tidak sama di record index {idx}:\n" f" tokens ({len(tokens)}): {tokens}\n" f" ner ({len(ner_tags)}): {ner_tags}\n" f" srl ({len(srl_tags)}): {srl_tags}\n" ) # -- cek duplikat kalimat key = tuple(tokens) # tuple hash-able if key in seen_sentences: # sudah pernah ditulis → skip continue seen_sentences.add(key) # -- tulis baris token, NER, SRL for tok, ner, srl in zip(tokens, ner_tags, srl_tags): writer.writerow([tok, ner, srl]) # -- baris kosong pemisah antar-kalimat writer.writerow([]) print(f"✔️ TSV selesai, simpan di: {tsv_path}") # --------------------------------------------------------------------------- # CONTOH PEMAKAIAN # --------------------------------------------------------------------------- if __name__ == "__main__": json_to_tsv("QC/normalized_dataset.json", "QC/new_LNS.tsv")