# TIF_E41211115_lstm-quiz-gen.../old/QC/convert_dts.py
# (122 lines, 4.6 KiB, Python)

import json
import csv
from pathlib import Path
import json
import csv
from pathlib import Path
# Set of accepted NER labels (can be adjusted): the outside tag "O" plus a
# "B-" and an "I-" tag for every entity type listed below.
VALID_NER_LABELS = {"O"}.union(
    f"{prefix}-{entity}"
    for entity in ("LOC", "PER", "ORG", "DATE", "TIME", "EVENT")
    for prefix in ("B", "I")
)
def json_to_tsv(json_path: str | Path, tsv_path: str | Path) -> None:
"""
Konversi data JSON (field: tokens, ner, srl, …) → TSV token\tNER\tSRL.
Kalimat duplikat (urutan tokens persis sama) otomatis dilewati.
Jika ada record yang tokens, ner, dan srl tidak sama panjang, atau ada label NER tidak valid, akan diberi info error lengkap.
"""
with open(json_path, encoding="utf-8") as f:
records = json.load(f)
seen_sentences: set[tuple[str, ...]] = set()
with open(tsv_path, "w", encoding="utf-8", newline="") as f_out:
writer = csv.writer(f_out, delimiter="\t", lineterminator="\n")
for idx, rec in enumerate(records):
tokens = rec.get("tokens")
ner_tags = rec.get("ner")
srl_tags = rec.get("srl")
if not (len(tokens) == len(ner_tags) == len(srl_tags)):
raise ValueError(
f"❌ Panjang tidak sama di record index {idx}:\n"
f" tokens ({len(tokens)}): {tokens}\n"
f" ner ({len(ner_tags)}): {ner_tags}\n"
f" srl ({len(srl_tags)}): {srl_tags}\n"
)
# Validasi label NER
for i, ner_label in enumerate(ner_tags):
if ner_label not in VALID_NER_LABELS:
raise ValueError(
f"❌ Label NER tidak valid di record index {idx}, token ke-{i} ('{tokens[i]}'):\n"
f" ner_label: {ner_label}\n"
f" value: {tokens}"
)
key = tuple(tokens)
if key in seen_sentences:
continue
seen_sentences.add(key)
for tok, ner, srl in zip(tokens, ner_tags, srl_tags):
writer.writerow([tok, ner, srl])
writer.writerow([])
print(f"✔️ TSV selesai, simpan di: {tsv_path}")
# def json_to_tsv(json_path: str | Path, tsv_path: str | Path) -> None:
# """
# Konversi data JSON (field: tokens, ner, srl, …) → TSV token\tNER\tSRL.
# Kalimat duplikat (urutan tokens persis sama) otomatis dilewati.
# Jika ada record yang tokens, ner, dan srl tidak sama panjang, akan diberi info error lengkap.
# """
# # ---------------------------------------------------------------------
# # 1. Baca semua record dari JSON
# # ---------------------------------------------------------------------
# with open(json_path, encoding="utf-8") as f:
# records = json.load(f)
# # ---------------------------------------------------------------------
# # 2. Tulis ke TSV, sambil mendeteksi duplikat
# # ---------------------------------------------------------------------
# seen_sentences: set[tuple[str, ...]] = set() # simpan tuple tokens unik
# with open(tsv_path, "w", encoding="utf-8", newline="") as f_out:
# writer = csv.writer(f_out, delimiter="\t", lineterminator="\n")
# for idx, rec in enumerate(records):
# tokens = rec.get("tokens")
# ner_tags = rec.get("ner")
# srl_tags = rec.get("srl")
# # -- cek panjang sama
# if not (len(tokens) == len(ner_tags) == len(srl_tags)):
# raise ValueError(
# f"❌ Panjang tidak sama di record index {idx}:\n"
# f" tokens ({len(tokens)}): {tokens}\n"
# f" ner ({len(ner_tags)}): {ner_tags}\n"
# f" srl ({len(srl_tags)}): {srl_tags}\n"
# )
# # -- cek duplikat kalimat
# key = tuple(tokens) # tuple hash-able
# if key in seen_sentences: # sudah pernah ditulis → skip
# continue
# seen_sentences.add(key)
# # -- tulis baris token, NER, SRL
# for tok, ner, srl in zip(tokens, ner_tags, srl_tags):
# writer.writerow([tok, ner, srl])
# # -- baris kosong pemisah antar-kalimat
# writer.writerow([])
# print(f"✔️ TSV selesai, simpan di: {tsv_path}")
# ---------------------------------------------------------------------------
# USAGE EXAMPLE
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Runs the conversion only when this file is executed as a script.
    json_to_tsv(
        json_path="QC/normalize_dataset.json",
        tsv_path="QC/new_LNS.tsv",
    )