122 lines
4.6 KiB
Python
122 lines
4.6 KiB
Python
import json
|
|
import csv
|
|
from pathlib import Path
|
|
import json
|
|
import csv
|
|
from pathlib import Path
|
|
|
|
# Accepted NER labels (adjust as needed): "O" plus a B-/I- prefixed pair for
# every entity type listed below.
VALID_NER_LABELS = {"O"} | {
    f"{prefix}-{entity}"
    for entity in ("LOC", "PER", "ORG", "DATE", "TIME", "EVENT")
    for prefix in ("B", "I")
}
|
|
|
|
def json_to_tsv(json_path: str | Path, tsv_path: str | Path) -> None:
|
|
"""
|
|
Konversi data JSON (field: tokens, ner, srl, …) → TSV token\tNER\tSRL.
|
|
Kalimat duplikat (urutan tokens persis sama) otomatis dilewati.
|
|
Jika ada record yang tokens, ner, dan srl tidak sama panjang, atau ada label NER tidak valid, akan diberi info error lengkap.
|
|
"""
|
|
with open(json_path, encoding="utf-8") as f:
|
|
records = json.load(f)
|
|
|
|
seen_sentences: set[tuple[str, ...]] = set()
|
|
|
|
with open(tsv_path, "w", encoding="utf-8", newline="") as f_out:
|
|
writer = csv.writer(f_out, delimiter="\t", lineterminator="\n")
|
|
|
|
for idx, rec in enumerate(records):
|
|
tokens = rec.get("tokens")
|
|
ner_tags = rec.get("ner")
|
|
srl_tags = rec.get("srl")
|
|
|
|
if not (len(tokens) == len(ner_tags) == len(srl_tags)):
|
|
raise ValueError(
|
|
f"❌ Panjang tidak sama di record index {idx}:\n"
|
|
f" tokens ({len(tokens)}): {tokens}\n"
|
|
f" ner ({len(ner_tags)}): {ner_tags}\n"
|
|
f" srl ({len(srl_tags)}): {srl_tags}\n"
|
|
)
|
|
|
|
# Validasi label NER
|
|
for i, ner_label in enumerate(ner_tags):
|
|
if ner_label not in VALID_NER_LABELS:
|
|
raise ValueError(
|
|
f"❌ Label NER tidak valid di record index {idx}, token ke-{i} ('{tokens[i]}'):\n"
|
|
f" ner_label: {ner_label}\n"
|
|
f" value: {tokens}"
|
|
)
|
|
|
|
key = tuple(tokens)
|
|
if key in seen_sentences:
|
|
continue
|
|
seen_sentences.add(key)
|
|
|
|
for tok, ner, srl in zip(tokens, ner_tags, srl_tags):
|
|
writer.writerow([tok, ner, srl])
|
|
writer.writerow([])
|
|
|
|
print(f"✔️ TSV selesai, simpan di: {tsv_path}")
|
|
|
|
|
|
# def json_to_tsv(json_path: str | Path, tsv_path: str | Path) -> None:
|
|
# """
|
|
# Konversi data JSON (field: tokens, ner, srl, …) → TSV token\tNER\tSRL.
|
|
# Kalimat duplikat (urutan tokens persis sama) otomatis dilewati.
|
|
# Jika ada record yang tokens, ner, dan srl tidak sama panjang, akan diberi info error lengkap.
|
|
# """
|
|
# # ---------------------------------------------------------------------
|
|
# # 1. Baca semua record dari JSON
|
|
# # ---------------------------------------------------------------------
|
|
# with open(json_path, encoding="utf-8") as f:
|
|
# records = json.load(f)
|
|
|
|
# # ---------------------------------------------------------------------
|
|
# # 2. Tulis ke TSV, sambil mendeteksi duplikat
|
|
# # ---------------------------------------------------------------------
|
|
# seen_sentences: set[tuple[str, ...]] = set() # simpan tuple tokens unik
|
|
|
|
# with open(tsv_path, "w", encoding="utf-8", newline="") as f_out:
|
|
# writer = csv.writer(f_out, delimiter="\t", lineterminator="\n")
|
|
|
|
# for idx, rec in enumerate(records):
|
|
# tokens = rec.get("tokens")
|
|
# ner_tags = rec.get("ner")
|
|
# srl_tags = rec.get("srl")
|
|
|
|
# # -- cek panjang sama
|
|
# if not (len(tokens) == len(ner_tags) == len(srl_tags)):
|
|
# raise ValueError(
|
|
# f"❌ Panjang tidak sama di record index {idx}:\n"
|
|
# f" tokens ({len(tokens)}): {tokens}\n"
|
|
# f" ner ({len(ner_tags)}): {ner_tags}\n"
|
|
# f" srl ({len(srl_tags)}): {srl_tags}\n"
|
|
# )
|
|
|
|
# # -- cek duplikat kalimat
|
|
# key = tuple(tokens) # tuple hash-able
|
|
# if key in seen_sentences: # sudah pernah ditulis → skip
|
|
# continue
|
|
# seen_sentences.add(key)
|
|
|
|
# # -- tulis baris token, NER, SRL
|
|
# for tok, ner, srl in zip(tokens, ner_tags, srl_tags):
|
|
# writer.writerow([tok, ner, srl])
|
|
|
|
# # -- baris kosong pemisah antar-kalimat
|
|
# writer.writerow([])
|
|
|
|
# print(f"✔️ TSV selesai, simpan di: {tsv_path}")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# USAGE EXAMPLE
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Convert the normalized QC dataset into its TSV counterpart.
    source_json = "QC/normalize_dataset.json"
    target_tsv = "QC/new_LNS.tsv"
    json_to_tsv(source_json, target_tsv)
|