74 lines
2.3 KiB
Python
74 lines
2.3 KiB
Python
import json
|
|
import re
|
|
|
|
|
|
def tokenize(text):
    """Split text into word and punctuation tokens.

    Each of the punctuation marks ``.``, ``,``, ``!`` and ``?`` becomes a
    token of its own, whether or not it is surrounded by whitespace. All
    remaining text is split on runs of whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings; empty when ``text`` is empty or contains
        only whitespace.
    """
    # Pad punctuation on BOTH sides so it is detached from the word that
    # follows it as well ("a,b" -> "a , b"), not only from the one before.
    # split() without arguments collapses the extra whitespace this adds.
    return re.sub(r"([.,!?])", r" \1 ", text).split()
|
|
|
|
|
|
def normalize_entry(entry, idx):
    """Validate one dataset entry and return it in token-based form.

    Args:
        entry: Mapping that must contain the keys ``tokens``, ``ner``,
            ``srl``, ``question``, ``answer`` and ``type``.
        idx: Position of the entry in the dataset; used only to label
            error messages.

    Returns:
        A new dict with exactly the six required keys. ``question`` and
        ``answer`` are passed through ``tokenize`` when they are strings
        and kept as-is when they are already lists. The other values are
        the same objects as in ``entry`` (not copies).

    Raises:
        TypeError: If ``entry`` has no ``keys()`` method, or a field has
            the wrong type.
        ValueError: If one or more required keys are missing.
    """
    # (key, accepted type(s), wording used in the error message).
    # The order here is both the validation order and the output key order.
    field_specs = (
        ("tokens", list, "list"),
        ("ner", list, "list"),
        ("srl", list, "list"),
        ("question", (str, list), "string atau list"),
        ("answer", (str, list), "string atau list"),
        ("type", str, "string"),
    )

    # A JSON array may hold non-object items (lists, strings, null); report
    # those with the entry index instead of a bare AttributeError.
    try:
        present_keys = entry.keys()
    except AttributeError:
        raise TypeError(
            f"index {idx}: entri harus dict, bukan {type(entry).__name__}"
        ) from None

    # Sorted so the message is identical from run to run (set order is not).
    missing_keys = sorted(
        key for key, _, _ in field_specs if key not in present_keys
    )
    if missing_keys:
        raise ValueError(f"index {idx}: missing keys {missing_keys}")

    # Type validation, one rule per required field.
    for key, accepted, wording in field_specs:
        if not isinstance(entry[key], accepted):
            raise TypeError(f"index {idx}: '{key}' harus {wording}")

    normalized = {key: entry[key] for key, _, _ in field_specs}

    # Only free-text fields need tokenizing; lists are taken as given.
    for key in ("question", "answer"):
        if isinstance(normalized[key], str):
            normalized[key] = tokenize(normalized[key])

    return normalized
|
|
|
|
|
|
# Load the original dataset.
with open("QC/dataset_qc_tokenized.json", encoding="utf-8") as f:
    raw_data = json.load(f)

# Normalize every entry. An entry that fails validation is reported and
# left out of the output rather than aborting the whole run.
normalized_data = []
for idx, entry in enumerate(raw_data):
    try:
        normalized = normalize_entry(entry, idx)
    except Exception as e:
        print(f"❌ Error pada index {idx}: {e}")
    else:
        normalized_data.append(normalized)

# Write the entries that passed validation to a new JSON file.
with open("QC/normalized_dataset.json", "w", encoding="utf-8") as f:
    json.dump(normalized_data, f, ensure_ascii=False, indent=2)

# Report how many entries were written.
print(
    f"\n✅ Berhasil disimpan: {len(normalized_data)} entri ke 'QC/normalized_dataset.json'"
)
|