import json
import re


def tokenize(text):
    """Simple tokenization based on whitespace and punctuation."""
    # Insert a space before sentence punctuation so split() separates it
    text = re.sub(r"([.,!?])", r" \1", text)
    return text.strip().split()


def normalize_entry(entry, idx):
    """Normalize one entry to a token-based format, with error details if any."""
    required_keys = {"tokens", "ner", "srl", "question", "answer", "type"}
    missing_keys = required_keys - entry.keys()
    if missing_keys:
        raise ValueError(f"index {idx}: missing keys {missing_keys}")

    # Validate data types
    if not isinstance(entry["tokens"], list):
        raise TypeError(f"index {idx}: 'tokens' must be a list")
    if not isinstance(entry["ner"], list):
        raise TypeError(f"index {idx}: 'ner' must be a list")
    if not isinstance(entry["srl"], list):
        raise TypeError(f"index {idx}: 'srl' must be a list")
    if not isinstance(entry["question"], (str, list)):
        raise TypeError(f"index {idx}: 'question' must be a string or a list")
    if not isinstance(entry["answer"], (str, list)):
        raise TypeError(f"index {idx}: 'answer' must be a string or a list")
    if not isinstance(entry["type"], str):
        raise TypeError(f"index {idx}: 'type' must be a string")

    # Tokenize string fields; lists are assumed to be pre-tokenized
    question = (
        tokenize(entry["question"])
        if isinstance(entry["question"], str)
        else entry["question"]
    )
    answer = (
        tokenize(entry["answer"])
        if isinstance(entry["answer"], str)
        else entry["answer"]
    )

    return {
        "tokens": entry["tokens"],
        "ner": entry["ner"],
        "srl": entry["srl"],
        "question": question,
        "answer": answer,
        "type": entry["type"],
    }


# Load the original data
with open("QC/dataset_qc_tokenized.json", encoding="utf-8") as f:
    raw_data = json.load(f)

# Normalize all entries; report failures without aborting the run
normalized_data = []
for idx, entry in enumerate(raw_data):
    try:
        normalized = normalize_entry(entry, idx)
        normalized_data.append(normalized)
    except Exception as e:
        print(f"āŒ Error at index {idx}: {e}")

# Save to a new JSON file
with open("QC/normalized_dataset.json", "w", encoding="utf-8") as f:
    json.dump(normalized_data, f, ensure_ascii=False, indent=2)

print(
    f"\nāœ… Successfully saved {len(normalized_data)} entries to 'QC/normalized_dataset.json'"
)
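
# A minimal sanity check of the normalization step. The entry below is a
# hypothetical example, not taken from the dataset: string fields get
# tokenized (punctuation split off), while list fields pass through unchanged.
_example = {
    "tokens": ["Soekarno", "adalah", "presiden", "pertama", "."],
    "ner": ["B-PER", "O", "O", "O", "O"],
    "srl": ["ARG0", "O", "O", "O", "O"],
    "question": "Siapa presiden pertama Indonesia?",
    "answer": "Soekarno",
    "type": "who",
}
assert normalize_entry(_example, 0)["question"] == [
    "Siapa", "presiden", "pertama", "Indonesia", "?"
]
assert normalize_entry(_example, 0)["answer"] == ["Soekarno"]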