"""Convert a QG/QA training dataset: normalize questions, fix NER tags,
ensure id/context fields exist, and write the result to data_converted.json."""

import json
import re
from collections import OrderedDict


def normalize_question(text):
    """Remove spaces before punctuation and uppercase the first character.

    Unlike str.capitalize(), this does NOT lowercase the rest of the
    string, so acronyms and proper nouns inside the question survive.
    """
    text = re.sub(r'\s+([?.!,])', r'\1', text)
    # Guard against the empty string before indexing.
    return text[0].upper() + text[1:] if text else text


def process_entry(entry, idx_entry):
    """Normalize one dataset entry in place and return it reordered.

    Returns None for non-dict entries so the caller can skip them.
    """
    if not isinstance(entry, dict):
        return None

    # Derive context from the token list when it is missing.
    if "context" not in entry:
        entry["context"] = " ".join(entry.get("tokens", []))

    # Update NER tags: replace 'V' with 'O'.
    ner_tags = entry.get("ner", [])
    entry["ner"] = ["O" if tag == "V" else tag for tag in ner_tags]

    for idx_qa, qa in enumerate(entry.get("qas", [])):
        # Assign a stable, per-entry unique id when one is missing.
        if "id" not in qa:
            qa["id"] = f"qa_{idx_entry}_q{idx_qa + 1}"

        # Token lists become plain space-joined strings.
        answer = qa.get("answer")
        if isinstance(answer, list):
            qa["answer"] = " ".join(answer)

        question = qa.get("question")
        if isinstance(question, list):
            qa["question"] = normalize_question(" ".join(question))

    # Reorder fields: context first, then the rest in their original order.
    ordered_entry = OrderedDict()
    if "context" in entry:
        ordered_entry["context"] = entry.pop("context")
    for key, value in entry.items():
        ordered_entry[key] = value
    return ordered_entry


def main():
    """Load the raw dataset, convert every entry, and save the result."""
    # NOTE(review): the doubled ".json.json" suffix looks odd but is kept
    # verbatim — presumably it matches the actual file on disk; verify.
    with open('../dataset/stable_qg_qa_train_dataset.json.json', 'r', encoding='utf-8') as file:
        data = json.load(file)

    processed_data = []
    for idx_entry, entry in enumerate(data):
        ordered_entry = process_entry(entry, idx_entry)
        if ordered_entry is not None:
            processed_data.append(ordered_entry)

    # Save result
    with open('data_converted.json', 'w', encoding='utf-8') as file:
        json.dump(processed_data, file, indent=2, ensure_ascii=False)

    # Optional: Print first 2 entries for quick verification
    print(json.dumps(processed_data[:2], indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()