# Preprocessing script for the QG/QA training dataset:
# normalizes questions, fixes NER tags, and reorders entry fields.
import json
|
|
import re
|
|
from collections import OrderedDict
|
|
|
|
def normalize_question(text):
    """Normalize a question string for output.

    Removes the whitespace that tokenized text leaves before punctuation
    (e.g. ``"apa itu ?"`` -> ``"apa itu?"``) and uppercases the first
    character.

    Args:
        text: The question text to normalize.

    Returns:
        The normalized question string.
    """
    # Collapse "word ?" / "word ," into "word?" / "word,".
    text = re.sub(r'\s+([?.!,])', r'\1', text)
    # Uppercase only the first character. The original used
    # str.capitalize(), which also lowercases the rest of the string and
    # destroys proper nouns and acronyms (e.g. "NLP" -> "nlp") — bug fix.
    return text[:1].upper() + text[1:]
|
# Read and parse the raw training dataset from disk.
with open('../dataset/stable_qg_qa_train_dataset.json.json', encoding='utf-8') as src:
    data = json.loads(src.read())
def _process_entry(idx_entry, entry):
    """Normalize one dataset entry and return it with fields reordered.

    Returns None for non-dict entries (they are skipped by the caller),
    otherwise an OrderedDict with "context" first and the remaining keys
    in their original order.

    NOTE: mutates `entry` in place (context/ner/qas fields), matching the
    original script's behavior.
    """
    if not isinstance(entry, dict):
        return None

    # Derive a context string from the tokens when it is missing.
    if "context" not in entry:
        entry["context"] = " ".join(entry.get("tokens", []))

    # Update NER tags: replace 'V' with 'O'.
    # NOTE(review): entries without a "ner" key gain an empty list here —
    # presumably intentional; confirm against downstream consumers.
    entry["ner"] = ["O" if tag == "V" else tag for tag in entry.get("ner", [])]

    for idx_qa, qa in enumerate(entry.get("qas", [])):
        # Assign a deterministic id when the QA pair has none.
        if "id" not in qa:
            qa["id"] = f"qa_{idx_entry}_q{idx_qa + 1}"

        # Token-list answers are joined into a plain string.
        answer = qa.get("answer")
        if isinstance(answer, list):
            qa["answer"] = " ".join(answer)

        # Token-list questions are joined, then normalized.
        question = qa.get("question")
        if isinstance(question, list):
            qa["question"] = normalize_question(" ".join(question))

    # Reorder fields: "context" first, then the rest in their original
    # order. (The original comment said "tokens first", but the code has
    # always put "context" first — comment corrected to match behavior.)
    ordered_entry = OrderedDict()
    if "context" in entry:
        ordered_entry["context"] = entry.pop("context")
    ordered_entry.update(entry)
    return ordered_entry


processed_data = []
for idx_entry, entry in enumerate(data):
    ordered = _process_entry(idx_entry, entry)
    if ordered is not None:
        processed_data.append(ordered)
# Persist the processed entries as pretty-printed UTF-8 JSON.
with open('data_converted.json', 'w', encoding='utf-8') as out:
    json.dump(processed_data, out, indent=2, ensure_ascii=False)

# Echo the first two entries for a quick manual sanity check.
print(json.dumps(processed_data[:2], indent=2, ensure_ascii=False))