"""Convert a QG/QA training dataset: normalize questions, fix NER tags,
ensure id/context fields exist, and write the result to data_converted.json."""

import json
import re
from collections import OrderedDict


def normalize_question(text):
    """Remove spaces before punctuation and uppercase the first character.

    Unlike str.capitalize(), this does NOT lowercase the rest of the
    string, so acronyms and proper nouns inside the question survive.
    """
    text = re.sub(r'\s+([?.!,])', r'\1', text)
    # Guard against the empty string before indexing.
    return text[0].upper() + text[1:] if text else text


def process_entry(entry, idx_entry):
    """Normalize one dataset entry in place and return it reordered.

    Returns None for non-dict entries so the caller can skip them.
    """
    if not isinstance(entry, dict):
        return None

    # Derive context from the token list when it is missing.
    if "context" not in entry:
        entry["context"] = " ".join(entry.get("tokens", []))

    # Update NER tags: replace 'V' with 'O'.
    ner_tags = entry.get("ner", [])
    entry["ner"] = ["O" if tag == "V" else tag for tag in ner_tags]

    for idx_qa, qa in enumerate(entry.get("qas", [])):
        # Assign a stable, per-entry unique id when one is missing.
        if "id" not in qa:
            qa["id"] = f"qa_{idx_entry}_q{idx_qa + 1}"

        # Token lists become plain space-joined strings.
        answer = qa.get("answer")
        if isinstance(answer, list):
            qa["answer"] = " ".join(answer)

        question = qa.get("question")
        if isinstance(question, list):
            qa["question"] = normalize_question(" ".join(question))

    # Reorder fields: context first, then the rest in their original order.
    ordered_entry = OrderedDict()
    if "context" in entry:
        ordered_entry["context"] = entry.pop("context")
    for key, value in entry.items():
        ordered_entry[key] = value
    return ordered_entry


def main():
    """Load the raw dataset, convert every entry, and save the result."""
    # NOTE(review): the doubled ".json.json" suffix looks odd but is kept
    # verbatim — presumably it matches the actual file on disk; verify.
    with open('../dataset/stable_qg_qa_train_dataset.json.json', 'r', encoding='utf-8') as file:
        data = json.load(file)

    processed_data = []
    for idx_entry, entry in enumerate(data):
        ordered_entry = process_entry(entry, idx_entry)
        if ordered_entry is not None:
            processed_data.append(ordered_entry)

    # Save result
    with open('data_converted.json', 'w', encoding='utf-8') as file:
        json.dump(processed_data, file, indent=2, ensure_ascii=False)

    # Optional: Print first 2 entries for quick verification
    print(json.dumps(processed_data[:2], indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()