# TIF_E41211115_lstm-quiz-gen.../question_generation/cnv_dts.py
#
# 55 lines
# 1.6 KiB
# Python

import json
import re
from collections import OrderedDict
def normalize_question(text):
    """Tidy up a space-joined question string.

    Any run of whitespace sitting directly in front of ``?``, ``.``, ``!``
    or ``,`` is dropped so the punctuation attaches to the preceding word.
    The result is then passed through ``str.capitalize``, which upper-cases
    the first character and lower-cases every remaining character.

    Args:
        text: The question as a single string.

    Returns:
        The cleaned, capitalized question string.
    """
    space_before_punct = re.compile(r'\s+([?.!,])')
    tightened = space_before_punct.sub(r'\1', text)
    return tightened.capitalize()
def _convert_qa(qa, idx_entry, idx_qa):
    """Normalize a single question/answer record in place.

    Args:
        qa: Mutable dict describing one QA pair.
        idx_entry: Zero-based position of the parent entry in the loaded data.
        idx_qa: Zero-based position of this QA pair inside its entry.
    """
    # Only synthesize an identifier when the record does not carry one.
    if "id" not in qa:
        qa["id"] = f"qa_{idx_entry}_q{idx_qa + 1}"

    # List-valued fields are collapsed into one space-joined string;
    # values of any other type are left exactly as they are.
    answer = qa.get("answer")
    if isinstance(answer, list):
        qa["answer"] = " ".join(answer)

    question = qa.get("question")
    if isinstance(question, list):
        qa["question"] = normalize_question(" ".join(question))


def _convert_entry(entry, idx_entry):
    """Normalize one dataset entry and return it with ``context`` as first key.

    The entry is modified in place (``ner`` is rewritten, QA records are
    normalized, ``context`` is moved out), and a new ``OrderedDict`` holding
    ``context`` followed by the remaining keys in their existing order is
    returned.

    Args:
        entry: Mutable dict for one dataset entry.
        idx_entry: Zero-based position of the entry in the loaded data; used
            when generating QA identifiers.

    Returns:
        An ``OrderedDict`` with ``context`` first, then every other field.
    """
    # Derive the context from the token list when it is not already present.
    if "context" not in entry:
        entry["context"] = " ".join(entry.get("tokens", []))

    # Update NER tags: change 'V' to 'O'.
    entry["ner"] = ["O" if tag == "V" else tag for tag in entry.get("ner", [])]

    # ``or []`` also covers an explicit ``"qas": null`` in the JSON, which
    # ``.get("qas", [])`` alone would hand to enumerate() as None.
    for idx_qa, qa in enumerate(entry.get("qas") or []):
        # Malformed (non-dict) QA items are left untouched instead of
        # raising, mirroring how non-dict entries are skipped.
        if not isinstance(qa, dict):
            continue
        _convert_qa(qa, idx_entry, idx_qa)

    # Reorder fields: context first, then the rest in their original order.
    # "context" is guaranteed to exist at this point (see above).
    ordered_entry = OrderedDict()
    ordered_entry["context"] = entry.pop("context")
    ordered_entry.update(entry)
    return ordered_entry


# Load data
with open('../dataset/dev_dataset_qg.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Items that are not dicts are dropped. enumerate() runs over the raw data so
# generated QA ids keep using each entry's original position.
processed_data = [
    _convert_entry(entry, idx_entry)
    for idx_entry, entry in enumerate(data)
    if isinstance(entry, dict)
]

# Save result
with open('data_converted.json', 'w', encoding='utf-8') as file:
    json.dump(processed_data, file, indent=2, ensure_ascii=False)

# Optional: print the first 2 entries for quick verification
print(json.dumps(processed_data[:2], indent=2, ensure_ascii=False))