100 lines
3.5 KiB
Python
100 lines
3.5 KiB
Python
import pandas as pd
|
|
import os
|
|
|
|
def _allocate_counts(total, weights):
    """Split ``total`` into integer quotas proportional to ``weights``.

    Uses the largest-remainder method so the quotas add up to ``total``.
    Rounding each class independently can over- or undershoot (a total of
    3 with weights 0.5 / 0.307 / 0.193 would round to 2 + 1 + 1 = 4).
    """
    exact = {label: total * weight for label, weight in weights.items()}
    quotas = {label: int(value) for label, value in exact.items()}
    shortfall = int(round(sum(exact.values()))) - sum(quotas.values())
    # Hand the leftover units to the classes with the largest fractional
    # part; sorted() is stable, so ties keep the declaration order.
    ranked = sorted(
        exact, key=lambda label: exact[label] - quotas[label], reverse=True
    )
    for label in ranked[:max(shortfall, 0)]:
        quotas[label] += 1
    return quotas


def generate_final_grouped_sample(input_file, output_file, total_sample=356):
    """Draw a stratified sample by sentiment and export it for manual labelling.

    Reads ``input_file`` as CSV, samples rows per value of the ``Sentiment``
    column according to fixed class proportions, prints a summary table,
    adds an empty ``Manual_Label`` column and writes the result to
    ``output_file`` as a formatted Excel sheet (xlsxwriter engine).

    Args:
        input_file: Path to the source CSV. It must contain a ``Sentiment``
            column holding 'positif' / 'negatif' / 'netral'.
        output_file: Path of the Excel workbook to create.
        total_sample: Desired total number of sampled rows.

    Returns:
        None. If ``input_file`` does not exist, an error is printed and the
        function returns without doing anything else.

    Notes:
        * A class with fewer rows than its quota contributes all of its
          rows, so the final total can be lower than ``total_sample``.
        * If writing the Excel file fails for any reason, the sample is
          saved to ``backup_validasi.csv`` in the working directory instead.
    """
    if not os.path.exists(input_file):
        print(f"Error: File {input_file} tidak ditemukan!")
        return

    # 1. Load the dataset
    df = pd.read_csv(input_file)

    # 2. Class proportions (the original comment says they follow the
    #    distribution of the original data)
    proportions = {
        'positif': 0.50,
        'negatif': 0.307,
        'netral': 0.193,
    }

    print("=== Rincian Rencana Sampling (Stratified) ===")
    print(f"Total Populasi: {len(df)} data")
    print(f"Target Sampel: {total_sample} data\n")

    # 3. Sample each class; quotas are guaranteed to sum to total_sample
    quotas = _allocate_counts(total_sample, proportions)

    sampled_list = []
    summary_data = []

    # Fixed order: positif, negatif, netral (dict insertion order)
    for sentiment, weight in proportions.items():
        n_target = quotas[sentiment]

        class_data = df[df['Sentiment'] == sentiment]
        n_available = len(class_data)

        # Never request more rows than the class actually has
        sample_class = class_data.sample(
            n=min(n_available, n_target), random_state=42
        )
        sampled_list.append(sample_class)

        # Keep the per-class details for the summary table
        summary_data.append({
            'Kelas': sentiment.capitalize(),
            'Proporsi (%)': f"{weight*100:.1f}%",
            'Jumlah Target': n_target,
            'Tersedia': n_available,
        })

    # Concatenate in class order (deliberately not shuffled)
    sampled_df = pd.concat(sampled_list)

    # Show the summary table in the terminal
    summary_df = pd.DataFrame(summary_data)
    print(summary_df.to_string(index=False))
    print("-" * 45)
    print(f"Total Data Berhasil Dibuat: {len(sampled_df)}\n")

    # 4. Add the manual-label column and keep only the known columns,
    #    in a fixed order (columns absent from the input are skipped)
    sampled_df['Manual_Label'] = ""
    cols = ['Username', 'Rating', 'Sentiment', 'Manual_Label', 'Review', 'Cleaned_Review', 'Date']
    sampled_df = sampled_df[[c for c in cols if c in sampled_df.columns]]

    # 5. Save to Excel with tidy formatting
    sheet_name = 'Validasi_356'
    try:
        # The context manager closes the workbook even if formatting fails.
        with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
            sampled_df.to_excel(writer, index=False, sheet_name=sheet_name)

            workbook = writer.book
            worksheet = writer.sheets[sheet_name]

            header_fmt = workbook.add_format({'bold': True, 'bg_color': '#D7E4BC', 'border': 1, 'align': 'center'})
            wrap_fmt = workbook.add_format({'text_wrap': True, 'valign': 'top', 'font_size': 10})
            input_fmt = workbook.add_format({'bg_color': '#FFFF00', 'border': 2, 'valign': 'top', 'bold': True})

            # Width / cell format per column NAME, so the styling stays on
            # the right column even when some columns are missing.
            column_styles = {
                'Username': (15, None),
                'Rating': (10, None),
                'Sentiment': (10, None),
                'Manual_Label': (18, input_fmt),  # yellow input column
                'Review': (55, wrap_fmt),
                'Cleaned_Review': (55, wrap_fmt),
                'Date': (15, None),
            }

            worksheet.freeze_panes(1, 0)  # keep the header row visible

            for col_num, value in enumerate(sampled_df.columns.values):
                width, cell_fmt = column_styles[value]
                worksheet.set_column(col_num, col_num, width, cell_fmt)
                # Re-write the header cell to apply the header format
                worksheet.write(0, col_num, value, header_fmt)

        print(f"File Berhasil Dibuat: {output_file}")

    except Exception as e:
        # Best-effort fallback: keep the sample even if Excel export fails
        print(f"Gagal memformat Excel: {e}")
        sampled_df.to_csv('backup_validasi.csv', index=False)
|
|
|
|
# Run: build the proportional manual-validation sample from the trimmed
# sentiment dataset and write it to the Excel workbook.
generate_final_grouped_sample(
    'robust_data/dataset/trimmed_sentiment_dataset.csv',
    'Validasi_Manual_356_Proporsional.xlsx',
)