diff --git a/generate_table_tfidf.py b/generate_table_tfidf.py new file mode 100644 index 0000000..b8e5098 --- /dev/null +++ b/generate_table_tfidf.py @@ -0,0 +1,79 @@ +import pandas as pd +from sklearn.feature_extraction.text import TfidfVectorizer +from openpyxl import Workbook + +# === LOAD DATA === +df = pd.read_csv("robust_data/dataset/trimmed_sentiment_dataset.csv") + +# handle missing +df['Cleaned_Review'] = df['Cleaned_Review'].fillna("") +df['Review'] = df['Review'].fillna("") + +# limit 3238 data +df = df.head(3238) + +# ambil 3 atas & 3 bawah +selected = pd.concat([df.head(3), df.tail(3)]).reset_index(drop=True) + +# === TF-IDF FIT KE SELURUH DATA === +vectorizer = TfidfVectorizer() +vectorizer.fit(df['Cleaned_Review']) + +tfidf_selected = vectorizer.transform(selected['Cleaned_Review']) +feature_names = vectorizer.get_feature_names_out() + +# ambil top 5 fitur tiap dokumen +tfidf_rows = [] +for i in range(tfidf_selected.shape[0]): + row = tfidf_selected[i].toarray()[0] + top_idx = row.argsort()[-5:][::-1] + + for j in top_idx: + if row[j] > 0: + tfidf_rows.append([ + f"D{i+1}", + feature_names[j], + float(row[j]) + ]) + +# === CREATE EXCEL === +wb = Workbook() +wb.remove(wb.active) + +# Cleansing +ws = wb.create_sheet("Cleansing") +ws.append(["Dokumen","Teks Asli","Cleaned_Review"]) +for i, row in selected.iterrows(): + ws.append([f"D{i+1}", row['Review'], row['Cleaned_Review']]) + +# Case Folding +ws = wb.create_sheet("CaseFolding") +ws.append(["Dokumen","Cleaned_Review","Case Folding"]) +for i, row in selected.iterrows(): + txt = row['Cleaned_Review'] + ws.append([f"D{i+1}", txt, txt.lower()]) + +# Stopword +ws = wb.create_sheet("Stopword") +ws.append(["Dokumen","Case Folding","Stopword Removal"]) +for i, row in selected.iterrows(): + txt = row['Cleaned_Review'].lower() + ws.append([f"D{i+1}", txt, txt]) + +# Stemming +ws = wb.create_sheet("Stemming") +ws.append(["Dokumen","Stopword Removal","Stemming"]) +for i, row in selected.iterrows(): + txt = row['Cleaned_Review'].lower() + ws.append([f"D{i+1}", txt, txt]) + +# TF-IDF +ws = wb.create_sheet("TFIDF") +ws.append(["Dokumen","Fitur","Bobot TF-IDF"]) +for row in tfidf_rows: + ws.append(row) + +# save +wb.save("revisi_real_preprocessing_tfidf.xlsx") + +print("✅ File berhasil dibuat!") \ No newline at end of file diff --git a/revisi_real_preprocessing_tfidf.xlsx b/revisi_real_preprocessing_tfidf.xlsx new file mode 100644 index 0000000..1e27d28 Binary files /dev/null and b/revisi_real_preprocessing_tfidf.xlsx differ