chore: add script that generates tables from the original data
This commit is contained in:
parent
3613b1a120
commit
beb00eea57
|
|
@ -0,0 +1,79 @@
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from openpyxl import Workbook
|
||||||
|
|
||||||
|
# === CONFIGURATION ===
DATASET_PATH = "robust_data/dataset/trimmed_sentiment_dataset.csv"
OUTPUT_PATH = "revisi_real_preprocessing_tfidf.xlsx"
RAW_COLUMN = "Review"
CLEANED_COLUMN = "Cleaned_Review"
MAX_ROWS = 3238     # only the first MAX_ROWS rows of the dataset are used
SAMPLE_SIZE = 3     # rows taken from the top and from the bottom of the data
TOP_FEATURES = 5    # highest-weighted TF-IDF terms reported per document


def load_reviews(csv_path: str = DATASET_PATH, max_rows: int = MAX_ROWS) -> pd.DataFrame:
    """Read the review dataset and return its first *max_rows* rows.

    Missing values in the raw and cleaned review columns are replaced by
    empty strings, and both columns are coerced to ``str`` so that later
    string operations (``.lower()``) cannot fail on a non-string cell.

    Raises:
        KeyError: if either required column is absent from the CSV.
    """
    df = pd.read_csv(csv_path)

    missing = [c for c in (RAW_COLUMN, CLEANED_COLUMN) if c not in df.columns]
    if missing:
        raise KeyError(f"{csv_path} is missing required column(s): {missing}")

    # handle missing values
    for column in (CLEANED_COLUMN, RAW_COLUMN):
        df[column] = df[column].fillna("").astype(str)

    return df.head(max_rows)


def select_head_tail(df: pd.DataFrame, count: int = SAMPLE_SIZE) -> pd.DataFrame:
    """Return the first and last *count* rows of *df* with a fresh 0..n-1 index.

    When *df* has fewer than ``2 * count`` rows the head and tail ranges
    would overlap; each row is then returned only once instead of being
    duplicated.
    """
    n_rows = len(df)
    head_stop = min(count, n_rows)
    # Start the tail after the head so overlapping rows are not repeated.
    tail_start = max(n_rows - count, head_stop)
    positions = list(range(head_stop)) + list(range(tail_start, n_rows))
    return df.iloc[positions].reset_index(drop=True)


def top_features(weights, feature_names, top_k: int = TOP_FEATURES) -> list:
    """Return up to *top_k* ``(feature, weight)`` pairs, highest weight first.

    *weights* is a 1-D NumPy array of TF-IDF weights aligned with
    *feature_names*. Features whose weight is not positive are omitted, so
    the result may hold fewer than *top_k* entries.
    """
    if top_k <= 0:
        # ``weights[-0:]`` would select every feature rather than none.
        return []
    ranked = weights.argsort()[-top_k:][::-1]
    return [
        (feature_names[int(j)], float(weights[j]))
        for j in ranked
        if weights[j] > 0
    ]


def collect_tfidf_rows(tfidf_matrix, feature_names, top_k: int = TOP_FEATURES) -> list:
    """Build ``[document id, feature, weight]`` rows for the TF-IDF sheet.

    Documents are labelled ``D1``, ``D2``, ... in the row order of
    *tfidf_matrix*.
    """
    rows = []
    for i in range(tfidf_matrix.shape[0]):
        weights = tfidf_matrix[i].toarray()[0]
        rows.extend(
            [f"D{i + 1}", feature, weight]
            for feature, weight in top_features(weights, feature_names, top_k)
        )
    return rows


def build_workbook(selected: pd.DataFrame, tfidf_rows: list):
    """Create the workbook holding one sheet per preprocessing step.

    *selected* must carry a 0..n-1 index; document ids are derived from the
    row position. Returns the populated ``openpyxl`` workbook (not yet saved).
    """
    wb = Workbook()
    wb.remove(wb.active)  # drop the default empty sheet

    doc_ids = [f"D{i + 1}" for i in range(len(selected))]
    raw_texts = selected[RAW_COLUMN].tolist()
    cleaned_texts = selected[CLEANED_COLUMN].tolist()
    folded_texts = [text.lower() for text in cleaned_texts]

    # NOTE(review): no stopword removal or stemming is performed here; the
    # "Stopword" and "Stemming" sheets repeat the case-folded text in both
    # columns. Presumably Cleaned_Review already had these steps applied
    # upstream -- confirm, or plug in the real transformations.
    sheets = [
        ("Cleansing", ["Dokumen", "Teks Asli", "Cleaned_Review"],
         zip(doc_ids, raw_texts, cleaned_texts)),
        ("CaseFolding", ["Dokumen", "Cleaned_Review", "Case Folding"],
         zip(doc_ids, cleaned_texts, folded_texts)),
        ("Stopword", ["Dokumen", "Case Folding", "Stopword Removal"],
         zip(doc_ids, folded_texts, folded_texts)),
        ("Stemming", ["Dokumen", "Stopword Removal", "Stemming"],
         zip(doc_ids, folded_texts, folded_texts)),
        ("TFIDF", ["Dokumen", "Fitur", "Bobot TF-IDF"], tfidf_rows),
    ]
    for title, header, rows in sheets:
        ws = wb.create_sheet(title)
        ws.append(header)
        for row in rows:
            ws.append(list(row))

    return wb


def main() -> None:
    """Generate the preprocessing / TF-IDF example workbook from the dataset."""
    # === LOAD DATA ===
    df = load_reviews()

    # take the top 3 and bottom 3 rows
    selected = select_head_tail(df)

    # === TF-IDF FITTED ON THE WHOLE DATA SET ===
    # Fitting on every row keeps the IDF weights representative of the full
    # corpus even though only the sampled documents are reported.
    vectorizer = TfidfVectorizer()
    vectorizer.fit(df[CLEANED_COLUMN])

    tfidf_selected = vectorizer.transform(selected[CLEANED_COLUMN])
    feature_names = vectorizer.get_feature_names_out()

    # take the top 5 features of each document
    tfidf_rows = collect_tfidf_rows(tfidf_selected, feature_names)

    # === CREATE EXCEL ===
    wb = build_workbook(selected, tfidf_rows)

    # save
    wb.save(OUTPUT_PATH)

    print("✅ File berhasil dibuat!")


if __name__ == "__main__":
    main()
|
||||||
Binary file not shown.
Loading…
Reference in New Issue