From beb00eea57ba732f7ea646a9fc762e6e76623d16 Mon Sep 17 00:00:00 2001 From: Mahen Date: Fri, 17 Apr 2026 20:30:39 +0700 Subject: [PATCH] chore: add generate table from origin data --- generate_table_tfidf.py | 79 +++++++++++++++++++++++++++ revisi_real_preprocessing_tfidf.xlsx | Bin 0 -> 13344 bytes 2 files changed, 79 insertions(+) create mode 100644 generate_table_tfidf.py create mode 100644 revisi_real_preprocessing_tfidf.xlsx diff --git a/generate_table_tfidf.py b/generate_table_tfidf.py new file mode 100644 index 0000000..b8e5098 --- /dev/null +++ b/generate_table_tfidf.py @@ -0,0 +1,79 @@ +import pandas as pd +from sklearn.feature_extraction.text import TfidfVectorizer +from openpyxl import Workbook + +# === LOAD DATA === +df = pd.read_csv("robust_data/dataset/trimmed_sentiment_dataset.csv") + +# handle missing +df['Cleaned_Review'] = df['Cleaned_Review'].fillna("") +df['Review'] = df['Review'].fillna("") + +# limit 3238 data +df = df.head(3238) + +# ambil 3 atas & 3 bawah +selected = pd.concat([df.head(3), df.tail(3)]).reset_index(drop=True) + +# === TF-IDF FIT KE SELURUH DATA === +vectorizer = TfidfVectorizer() +vectorizer.fit(df['Cleaned_Review']) + +tfidf_selected = vectorizer.transform(selected['Cleaned_Review']) +feature_names = vectorizer.get_feature_names_out() + +# ambil top 5 fitur tiap dokumen +tfidf_rows = [] +for i in range(tfidf_selected.shape[0]): + row = tfidf_selected[i].toarray()[0] + top_idx = row.argsort()[-5:][::-1] + + for j in top_idx: + if row[j] > 0: + tfidf_rows.append([ + f"D{i+1}", + feature_names[j], + float(row[j]) + ]) + +# === CREATE EXCEL === +wb = Workbook() +wb.remove(wb.active) + +# Cleansing +ws = wb.create_sheet("Cleansing") +ws.append(["Dokumen","Teks Asli","Cleaned_Review"]) +for i, row in selected.iterrows(): + ws.append([f"D{i+1}", row['Review'], row['Cleaned_Review']]) + +# Case Folding +ws = wb.create_sheet("CaseFolding") +ws.append(["Dokumen","Cleaned_Review","Case Folding"]) +for i, row in selected.iterrows(): + txt = row['Cleaned_Review'] + ws.append([f"D{i+1}", txt, txt.lower()]) + +# Stopword +ws = wb.create_sheet("Stopword") +ws.append(["Dokumen","Case Folding","Stopword Removal"]) +for i, row in selected.iterrows(): + txt = row['Cleaned_Review'].lower() + ws.append([f"D{i+1}", txt, txt]) + +# Stemming +ws = wb.create_sheet("Stemming") +ws.append(["Dokumen","Stopword Removal","Stemming"]) +for i, row in selected.iterrows(): + txt = row['Cleaned_Review'].lower() + ws.append([f"D{i+1}", txt, txt]) + +# TF-IDF +ws = wb.create_sheet("TFIDF") +ws.append(["Dokumen","Fitur","Bobot TF-IDF"]) +for row in tfidf_rows: + ws.append(row) + +# save +wb.save("revisi_real_preprocessing_tfidf.xlsx") + +print("✅ File berhasil dibuat!") \ No newline at end of file diff --git a/revisi_real_preprocessing_tfidf.xlsx b/revisi_real_preprocessing_tfidf.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..1e27d289030ddfc4cac3237d96529074b846ce9e GIT binary patch literal 13344 zcmeHuWmsHY(k&JU5@_7r-QC??f(EzX?wa84ZoyrG6WrYiPH=bk+sVw_ypx%^|GxX@ zo5RC7&*|Q^p5A@d+EulyT0t5d0tEyTEAVAkAsU>e3k(>awGg+W5+wjwv84fNX49?F*78+Qm?pWySy9=b2K!+Zgp7VZ zQ(U@NxbBokq8>mD#_HXcCG5o?_!DD7LxLHzlK9piBS1JTSwCzCb z)0Afs6wYP%PHku%Ri9z?VB+Y>s_MiechZ%V5EB~zgQzTqXU=`n^|ZW2;evGxnv_;z z35a=^hAi^Hgc|^f+E?L_rb2@@e!=$iEUI^4bBdQy;?i63P9SnM#a25Dx_>mqcKHQa zJevmuud*M1lH%*GfPwnQ4rQN2MaL`$Ql5d^AC#Q(@Mp>%=uq2316o1xRe11OU1UOO zCo?gCyKFt|FIM`sPI-=e?ne__?GOtaqnocLXYW8jUSGjM6#hx6b*c=+S3vs70EGw( zRI0AMv84k&-OuxX75cwemVcRgS*)ylHv?SYsl-#z!0qfxG(bq&MNp!JP{rFMC`)O!tg(u=*fbeF6r7R2yg`2qEr93d{!OjVaip)My%&u&` z2hn-dA14rvH^oo0PgY~q;I0C6 zR!I4{Jg};PJ^LVTJk95OV!{49{$Ng-qp1|s0S5!qg$mCBOTz1WELCMQPV*{*OnWXO zH(evkj!TiWcI0PITDkNAB_bB2JEn2*e$wn~AI(~p^WhA)9(L&N(t)!<|1kWbRiM-Q zSCOn`B&T43gMb7;fq)5MN}nyWxcWAoD3CR2}TP# zyja;6qRm;NKA#h%(x8~BydeofDzr`0g0tBlj}5~S(ZeYKX$X*nP`e5z+=cotoYW!B z0M=3hCiEH-tY&-tQcb4V;&etGwEA5RofeXBUEH4~ydi`~cGGKOvP)xC-Jysy1X(8D zb5_oRmgnL7QHN+(d<+Y%fegl^Z}henV5Bl<(NGbJ)c~z&u!gcCeQG{)!8QBVokZYx zQyi;I8J~Oz>s#-VH-6O;dE87+IiI=v!%)JHb|E#5m8T|lMZV?$ue#Yruj0~}1-)$6 zmDGRB3#r^;YiGT4J01PV+f+y7-~pX-R0Q?g4rS%`H;)>bTpE9=g?n-t!G( z4Ys178rK?qo?oOLNU1a0gJgYB@SzBGX6vM@-A$75$!V(a(d=6`(!@%!Lqmra3wK-aX=mJD++pv<>)G9=eQ53$f~c~ zY1tf~n>IGz+^mmCFYflPHkyEApPBh26ZKy4LA;nbg1XEbL%k8N?c@iFgPtM2O# z01E5`0t5sLjgFF717L3YrMG{ z-?du51of5+OGB1~`T6{q`)xUD;+5DpXqmwD%Wb&9^}E@+Nhnv#bXB4VWJEt&?S*;G zkcP|4l~2LZwX{x$G!%>=p@!Zb4z7M`$oN9jR~XfyqNH$&uN0(ckX#Yfhn@C?_HENW z`IW|%VPO@5CXzKtFO`7J^k=1e_J7MK)gYeHV_@8}1YDLe zfFu6CEIXJP8#_AC|GF^#T%Xh9WTTb{0D-3o5BPvvjHR?Z9qO@?mDv)H(h~1OQ^QE=PIEv zR#x(14!%_z^T0z_Cf~qA-bQk5g;F-PeaNWp0;4JdGp4tC=kVsVw>A1erupQROokB^ zuil-f1GdCdOS1Q9go?hZg}rOXbiPri4a-salK=kIRLGDq^~pf4y4|zMj3wi29`$l! z?nzmZ*ZYS-lf=V)u|E2Td&~Gu7&~Uj4-~R;pFtpJDB#G<=evk8lrP7bm99>ENRoUv zWIT_v_QoYUba^eZ6F!Bk+1|dnY7TSMD$tOuBJnHXiU8Wj)55)yelCS@=yckzG)Ei6S=fa_2UZ^kyOc9R_9!pCi|8w@ zYr8pUBP8J62y3zUmYBoW;{t9~$*Dfng$ChUc#%;u^GV}|33j^W5#lj8<6&N&X^ioB zsvUWBU_SZ$e1#PFN-vM6^YZP0Fpu65kK9PNX{;zSgaNBk!COYZYUY^a2M;ezFb#HL zfU zk-wIuks2SNRvVC%rLIP1)1$39N`cY85VdUA-H=eN(rbwDN=kU?7%Nx}kR|K(DJp zi9J9-RppsaVq%)@&vhZd8!_uJwGK?zqKJJQM)*9G(cMu$3xlT+0cpu0fOEwZ{n|GD z=`|bE34TnX0W&B?h#6pz?}2-R9M7CvQdlQS{+_qDbr`02erz&Hx&48G`8Z3RNt5$x zKJfFv#*a($yM_%l2`7?_J|8qP>d&9v<09XM47KknvGLbj$j{HR>u)2XWeuz5T7P_@ z_p)Ke`bsgSkt%wg|GYaanoHTuNVSt)e($CB+_+=VlzgyT8Cd?xWLk~$J~*w&m5AA# zhe~dYPj36}NFBT!=kW}-fnTt)MW9ZX29~?n8uJiaLHliWjkY`&~vTj#yi!m z3Yv@;gX&vt$9B#_oBeT$My^8%?rYSL2F489KPe! zgMOul2+YVOimW;gF>ECa3byO0a=-~Dwp^xvy%c|p2qrNO2M+ig=P;2&F(AiA^hJdc z1>*+A9*GOZLTDj4SgBt@Z(eyof3GV!&SKW4ho1EM_WsS!2;u*e3PfQfTZ(`xF!|3a zVE(0oSXrB{pP5eF4gQy9^F^h4#ky?sw-HM$RxLp@`G~Sgq?GunH*33zpmB`^B`msf zcuVJLvMwcL*i~{^8PgPRDM4vv2(PIMv+yrFkx9oKbj}vd2KMoAGUQH9Ys#xF-h;aO zLmb;{x~=7fb1M36&@sF-tBvoYIhDh_sE0s(<11$Ce>REWG4J;*AhWa@s+c(4; zTYsl<<7qWC&S&`7Hqc1L?*X3xbra)}_4r(aFd?WXHAHAb>3rh*!^FmMW#7k>K9@hT zlV{*0^_6jVgROjCQFOL*UL8IkjRlihoqJia)D^6tidvNScl~VV93U6=43{ntKu9Se z_V)4ja4-!ir!pPjnugrH!p)9-Rnvkt~ za(`oYCz8YLS3h0ZvDhGrtNHZIZlgZ)51NT5p|MW(o;Fbof$eZSl{!@Si;{dVsTW#g zwnvI1mRr76u{2^}hSY;0JvO9%mh4-;%6G3KdX~7Tc6V0tI7{z1LouMj%QppZypgX) zA&6q*8IdrwLToD^9Q;s1-*CnWh{RI`f*FbvvJ2*@BTEqwN#P3nRp&+zXFny!oR%Q; z& zJwzPmI0RR^E?(cyC~@vql)T|u$!0?4Xl3f zg2tIEfq^4hIORm8j+x*(uww)OEbHxFd#TZ8Bnzo^C#CiK!vf;)kAUjvx~oStsSujgDug^x#XZKuBgxE}Ua(Rb$HeDpVCi=v} zVB1k7cr2Cw(@no{B_|zhVf@W={who(GlR%l`fdL66?dd8vGJ6yW--me}aQk zYqRhw=p9aZw2{?cfTLUQy-@vklZRja&tqI-wJ$DNBTRJ&!vFO{xT2vd5bLU2fR%Z zd|)JBn*{{xN;P{=(Epy^|2(y|=!g}QfLgjl{f`ko>)*l`X`4*~K>sQAH8;TtWBsO) z7PXveotBkN=`r6JM3f;4-UkBu@|%`Zfm9YHI>i{?aM0HmW`>ay7tNS=9Q5VpW-;6G zzKHb+CJPf;{F&kn@pm1v!wIYt6TGhO{O$KFOPUQAyaEIg3~f1HZ-qyb(Rs~In7MsB zQ^vSP!M&SFFF%2=7G%{L7-!0->`YC7=Qw|7u>39wDz8??nr!RKp}?c})Ry##f2y_- zvam_3JSe8hU7$yZ3iAe9Dg^C%cuKt=9p-Ws6&2lR_l)u!7vFg5E+96sOT(rmnlJ^| zpiSmAa)lAd@8PG)E)-OUs+e<-HO1|IAu2L*wR?DM9{?L~=Z15WJMB{7%jnwT+ee~Q z&-nL~>2*Gid={jcwj-e&dsq(&Gf|uAHTN{A6ZFtpK09$8=#|+55CSp&vFcf{7VquW zVd1xfugnES>=Cyh@dS$ly(t-ZUdf9^@JZ(-H<$Sjcnecsb|heNbxY4ytlRyyZ$JY| z%hL5~sx=RsUm-r|rF?N$lO)Sh6qhsZEkV(z3B!A~q!=2i7@ zyCefc1Ns8oQ~P@m5T9%zhs}s0*5F?Qa^|%t23qVFK@d8lTl*1U<+5gjMQ-R}8xRof zd&}D)X-npQ26*fe%fB)y?Sis;`DTW<=!LTP}A}t`*7v?cs;&3e15-u zeQb?hG3$K$xSM|i8+u+y?CsNz@R)Y{_&n+`2=COl7>FQimUIRQKu5|X6g&?Q;fm!l zJe9bE@F~|oelx_F7Dz$q1s6bh|8$k@`c~Bxhns>@fucT5gJS`S2z(}y_bZDs`uJRw$%hHLyMxZcba4hTa_itEmc zUf@{W?B-U_BIX=x`Cc$%>kvbfSeh7}@3{-1VrFsW!k2EUBm@=*u z){W&NngdRbMAqE97;jwN0s87o{{$Fh5@fU=wD=U%D7b72@C}FAoSTaN348Eu4Din} zRhP<|qTeev;}46`xSURSSj>$_?8(O4-mjOTfTg|-V008=BlRpPZzC<8K6nSapz*Mg zDKbj~Xmh~Rj$$!>&VQ(h*Q()~{qTgLb(7p1FEbK8iaCVgSfsqOt9gQ5Lsm|=J)KTS z+d^aCqdLghs8}CE?<(~yeeF7ys^W1=Olx#zfmTME5_(ZooOO6$ajkqi2UVfG!g7Ot zD{jyyq%_u;RD-ZQu6&|f@||)idbarHqJN&~)iP5^^daI0PAHK(^&g1VPRa0W(B-F}K=e<%iwHvl-c; z9Le%q{;7>{Go&~}&Ln4iuf7gUWp)sD_Kv_ZxpJF3#`<&_ZOJ{)iFDFl%}BYOnKvDy zZ>m5`l@HUr!!y9rF8JAB)_iA+uV4D-!-ju^`3n0&fz+)BfhxZw^}X_H#A&WQ0Oy&Q zNZ+3yNshGp0h9fZoXh9`XD+&5K-y>mEC?%O|11js+OBalGqy6O|8@SgQ*)#-YK_ef zXhYxQ26)KREW{dM#(+yFZm`)I%u9vh6zhCULUd^&pyX~w9jq4c`(i#R?YHVqu?Q*Q z`(dikF71n~-#4M4Eh$Y+6-#o+2XtIZ+S|>iaofOV_hZ4K2|~o!i!xLd`uAWWHPPZM zVOpIT-ltRUzIk05pE1WJsFMMh+Do3Vt);HQq$H$oZ%pmM8~jCzJq?XT1FOQfjWgrm zR^K&ARQrn~R*Hb(gRaDvm#I-jB%n;w2w=+j9UZSn2SQx44(d5beH;G4sH{Z)Sq8^! zs%JsJYO<&V(VmD~CR=z!6Z9g3s@%hrDNw-jRAuYtmtP=t&Ym~Sas#9P&NW+y1E_T7qv3vx;wF>d)d*> zTB%3HMtiXpy3E)#1uTMY+}~Vd8VxK&qPM}HT}y#x_N8#b}KTS^Avz`2)p|kP4KQ{I8er9~w+-Il9_gU{BW_tOK&v(DQJN_&% zkliFg-{yI-^p%{x?cwV76wl*_78t_N8J}#3NW%c);x6T$3EZZX9OQ;4_XBjn7<@X2 z6~can6wZ%wXcIo|w8p*`>TK6L-|zjmxEcsZjA}F8_hPYJ9XewgX|R=`)@C0_VQ>32 zj!fzLjFW`SBOQuTr?VX`*Cd09cksGXx)=w8g&uJ65jQ4J@bV*dT+AsN zw}b;u6leNoEMoa^Is91GYK4{0hIMhdUlI2~i60m*KkPmy<$&$;lZ z(;p;m?1qLQ+e1`0kT3Lzx|k}cQC+#n!gH-f@~)1XXa&|{r)~6#-hXH~t&uz+3iv{d zKbn?&$z-1v#RVr-5vR$Qdi6qU3J%nbB-lqz+o8FcuVtR z#R=r=eRB&<(zvG8?i;k!EjL}u<}F9_IG((4+#+aV_Wb=gQXDiz1Tz%iclU2)Aq(^s3)L6fyDD4)pDPwvz7Gcqh zl|k#g^9gD4=Xb~TKz1dCkO&=^Y$4}YKY z6a_z_k(G6!;fFICdBVbMVpW>^sns_YTO&IN(7;9 z5@<{O`Vn`;T@hwp!UvpkZjKP+g^jW!vch7}RzgVC+1b2zSrJ1(QR-eD(Jop<+^nO| z0RPjYQ;|O(VuCHVK9#?>Zz+qF)#VWFnCMgjeHGkTg<&FjBSiz3Xkm)wI$4w{Q!+Vq zIU<7N<&bqecRLl?HoZiGJ;v*}%H%GzHhP)pA~@ml7EHKfTRj+^N>>idHuO6WFuEq0 zV_!4BLv`18f^TjIciIVp9@`t-|k}S3MMkp91@A9tAk03)9Q;F3~5*YHRzK;*2QSoy&TtX6`E5 z#1)^u4|!@-5Ba&-`@D}b@~qTQm7MeAZZnIab=l`1pGU;4;FI_yUi~hkLZcV($*=qt zcOMnB5TEAsmZ%{Vy$0FvB+wp(uUB^O!*$f>tDTyruamK2Z;V0@Zp^p_F#3~q>xTGw&O5zquh_KDqf%mT zKE8r#pLfRV`;ho2b6A7 zwwwAU)>=wQsFWFrpdFr4N&-W7W;NwKk!-L&_sMp*=$zR$O^IVEPB65H7wpcsl+XM8 z*R2d;m6!#5Gx0aBL^7deQ$pBV0$h9Ar-~{ygd2%4ZA{BStSa`^_{-HUS0C{=NP<>9 z$j^D;!vci0gON+;kwL^`{5&HuRl6W@j|!jBrl^x^K?|ZD#jh48yimwDq!Z%oR+Htc}4SVVs` zrd;vLFl-cWgBC}I;YttzshI|sEBdGx=cjgj81O>bKhWg{X@$>PP-XKS}_kEo-+9UphtkV808Z@7b>|507hFQY9mvnQXe!TnM z?o$~GM>n?&<4Q7;GcOzxWvUw~p62JxpDW-o!W&Vz-cJJa>4GcVe=vT`f8mM?dWa-6 zX?k#jjxrAMb}M0AMBy$Y1KX5qiv+9;lZJDHDbU%rTqPu@6_$9 zjmHB%_fOF}rvd zd>y&a3QyW&=$3f$t2x`Q-G-vy7MIS)VH2V2n1~x1J!-Fs+}P&n@;zvMl3QNvy%;VO zr5qk#fttz@k!Mq5z3Qkaf|T3rlJ=31F&{;r!06^tAM91HugVrx-` zO55@zrmpRpKEF0L{w&5(fSRUC12ZlYptnH;_CFZe7%JG?*gDV~+SnWa^fthbyZ^R5 z0GF)r*ipG|2EgZ2iEqMN9ve>ruxcApzV2V%h7bYRhga(r4f|`}nXlrJn+sEM*j}9+ z?WWXS_6n+XqGM>yl?X_p&2VOLn4eKa_>usK}?T2S?@U z^gdl9?r(pCw`htnKmc|c3Wq6|(Qx#l-zvfqF|C228!+gZVMNkOcK(#x@@m=Uz=^Uo z+4~yOKq5`vgY0|LHtqxY42)*}HP+EfjjNq1Al&6cX zaaQ%}ndieSJ4y>bqpD5tM7_-l&HJ^aQglO1ve9=h-=`4J@y_b0H>UtRVY!Q+Q z=ZR6&yks-)Y^d*7jV6<;0uG9I@+bJKTF%k;FH1FED0dr}2Z39#x>vt@%Rs?sff3HX z|INWap5#BS|K=A63ex`!@XxK#|1kV{Edk2qFOAWE8vfZH`rC9IsKP&*MgKJZ=cc&d zrXV0_Fu#odLyO#>IDa-n{YEl@`@bLJUs|L7MESE{R={%5zxgb`s+@nK{8>W!jUoZ;8T&UVzY0o!qWrmc`5Ofl z?-$CSJDGn1{F!C`1}G!>1@JrP{L}Q$fd03sC;8t@{|xW{MEK{x