{ "cells": [ { "cell_type": "code", "execution_count": 32, "id": "2ae2c777-5a38-4a72-a3f5-34f309b3716a", "metadata": {}, "outputs": [], "source": [ "# 1.Import data dari csv" ] }, { "cell_type": "code", "execution_count": 84, "id": "aef4febf-8655-42c8-a7cc-5cc56caec1b8", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import re\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 85, "id": "1fab43e8-27fa-4b6c-a342-7638094f505d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
created_atid_strfull_textquote_countreply_countretweet_countfavorite_countlanguser_id_strconversation_id_strusernametweet_url
0Sat Feb 10 23:30:48 +0000 20241756460740138119204@geriisme Ini bisa basis partai, financing, ak...0000in16137429534739005461756382503865327916kwaier_https://twitter.com/kwaier_/status/17564607401...
1Sat Feb 10 23:30:23 +0000 20241756460633535766878Hallo semuanya!!!! Tak terasa Pemilihan Legisl...0000in11720895283932037121756460633535766878MudiBantenhttps://twitter.com/MudiBanten/status/17564606...
2Sat Feb 10 23:30:17 +0000 202417564606101064135452025 Anis presiden Kader partai pendukungny...0000in14826418585140224041756460610106413545jogjaethhttps://twitter.com/jogjaeth/status/1756460610...
3Sat Feb 10 23:30:00 +0000 20241756460540531208457@calunaeruby @losta_masta_ @peachyvann @mapedo...0001in14818827062336880641755207806284931256Blokshiahttps://twitter.com/Blokshia/status/1756460540...
4Sat Feb 10 23:29:44 +0000 20241756460470264090668Udah masuk masa tenang tapi cuma ngingetin, ca...0000in837420721756460470264090668ofuku89https://twitter.com/ofuku89/status/17564604702...
.......................................
541Fri Feb 09 23:11:32 +0000 20241756093502600147146@Rapdo5 @Miduk17 bukan hebat tp culas dan lici...0000in15387157178389749761755995641363534093MsElva10530471https://twitter.com/MsElva10530471/status/1756...
542Fri Feb 09 23:11:21 +0000 20241756093457985388779@KakekHalal Kalau melalui proses Pemilu Langsu...0004in14669797385602580491755993623945220130Aryadwiwarna1https://twitter.com/Aryadwiwarna1/status/17560...
543Fri Feb 09 23:10:00 +0000 20241756093116237389849Kenalan yuk dengan Caleg DPR RI Partai Demokra...0001in2848309331756093116237389849Demokrat_TVhttps://twitter.com/Demokrat_TV/status/1756093...
544Fri Feb 09 23:09:17 +0000 20241756092936113267194@sunprintme aku tempelin sticker gede ya, stic...0100in17425681636011663361756005949641462263vaneelyahttps://twitter.com/vaneelya/status/1756092936...
545Fri Feb 09 23:08:42 +0000 20241756092789165785325@Twillight_Loid @GusbacheV Hebat. Jokowi bener...0002in14925107428176814091755978097164517699AbdulRozakUBhttps://twitter.com/AbdulRozakUB/status/175609...
\n", "

546 rows × 12 columns

\n", "
" ], "text/plain": [ " created_at id_str \\\n", "0 Sat Feb 10 23:30:48 +0000 2024 1756460740138119204 \n", "1 Sat Feb 10 23:30:23 +0000 2024 1756460633535766878 \n", "2 Sat Feb 10 23:30:17 +0000 2024 1756460610106413545 \n", "3 Sat Feb 10 23:30:00 +0000 2024 1756460540531208457 \n", "4 Sat Feb 10 23:29:44 +0000 2024 1756460470264090668 \n", ".. ... ... \n", "541 Fri Feb 09 23:11:32 +0000 2024 1756093502600147146 \n", "542 Fri Feb 09 23:11:21 +0000 2024 1756093457985388779 \n", "543 Fri Feb 09 23:10:00 +0000 2024 1756093116237389849 \n", "544 Fri Feb 09 23:09:17 +0000 2024 1756092936113267194 \n", "545 Fri Feb 09 23:08:42 +0000 2024 1756092789165785325 \n", "\n", " full_text quote_count \\\n", "0 @geriisme Ini bisa basis partai, financing, ak... 0 \n", "1 Hallo semuanya!!!! Tak terasa Pemilihan Legisl... 0 \n", "2 2025 Anis presiden Kader partai pendukungny... 0 \n", "3 @calunaeruby @losta_masta_ @peachyvann @mapedo... 0 \n", "4 Udah masuk masa tenang tapi cuma ngingetin, ca... 0 \n", ".. ... ... \n", "541 @Rapdo5 @Miduk17 bukan hebat tp culas dan lici... 0 \n", "542 @KakekHalal Kalau melalui proses Pemilu Langsu... 0 \n", "543 Kenalan yuk dengan Caleg DPR RI Partai Demokra... 0 \n", "544 @sunprintme aku tempelin sticker gede ya, stic... 0 \n", "545 @Twillight_Loid @GusbacheV Hebat. Jokowi bener... 0 \n", "\n", " reply_count retweet_count favorite_count lang user_id_str \\\n", "0 0 0 0 in 1613742953473900546 \n", "1 0 0 0 in 1172089528393203712 \n", "2 0 0 0 in 1482641858514022404 \n", "3 0 0 1 in 1481882706233688064 \n", "4 0 0 0 in 83742072 \n", ".. ... ... ... ... ... 
\n", "541 0 0 0 in 1538715717838974976 \n", "542 0 0 4 in 1466979738560258049 \n", "543 0 0 1 in 284830933 \n", "544 1 0 0 in 1742568163601166336 \n", "545 0 0 2 in 1492510742817681409 \n", "\n", " conversation_id_str username \\\n", "0 1756382503865327916 kwaier_ \n", "1 1756460633535766878 MudiBanten \n", "2 1756460610106413545 jogjaeth \n", "3 1755207806284931256 Blokshia \n", "4 1756460470264090668 ofuku89 \n", ".. ... ... \n", "541 1755995641363534093 MsElva10530471 \n", "542 1755993623945220130 Aryadwiwarna1 \n", "543 1756093116237389849 Demokrat_TV \n", "544 1756005949641462263 vaneelya \n", "545 1755978097164517699 AbdulRozakUB \n", "\n", " tweet_url \n", "0 https://twitter.com/kwaier_/status/17564607401... \n", "1 https://twitter.com/MudiBanten/status/17564606... \n", "2 https://twitter.com/jogjaeth/status/1756460610... \n", "3 https://twitter.com/Blokshia/status/1756460540... \n", "4 https://twitter.com/ofuku89/status/17564604702... \n", ".. ... \n", "541 https://twitter.com/MsElva10530471/status/1756... \n", "542 https://twitter.com/Aryadwiwarna1/status/17560... \n", "543 https://twitter.com/Demokrat_TV/status/1756093... \n", "544 https://twitter.com/vaneelya/status/1756092936... \n", "545 https://twitter.com/AbdulRozakUB/status/175609... \n", "\n", "[546 rows x 12 columns]" ] }, "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(\"data_test.csv\")\n", "df" ] }, { "cell_type": "code", "execution_count": 86, "id": "e500eda1-311d-4691-9b41-553c57cac920", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
created_atfull_textuser_id_str
0Sat Feb 10 23:30:48 +0000 2024@geriisme Ini bisa basis partai, financing, ak...1613742953473900546
1Sat Feb 10 23:30:23 +0000 2024Hallo semuanya!!!! Tak terasa Pemilihan Legisl...1172089528393203712
2Sat Feb 10 23:30:17 +0000 20242025 Anis presiden Kader partai pendukungny...1482641858514022404
3Sat Feb 10 23:30:00 +0000 2024@calunaeruby @losta_masta_ @peachyvann @mapedo...1481882706233688064
4Sat Feb 10 23:29:44 +0000 2024Udah masuk masa tenang tapi cuma ngingetin, ca...83742072
............
541Fri Feb 09 23:11:32 +0000 2024@Rapdo5 @Miduk17 bukan hebat tp culas dan lici...1538715717838974976
542Fri Feb 09 23:11:21 +0000 2024@KakekHalal Kalau melalui proses Pemilu Langsu...1466979738560258049
543Fri Feb 09 23:10:00 +0000 2024Kenalan yuk dengan Caleg DPR RI Partai Demokra...284830933
544Fri Feb 09 23:09:17 +0000 2024@sunprintme aku tempelin sticker gede ya, stic...1742568163601166336
545Fri Feb 09 23:08:42 +0000 2024@Twillight_Loid @GusbacheV Hebat. Jokowi bener...1492510742817681409
\n", "

546 rows × 3 columns

\n", "
" ], "text/plain": [ " created_at \\\n", "0 Sat Feb 10 23:30:48 +0000 2024 \n", "1 Sat Feb 10 23:30:23 +0000 2024 \n", "2 Sat Feb 10 23:30:17 +0000 2024 \n", "3 Sat Feb 10 23:30:00 +0000 2024 \n", "4 Sat Feb 10 23:29:44 +0000 2024 \n", ".. ... \n", "541 Fri Feb 09 23:11:32 +0000 2024 \n", "542 Fri Feb 09 23:11:21 +0000 2024 \n", "543 Fri Feb 09 23:10:00 +0000 2024 \n", "544 Fri Feb 09 23:09:17 +0000 2024 \n", "545 Fri Feb 09 23:08:42 +0000 2024 \n", "\n", " full_text user_id_str \n", "0 @geriisme Ini bisa basis partai, financing, ak... 1613742953473900546 \n", "1 Hallo semuanya!!!! Tak terasa Pemilihan Legisl... 1172089528393203712 \n", "2 2025 Anis presiden Kader partai pendukungny... 1482641858514022404 \n", "3 @calunaeruby @losta_masta_ @peachyvann @mapedo... 1481882706233688064 \n", "4 Udah masuk masa tenang tapi cuma ngingetin, ca... 83742072 \n", ".. ... ... \n", "541 @Rapdo5 @Miduk17 bukan hebat tp culas dan lici... 1538715717838974976 \n", "542 @KakekHalal Kalau melalui proses Pemilu Langsu... 1466979738560258049 \n", "543 Kenalan yuk dengan Caleg DPR RI Partai Demokra... 284830933 \n", "544 @sunprintme aku tempelin sticker gede ya, stic... 1742568163601166336 \n", "545 @Twillight_Loid @GusbacheV Hebat. Jokowi bener... 
1492510742817681409 \n", "\n", "[546 rows x 3 columns]" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# .copy() yields an independent frame, so the column assignments made in later\n", "# cells do not trigger pandas' SettingWithCopyWarning.\n", "df = df[['created_at', 'full_text', 'user_id_str']].copy()\n", "df" ] }, { "cell_type": "code", "execution_count": 87, "id": "aa073590-2ad7-4e6e-bd1e-0eb5f1b7717c", "metadata": {}, "outputs": [], "source": [ "# 2.cleaning data" ] }, { "cell_type": "code", "execution_count": 88, "id": "78149731-c5e3-4cb4-a586-496c52963645", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(546, 3)" ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 89, "id": "5eb1c631-73e1-4332-9810-dd3572d6832b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "33" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.duplicated().sum()" ] }, { "cell_type": "code", "execution_count": 90, "id": "f5ce7f88-13f7-4dd6-a10c-b44614150dee", "metadata": {}, "outputs": [], "source": [ "# Keep the first occurrence of every tweet text; .copy() keeps the result an\n", "# independent frame (see the column-selection cell).\n", "df = df.drop_duplicates(subset=['full_text']).copy()" ] }, { "cell_type": "code", "execution_count": 91, "id": "a7dd506c-d8ed-4e2e-ba05-6a23b53cd881", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.duplicated().sum()" ] }, { "cell_type": "code", "execution_count": 92, "id": "1b0ef15a-be11-45ea-a7d1-29c908884f6a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "created_at 0\n", "full_text 0\n", "user_id_str 0\n", "dtype: int64" ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 93, "id": "4cb70598-a556-4918-9302-db96c085dd67", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(513, 3)" ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { 
"cell_type": "code", "execution_count": 94, "id": "cd4f2551-a87f-4bdc-a1ed-b0dc44b85421", "metadata": {}, "outputs": [], "source": [ "def clean_twitter_text(text):\n", "    \"\"\"Remove mentions, hashtags, the retweet marker, URLs and punctuation from a tweet.\n", "\n", "    Whitespace runs are collapsed to single spaces and the result is stripped.\n", "    Non-string values are returned unchanged.\n", "    \"\"\"\n", "    if not isinstance(text, str):\n", "        return text\n", "\n", "    text = re.sub(r'@[A-Za-z0-9_]+', '', text)\n", "    text = re.sub(r'#\\w+', '', text)\n", "    # \\b stops the pattern from eating the tail of words that end in RT (e.g. SMART)\n", "    text = re.sub(r'\\bRT\\s+', '', text)\n", "    text = re.sub(r'https?://\\S+', '', text)\n", "\n", "    # Keep every whitespace character here so that words separated only by a\n", "    # newline or tab are not glued together; the next step collapses the runs.\n", "    text = re.sub(r'[^A-Za-z0-9\\s]', '', text)\n", "    text = re.sub(r'\\s+', ' ', text)\n", "\n", "    return text.strip()\n", "\n", "# Apply the cleaner to every tweet\n", "df.loc[:, 'full_text'] = df['full_text'].apply(clean_twitter_text)\n" ] }, { "cell_type": "code", "execution_count": 95, "id": "c200e99f-2770-41dd-818f-a11d93429436", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
created_atfull_textuser_id_str
0Sat Feb 10 23:30:48 +0000 2024ini bisa basis partai financing akses ke figu...1613742953473900546
1Sat Feb 10 23:30:23 +0000 2024hallo semuanya tak terasa pemilihan legislatif...1172089528393203712
2Sat Feb 10 23:30:17 +0000 20242025 anis presiden kader partai pendukungnya k...1482641858514022404
3Sat Feb 10 23:30:00 +0000 2024u think hal ky gini pure dr paslon 02 mereka ...1481882706233688064
4Sat Feb 10 23:29:44 +0000 2024udah masuk masa tenang tapi cuma ngingetin cal...83742072
............
541Fri Feb 09 23:11:32 +0000 2024bukan hebat tp culas dan licik di angkat dr n...1538715717838974976
542Fri Feb 09 23:11:21 +0000 2024kalau melalui proses pemilu langsung menurut ...1466979738560258049
543Fri Feb 09 23:10:00 +0000 2024kenalan yuk dengan caleg dpr ri partai demokra...284830933
544Fri Feb 09 23:09:17 +0000 2024aku tempelin sticker gede ya sticker partai1742568163601166336
545Fri Feb 09 23:08:42 +0000 2024hebat jokowi bener2 serius buat bangsa ini be...1492510742817681409
\n", "

513 rows × 3 columns

\n", "
" ], "text/plain": [ " created_at \\\n", "0 Sat Feb 10 23:30:48 +0000 2024 \n", "1 Sat Feb 10 23:30:23 +0000 2024 \n", "2 Sat Feb 10 23:30:17 +0000 2024 \n", "3 Sat Feb 10 23:30:00 +0000 2024 \n", "4 Sat Feb 10 23:29:44 +0000 2024 \n", ".. ... \n", "541 Fri Feb 09 23:11:32 +0000 2024 \n", "542 Fri Feb 09 23:11:21 +0000 2024 \n", "543 Fri Feb 09 23:10:00 +0000 2024 \n", "544 Fri Feb 09 23:09:17 +0000 2024 \n", "545 Fri Feb 09 23:08:42 +0000 2024 \n", "\n", " full_text user_id_str \n", "0 ini bisa basis partai financing akses ke figu... 1613742953473900546 \n", "1 hallo semuanya tak terasa pemilihan legislatif... 1172089528393203712 \n", "2 2025 anis presiden kader partai pendukungnya k... 1482641858514022404 \n", "3 u think hal ky gini pure dr paslon 02 mereka ... 1481882706233688064 \n", "4 udah masuk masa tenang tapi cuma ngingetin cal... 83742072 \n", ".. ... ... \n", "541 bukan hebat tp culas dan licik di angkat dr n... 1538715717838974976 \n", "542 kalau melalui proses pemilu langsung menurut ... 1466979738560258049 \n", "543 kenalan yuk dengan caleg dpr ri partai demokra... 284830933 \n", "544 aku tempelin sticker gede ya sticker partai 1742568163601166336 \n", "545 hebat jokowi bener2 serius buat bangsa ini be... 1492510742817681409 \n", "\n", "[513 rows x 3 columns]" ] }, "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[:, 'full_text'] = df['full_text'].str.lower()\n", "df" ] }, { "cell_type": "code", "execution_count": 96, "id": "6cf0e8d9-5364-4c8b-8d4e-5b6580b496d1", "metadata": {}, "outputs": [], "source": [ "# 3.prepocessing data" ] }, { "cell_type": "code", "execution_count": 97, "id": "6ce3bf2a-c24b-4dec-85f8-aaab10b1316f", "metadata": {}, "outputs": [], "source": [ "# 3.1 Normalisasi" ] }, { "cell_type": "code", "execution_count": 98, "id": "864462f3-d623-4c73-aa98-42b34adf39b4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
created_atfull_textuser_id_str
0Sat Feb 10 23:30:48 +0000 2024ini bisa basis partai financing akses ke figu...1613742953473900546
1Sat Feb 10 23:30:23 +0000 2024hallo semuanya tak terasa pemilihan legislatif...1172089528393203712
2Sat Feb 10 23:30:17 +0000 20242025 anis presiden kader partai pendukungnya k...1482641858514022404
3Sat Feb 10 23:30:00 +0000 2024u think hal ky gini pure dr paslon 02 mereka ...1481882706233688064
4Sat Feb 10 23:29:44 +0000 2024udah masuk masa tenang tapi cuma ngingetin cal...83742072
............
541Fri Feb 09 23:11:32 +0000 2024bukan hebat tp culas dan licik di angkat dr n...1538715717838974976
542Fri Feb 09 23:11:21 +0000 2024kalau melalui proses pemilu langsung menurut ...1466979738560258049
543Fri Feb 09 23:10:00 +0000 2024kenalan yuk dengan caleg dpr ri partai demokra...284830933
544Fri Feb 09 23:09:17 +0000 2024aku tempelin sticker gede ya sticker partai1742568163601166336
545Fri Feb 09 23:08:42 +0000 2024hebat jokowi bener2 serius buat bangsa ini be...1492510742817681409
\n", "

513 rows × 3 columns

\n", "
" ], "text/plain": [ " created_at \\\n", "0 Sat Feb 10 23:30:48 +0000 2024 \n", "1 Sat Feb 10 23:30:23 +0000 2024 \n", "2 Sat Feb 10 23:30:17 +0000 2024 \n", "3 Sat Feb 10 23:30:00 +0000 2024 \n", "4 Sat Feb 10 23:29:44 +0000 2024 \n", ".. ... \n", "541 Fri Feb 09 23:11:32 +0000 2024 \n", "542 Fri Feb 09 23:11:21 +0000 2024 \n", "543 Fri Feb 09 23:10:00 +0000 2024 \n", "544 Fri Feb 09 23:09:17 +0000 2024 \n", "545 Fri Feb 09 23:08:42 +0000 2024 \n", "\n", " full_text user_id_str \n", "0 ini bisa basis partai financing akses ke figu... 1613742953473900546 \n", "1 hallo semuanya tak terasa pemilihan legislatif... 1172089528393203712 \n", "2 2025 anis presiden kader partai pendukungnya k... 1482641858514022404 \n", "3 u think hal ky gini pure dr paslon 02 mereka ... 1481882706233688064 \n", "4 udah masuk masa tenang tapi cuma ngingetin cal... 83742072 \n", ".. ... ... \n", "541 bukan hebat tp culas dan licik di angkat dr n... 1538715717838974976 \n", "542 kalau melalui proses pemilu langsung menurut ... 1466979738560258049 \n", "543 kenalan yuk dengan caleg dpr ri partai demokra... 284830933 \n", "544 aku tempelin sticker gede ya sticker partai 1742568163601166336 \n", "545 hebat jokowi bener2 serius buat bangsa ini be... 1492510742817681409 \n", "\n", "[513 rows x 3 columns]" ] }, "execution_count": 98, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 99, "id": "50f2c622-b3d9-4914-b634-7a55ae358def", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
created_atfull_textuser_id_str
0Sat Feb 10 23:30:48 +0000 2024ini bisa basis partai financing akses ke figur...1613742953473900546
1Sat Feb 10 23:30:23 +0000 2024hallo semuanya tak terasa pemilihan legislatif...1172089528393203712
2Sat Feb 10 23:30:17 +0000 20242025 anis presiden kader partai pendukungnya k...1482641858514022404
3Sat Feb 10 23:30:00 +0000 2024u think hal ky begini pure dari pasangan calon...1481882706233688064
4Sat Feb 10 23:29:44 +0000 2024sudah masuk masa tenang tapi cuma ngingetin ca...83742072
............
541Fri Feb 09 23:11:32 +0000 2024bukan hebat tetapi culas dan licik di angkat d...1538715717838974976
542Fri Feb 09 23:11:21 +0000 2024kalau melalui proses pemilihan umum langsung m...1466979738560258049
543Fri Feb 09 23:10:00 +0000 2024kenalan yuk dengan calon legislatif dewan perw...284830933
544Fri Feb 09 23:09:17 +0000 2024aku tempelin sticker besar ya sticker partai1742568163601166336
545Fri Feb 09 23:08:42 +0000 2024hebat jokowi bener2 serius buat bangsa ini bel...1492510742817681409
\n", "

513 rows × 3 columns

\n", "
" ], "text/plain": [ " created_at \\\n", "0 Sat Feb 10 23:30:48 +0000 2024 \n", "1 Sat Feb 10 23:30:23 +0000 2024 \n", "2 Sat Feb 10 23:30:17 +0000 2024 \n", "3 Sat Feb 10 23:30:00 +0000 2024 \n", "4 Sat Feb 10 23:29:44 +0000 2024 \n", ".. ... \n", "541 Fri Feb 09 23:11:32 +0000 2024 \n", "542 Fri Feb 09 23:11:21 +0000 2024 \n", "543 Fri Feb 09 23:10:00 +0000 2024 \n", "544 Fri Feb 09 23:09:17 +0000 2024 \n", "545 Fri Feb 09 23:08:42 +0000 2024 \n", "\n", " full_text user_id_str \n", "0 ini bisa basis partai financing akses ke figur... 1613742953473900546 \n", "1 hallo semuanya tak terasa pemilihan legislatif... 1172089528393203712 \n", "2 2025 anis presiden kader partai pendukungnya k... 1482641858514022404 \n", "3 u think hal ky begini pure dari pasangan calon... 1481882706233688064 \n", "4 sudah masuk masa tenang tapi cuma ngingetin ca... 83742072 \n", ".. ... ... \n", "541 bukan hebat tetapi culas dan licik di angkat d... 1538715717838974976 \n", "542 kalau melalui proses pemilihan umum langsung m... 1466979738560258049 \n", "543 kenalan yuk dengan calon legislatif dewan perw... 284830933 \n", "544 aku tempelin sticker besar ya sticker partai 1742568163601166336 \n", "545 hebat jokowi bener2 serius buat bangsa ini bel... 
1492510742817681409 \n", "\n", "[513 rows x 3 columns]" ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Slang / abbreviation -> standard form. Every key appears once, and all values are\n", "# lowercase because the text has already been lowercased before this step.\n", "norm = {\n",
"    'emang': 'memang',\n", "    'yg': 'yang',\n", "    'tau': 'tahu',\n", "    'itu': 'itu',\n", "    'ngga': 'tidak',\n", "    'gak': 'tidak',\n",
"    'udah': 'sudah',\n", "    'ngomong': 'berbicara',\n", "    'sampe': 'sampai',\n", "    'tsb': 'tersebut',\n", "    'aja': 'saja',\n",
"    'gaess': 'teman-teman',\n", "    'gue': 'saya',\n", "    'bgt': 'banget',\n", "    'bro': 'sobat',\n", "    'loh': 'lho',\n", "    'klo': 'kalau',\n",
"    'gtu': 'begitu',\n", "    'wkwkwk': 'tertawa',\n", "    'mbak': 'kakak perempuan',\n", "    'masnya': 'kakak laki-laki',\n", "    'nih': 'ini',\n",
"    'sbg': 'sebagai',\n", "    'ampun': 'maaf',\n", "    'tp': 'tetapi',\n", "    'krna': 'karena',\n", "    'jd': 'jadi',\n", "    'kl': 'kalau',\n",
"    'klh': 'kalah',\n", "    'bs': 'bisa',\n", "    'dlu': 'dulu',\n", "    'cm': 'hanya',\n", "    'ntn': 'menonton',\n", "    'blm': 'belum',\n",
"    'klu': 'kalau',\n", "    'skrg': 'sekarang',\n", "    'mu': 'dirimu',\n", "    'kmu': 'kamu',\n", "    'dgn': 'dengan',\n", "    'nyinyir': 'bercanda',\n",
"    'sosmed': 'media sosial',\n", "    'mk': 'mahkamah konstitusi',\n", "    'nyerah': 'menyerah',\n", "    'ngetren': 'populer',\n", "    'kwkwk': 'tertawa',\n",
"    'klw': 'kalau',\n", "    'gmn': 'bagaimana',\n", "    'gaada': 'tidak ada',\n", "    'mjd': 'menjadi',\n", "    'yaa': 'ya',\n", "    'jg': 'juga',\n",
"    'biar': 'agar',\n", "    'masi': 'masih',\n", "    'jgn': 'jangan',\n", "    'emg': 'memang',\n", "    'hmm': 'hmm',\n", "    'bodoamat': 'tidak peduli',\n",
"    'kayak': 'seperti',\n", "    'apapun': 'apapun',\n", "    'ga': 'tidak',\n", "    'muji': 'memuji',\n", "    'td': 'tadi',\n", "    'napa': 'kenapa',\n",
"    'ketum': 'ketua umum',\n", "    'ngegas': 'bersikap tegas',\n", "    'bener': 'benar',\n", "    'lg': 'lagi',\n", "    'skrng': 'sekarang',\n",
"    'knp': 'kenapa',\n", "    'yaudah': 'ya sudah',\n", "    'tdk': 'tidak',\n", "    'pdhl': 'padahal',\n", "    'bngt': 'banget',\n", "    'kasian': 'kasihan',\n",
"    'dasar': 'dasar',\n", "    'akuny': 'akunnya',\n", "    'kok': 'kenapa',\n", "    'paslon': 'pasangan calon',\n", "    'pemilu': 'pemilihan umum',\n",
"    'pilpres': 'pemilihan presiden',\n", "    'caleg': 'calon legislatif',\n", "    'timses': 'tim sukses',\n", "    'tpi': 'tetapi',\n", "    'lah': 'lah',\n",
"    'ngaku': 'mengaku',\n", "    'dpr': 'dewan perwakilan rakyat',\n", "    'dprd': 'dewan perwakilan rakyat daerah',\n", "    'dpd': 'dewan perwakilan daerah',\n",
"    'ngasih': 'memberikan',\n", "    'doang': 'saja',\n", "    'pdip': 'partai demokrasi indonesia perjuangan',\n", "    'kepantasan': 'kelayakan',\n",
"    'jaman': 'zaman',\n", "    'rebo': 'rabu',\n", "    'tmsk': 'termasuk',\n", "    'lu': 'kamu',\n", "    'palingan': 'paling',\n", "    'lebih': 'lebih',\n",
"    'jelasin': 'jelaskan',\n", "    'ini': 'ini',\n", "    'kalimatnya': 'kalimatnya',\n", "    'sm': 'sama',\n", "    'sklrg': 'sekarang',\n",
"    'diatas': 'di atas',\n", "    'bnyk': 'banyak',\n", "    'bocor': 'bocor',\n", "    'sbb': 'sebab',\n", "    'bgst': 'bangsat',\n", "    'pd': 'pada',\n",
"    'sma': 'sama',\n", "    'bego': 'bodoh',\n", "    'sbgi': 'sebagai',\n", "    'gitu': 'begitu',\n", "    'lucu2': 'lucu-lucu',\n", "    'sih': 'sih',\n",
"    'mikir': 'berpikir',\n", "    'gini': 'begini',\n", "    'apaan': 'apa',\n", "    'kgk': 'tidak',\n", "    'dr': 'dari',\n", "    'tuk': 'untuk',\n",
"    'nah': 'nah',\n", "    'nntn': 'menonton',\n", "    'tahka': 'tahta',\n", "    'ngapa': 'mengapa',\n", "    'isteri': 'istri',\n", "    'alm': 'almarhum',\n",
"    'utk': 'untuk',\n", "    'btw': 'omong-omong',\n", "    'pks': 'partai keadilan sejahtera',\n", "    'ngebolehin': 'mengizinkan',\n", "    'ttg': 'tentang',\n",
"    'gede': 'besar',\n", "    'rebu': 'ribu',\n", "}\n", "\n",
"def normalisasi_text(text):\n", "    \"\"\"Replace slang/abbreviated tokens in `text` with their `norm` entries.\n", "\n", "    The text is split on whitespace, every token is looked up in `norm`\n", "    (tokens without an entry are kept as they are) and the tokens are\n", "    re-joined with single spaces. Non-string values are returned unchanged.\n", "    \"\"\"\n", "    if not isinstance(text, str):\n", "        return text\n", "    return ' '.join(norm.get(word, word) for word in text.split())\n",
"\n", "# Normalise the 'full_text' column\n", "df.loc[:, 'full_text'] = df['full_text'].apply(normalisasi_text)\n", "df\n" ] }, { "cell_type": "code", "execution_count": 100, "id": "c0e99d53-41b7-41b7-af54-1a61365545b1", "metadata": {}, "outputs": [], "source": [ "#3.2 Stopword" ] }, { "cell_type": "code", "execution_count": 101, "id": "a276f5b9-5033-4028-801a-f2ba96ca53ee", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: Sastrawi in c:\\laragon\\bin\\python\\python-3.10\\lib\\site-packages (1.0.1)\n" ] } ], "source": [ "!pip install Sastrawi" ] }, { "cell_type": "code", "execution_count": 102, "id": "d08abf95-4264-41de-8075-5b37ce43847a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
created_atfull_textuser_id_str
0Sat Feb 10 23:30:48 +0000 2024bisa basis partai financing akses figure milit...1613742953473900546
1Sat Feb 10 23:30:23 +0000 2024hallo semuanya tak terasa pemilihan legislatif...1172089528393203712
2Sat Feb 10 23:30:17 +0000 20242025 anis presiden kader partai pendukungnya k...1482641858514022404
3Sat Feb 10 23:30:00 +0000 2024u think ky begini pure pasangan calon 02 kan p...1481882706233688064
4Sat Feb 10 23:29:44 +0000 2024masuk masa tenang cuma ngingetin calon legisla...83742072
............
541Fri Feb 09 23:11:32 +0000 2024bukan hebat culas licik angkat nol smpai berku...1538715717838974976
542Fri Feb 09 23:11:21 +0000 2024kalau melalui proses pemilihan umum langsung s...1466979738560258049
543Fri Feb 09 23:10:00 +0000 2024kenalan yuk calon legislatif dewan perwakilan ...284830933
544Fri Feb 09 23:09:17 +0000 2024aku tempelin sticker besar sticker partai1742568163601166336
545Fri Feb 09 23:08:42 +0000 2024hebat jokowi bener2 serius buat bangsa beliyau...1492510742817681409
\n", "

513 rows × 3 columns

\n", "
" ], "text/plain": [ " created_at \\\n", "0 Sat Feb 10 23:30:48 +0000 2024 \n", "1 Sat Feb 10 23:30:23 +0000 2024 \n", "2 Sat Feb 10 23:30:17 +0000 2024 \n", "3 Sat Feb 10 23:30:00 +0000 2024 \n", "4 Sat Feb 10 23:29:44 +0000 2024 \n", ".. ... \n", "541 Fri Feb 09 23:11:32 +0000 2024 \n", "542 Fri Feb 09 23:11:21 +0000 2024 \n", "543 Fri Feb 09 23:10:00 +0000 2024 \n", "544 Fri Feb 09 23:09:17 +0000 2024 \n", "545 Fri Feb 09 23:08:42 +0000 2024 \n", "\n", " full_text user_id_str \n", "0 bisa basis partai financing akses figure milit... 1613742953473900546 \n", "1 hallo semuanya tak terasa pemilihan legislatif... 1172089528393203712 \n", "2 2025 anis presiden kader partai pendukungnya k... 1482641858514022404 \n", "3 u think ky begini pure pasangan calon 02 kan p... 1481882706233688064 \n", "4 masuk masa tenang cuma ngingetin calon legisla... 83742072 \n", ".. ... ... \n", "541 bukan hebat culas licik angkat nol smpai berku... 1538715717838974976 \n", "542 kalau melalui proses pemilihan umum langsung s... 1466979738560258049 \n", "543 kenalan yuk calon legislatif dewan perwakilan ... 284830933 \n", "544 aku tempelin sticker besar sticker partai 1742568163601166336 \n", "545 hebat jokowi bener2 serius buat bangsa beliyau... 
1492510742817681409 \n", "\n", "[513 rows x 3 columns]" ] }, "execution_count": 102, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary\n", "\n", "# Extra stop words appended to Sastrawi's built-in list\n", "more_stop_words = [\"tidak\"]\n", "\n", "# Built-in stop words from the factory, followed by the extra ones\n", "factory = StopWordRemoverFactory()\n", "stop_words = factory.get_stop_words() + more_stop_words\n", "\n", "# Remover backed by the combined stop-word list\n", "new_array = ArrayDictionary(stop_words)\n", "stop_word_remover = StopWordRemover(new_array)\n", "\n", "def remove_stop_words(text):\n", "    \"\"\"Return `text` after passing it through `stop_word_remover.remove`.\"\"\"\n", "    cleaned = stop_word_remover.remove(text)\n", "    return cleaned\n", "\n", "# Assign through .loc (the original note says this is meant to avoid SettingWithCopyWarning)\n", "df.loc[:, 'full_text'] = df['full_text'].apply(remove_stop_words)\n", "df\n" ] }, { "cell_type": "code", "execution_count": 103, "id": "f2d91400-10b1-43da-a8ce-f30ea6b031c7", "metadata": {}, "outputs": [], "source": [ "# 3.3 Tokenize" ] }, { "cell_type": "code", "execution_count": 104, "id": "054bc36a-81c6-4a83-81a2-cb7440a65bcf", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to\n", "[nltk_data] C:\\Users\\Rizqi\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", "C:\\Users\\Rizqi\\AppData\\Local\\Temp\\ipykernel_1452\\2370743566.py:8: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df.loc[:, 'tokenized'] = df['full_text'].apply(word_tokenize)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
full_texttokenized
0bisa basis partai financing akses figure milit...[bisa, basis, partai, financing, akses, figure...
1hallo semuanya tak terasa pemilihan legislatif...[hallo, semuanya, tak, terasa, pemilihan, legi...
22025 anis presiden kader partai pendukungnya k...[2025, anis, presiden, kader, partai, pendukun...
3u think ky begini pure pasangan calon 02 kan p...[u, think, ky, begini, pure, pasangan, calon, ...
4masuk masa tenang cuma ngingetin calon legisla...[masuk, masa, tenang, cuma, ngingetin, calon, ...
.........
541bukan hebat culas licik angkat nol smpai berku...[bukan, hebat, culas, licik, angkat, nol, smpa...
542kalau melalui proses pemilihan umum langsung s...[kalau, melalui, proses, pemilihan, umum, lang...
543kenalan yuk calon legislatif dewan perwakilan ...[kenalan, yuk, calon, legislatif, dewan, perwa...
544aku tempelin sticker besar sticker partai[aku, tempelin, sticker, besar, sticker, partai]
545hebat jokowi bener2 serius buat bangsa beliyau...[hebat, jokowi, bener2, serius, buat, bangsa, ...
\n", "

513 rows × 2 columns

\n", "
" ], "text/plain": [ " full_text \\\n", "0 bisa basis partai financing akses figure milit... \n", "1 hallo semuanya tak terasa pemilihan legislatif... \n", "2 2025 anis presiden kader partai pendukungnya k... \n", "3 u think ky begini pure pasangan calon 02 kan p... \n", "4 masuk masa tenang cuma ngingetin calon legisla... \n", ".. ... \n", "541 bukan hebat culas licik angkat nol smpai berku... \n", "542 kalau melalui proses pemilihan umum langsung s... \n", "543 kenalan yuk calon legislatif dewan perwakilan ... \n", "544 aku tempelin sticker besar sticker partai \n", "545 hebat jokowi bener2 serius buat bangsa beliyau... \n", "\n", " tokenized \n", "0 [bisa, basis, partai, financing, akses, figure... \n", "1 [hallo, semuanya, tak, terasa, pemilihan, legi... \n", "2 [2025, anis, presiden, kader, partai, pendukun... \n", "3 [u, think, ky, begini, pure, pasangan, calon, ... \n", "4 [masuk, masa, tenang, cuma, ngingetin, calon, ... \n", ".. ... \n", "541 [bukan, hebat, culas, licik, angkat, nol, smpa... \n", "542 [kalau, melalui, proses, pemilihan, umum, lang... \n", "543 [kenalan, yuk, calon, legislatif, dewan, perwa... \n", "544 [aku, tempelin, sticker, besar, sticker, partai] \n", "545 [hebat, jokowi, bener2, serius, buat, bangsa, ... 
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd

# Make sure the Punkt data used by word_tokenize is available locally.
nltk.download('punkt')

# `df` was derived from another DataFrame, so writing a new column into it
# raises SettingWithCopyWarning even when the write goes through .loc.
# assign() builds a new, independent frame that carries the extra column, and
# rebinding `df` to it keeps this cell safe to re-run.
df = df.assign(tokenized=df['full_text'].map(word_tokenize))

# Show each text next to its token list.
df[['full_text', 'tokenized']]
kampret kamu korupsi kalau jelek bilang jelek kami cebokin ya tetep korupsi\n", "\n", "Sebelum stemming: u think ky begini pure pasangan calon 02 kan punya timsespunya partai dn mereka punya gmna konsep kampanye berkedok sedekah omong-omong literasi dh gk cuma konsep 02 0103 sama lagian dulu metode berbagi ky begini memang udh\n", "Sesudah stemming: u think ky begini pure pasang calon 02 kan punya timsespunya partai dn mereka punya gmna konsep kampanye kedok sedekah omong literasi dh gk cuma konsep 02 0103 sama lagi dulu metode bagi ky begini memang udh\n", "\n", "Sebelum stemming: masuk masa tenang cuma ngingetin calon legislatif tu anggota partai jadi akan bekerja tujuan partai fitrahnya memang dah saja selamat mencoblos\n", "Sesudah stemming: masuk masa tenang cuma ngingetin calon legislatif tu anggota partai jadi akan kerja tuju partai fitrah memang dah saja selamat coblos\n", "\n", "Sebelum stemming: komposisi non muslim besar 03 menang ln partai demokrasi indonesia perjuangan menang partai memberi cek kosong jokowi sekarag rasakaan akibatnya\n", "Sesudah stemming: komposisi non muslim besar 03 menang ln partai demokrasi indonesia juang menang partai beri cek kosong jokowi sekarag rasakaan akibat\n", "\n", "Sebelum stemming: bersama partai demokrat wujudkan kesejahteraan bersama bisa ricky kurniawan chairul dewan perwakilan rakyat daerah provinsi banten dapil tangerang c\n", "Sesudah stemming: sama partai demokrat wujud sejahtera sama bisa ricky kurniawan chairul dewan wakil rakyat daerah provinsi banten dapil tangerang c\n", "\n", "Sebelum stemming: sejarah mencatat ada pemilihan presiden didukung partai tersandera kasus korupsi melanggar etik berat paman mahkamah konstitusi paman kpu melakukan pembagian bansos bukan kepentingan rakyat ingat dosa ditanggung pendukung kezoliman\n", "Sesudah stemming: sejarah catat ada pilih presiden dukung partai sandera kasus korupsi langgar etik berat paman mahkamah konstitusi paman kpu laku bagi bansos bukan penting rakyat 
from functools import lru_cache

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import pandas as pd


@lru_cache(maxsize=1)
def _get_stemmer():
    """Create the Sastrawi stemmer on first use and reuse it afterwards.

    The original code built a new factory and stemmer for every row; caching
    the instance removes that repeated setup without changing the output.
    """
    return StemmerFactory().create_stemmer()


def stemming(text):
    """Stem a sentence with the Sastrawi stemmer.

    Args:
        text: The sentence to stem.

    Returns:
        Whatever ``stemmer.stem(text)`` returns for the given sentence.
    """
    return _get_stemmer().stem(text)


# Keep only the columns needed downstream. The explicit copy detaches the
# result from the frame it was sliced from, so the column assignment below
# does not trigger SettingWithCopyWarning.
df = df[['created_at', 'full_text', 'user_id_str']].copy()

# Stem every tweet.
df['stemmed_text'] = df['full_text'].apply(stemming)

# Persist the prepared (unlabeled) data set for the prediction cells.
df.to_csv('data_test_fix.csv', index=False)

# Print a before/after comparison for the first 10 rows.
stemmed_df = df.head(10)
for _, row in stemmed_df.iterrows():
    print("Sebelum stemming:", row['full_text'])
    print("Sesudah stemming:", row['stemmed_text'])
    print()
\n", "1299 Fri Feb 09 23:25:58 +0000 2024 \n", "1300 Fri Feb 09 23:25:56 +0000 2024 \n", "1301 Fri Feb 09 23:23:59 +0000 2024 \n", "1302 Fri Feb 09 23:23:52 +0000 2024 \n", "1303 Fri Feb 09 23:23:29 +0000 2024 \n", "\n", " full_text user_id_str \\\n", "0 tahu partai nya gabener di pilih hadeh 1489144415021322242 \n", "1 bukan tahun 2000an awal orang2 Partai Keadilan... 1358428069845889026 \n", "2 imagine invalidating someones fear and calling... 1334684348184887297 \n", "3 jangan lupa yah teman-teman 2024 pilih partai ... 1679186924685385728 \n", "4 h3 pemilihan umum heran seakan2 jokowi dosanya... 171050686 \n", "... ... ... \n", "1299 tahu orang tidak pernah mengkampanyekan partai... 1172134585502588929 \n", "1300 zarr tahu hati kecil kamu mengakui yang benar ... 1017062768631898113 \n", "1301 pilih partai buruh eti 226128927 \n", "1302 coba kau tanya presiden jokowiyuwono ibu megaw... 1487765772130992129 \n", "1303 terkesan ketua partai politik presiden atas pr... 1466979738560258049 \n", "\n", " stemmed_text label_text_number \n", "0 tahu partai nya gabener di pilih hadeh negatif \n", "1 bukan tahun 2000an awal orang2 partai adil sej... negatif \n", "2 imagine invalidating someones fear and calling... netral \n", "3 jangan lupa yah teman 2024 pilih partai ummat ... positif \n", "4 h3 pilih umum heran seakan2 jokowi dosa paling... positif \n", "... ... ... \n", "1299 tahu orang tidak pernah kampanye partai nya do... negatif \n", "1300 zarr tahu hati kecil kamu aku yang benar ayo p... netral \n", "1301 pilih partai buruh eti positif \n", "1302 coba kau tanya presiden jokowiyuwono ibu megaw... negatif \n", "1303 kes ketua partai politik presiden atas preside... 
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix

# ---------------------------------------------------------------------------
# Step 1: convert the numeric labels of the hand-labeled data to text labels.
# ---------------------------------------------------------------------------
data = pd.read_csv("labeled_data.csv", index_col=False)

# Map the numeric codes to sentiment names; values that are already text are left as they are.
data['label_text_number'] = data['label_text_number'].replace({1: 'positif', 2: 'negatif', 3: 'netral'})

# Every later cell reads the labels from a column called 'label_text'.
# Writing the file with the old column name makes a fresh run fail with
# KeyError, so rename it here (unless such a column already exists).
if 'label_text' not in data.columns:
    data = data.rename(columns={'label_text_number': 'label_text'})

print(data)

data.to_csv('labeled_data_updated.csv', index=False)

# ---------------------------------------------------------------------------
# Step 2: train Multinomial Naive Bayes on TF-IDF features and predict the
# sentiment of the unlabeled tweets.
# ---------------------------------------------------------------------------
data_train = pd.read_csv('labeled_data_updated.csv', index_col=False)
data_test = pd.read_csv('data_test_fix.csv', index_col=False)

# A text that is empty is read back from CSV as NaN, which TfidfVectorizer
# rejects; treat those rows as empty documents instead.
data_train['stemmed_text'] = data_train['stemmed_text'].fillna('')
data_test['stemmed_text'] = data_test['stemmed_text'].fillna('')

# Naive Bayes with smoothing alpha=0.5 and uniform class priors.
model = MultinomialNB(alpha=0.5, fit_prior=False)

# Vectorize with unigrams + bigrams (capped at 10000 features), then classify.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 2))),
    ('clf', model),
])

# Hold out 30% of the labeled data for validation.
X_train, X_val, y_train, y_val = train_test_split(
    data_train['stemmed_text'],
    data_train['label_text'],
    test_size=0.3,
    random_state=42,
)

# Fit on the training split and evaluate on the validation split.
pipeline.fit(X_train, y_train)
val_predictions = pipeline.predict(X_val)

print("Confusion Matrix:")
print(confusion_matrix(y_val, val_predictions))

accuracy = (val_predictions == y_val).mean()
print(f'Validation Accuracy: {accuracy}')

# Refit on all labeled data before predicting the unlabeled tweets.
pipeline.fit(data_train['stemmed_text'], data_train['label_text'])
predictions = pipeline.predict(data_test['stemmed_text'])

# Store the predictions next to the tweets and save them.
data_test['label_text'] = predictions
data_test.to_csv('data_test_with_predictions_higher_accuracy.csv', index=False)

print("Prediksi untuk data uji telah disimpan dalam 'data_test_with_predictions_higher_accuracy.csv'")
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Load the labeled training data and the unlabeled tweets to predict.
data_train = pd.read_csv('labeled_data_updated.csv', index_col=False)
data_test = pd.read_csv('data_test_fix.csv', index_col=False)

# A text that is empty is read back from CSV as NaN, which TfidfVectorizer
# rejects and which breaks " ".join below; treat those rows as empty strings.
data_train['stemmed_text'] = data_train['stemmed_text'].fillna('')
data_test['stemmed_text'] = data_test['stemmed_text'].fillna('')

# Naive Bayes with smoothing alpha=0.5 and uniform class priors.
model = MultinomialNB(alpha=0.5, fit_prior=False)

# Vectorize with unigrams + bigrams (capped at 10000 features), then classify.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 2))),
    ('clf', model),
])

# Hold out 30% of the labeled data for validation.
X_train, X_val, y_train, y_val = train_test_split(
    data_train['stemmed_text'],
    data_train['label_text'],
    test_size=0.3,
    random_state=42,
)

# Fit on the training split and evaluate on the validation split.
pipeline.fit(X_train, y_train)
val_predictions = pipeline.predict(X_val)

print("Confusion Matrix:")
print(confusion_matrix(y_val, val_predictions))

accuracy = (val_predictions == y_val).mean()
print(f'Validation Accuracy: {accuracy}')

# Refit on all labeled data before predicting the unlabeled tweets.
pipeline.fit(data_train['stemmed_text'], data_train['label_text'])
predictions = pipeline.predict(data_test['stemmed_text'])

# Store the predictions next to the tweets and save them.
data_test['label_text'] = predictions
data_test.to_csv('data_test_with_predictions_higher_accuracy.csv', index=False)

print("Prediksi untuk data uji telah disimpan dalam 'data_test_with_predictions_higher_accuracy.csv'")

# Concatenate the stemmed tweets of each predicted sentiment into one document.
netral_text = " ".join(data_test[data_test['label_text'] == 'netral']['stemmed_text'])
positif_text = " ".join(data_test[data_test['label_text'] == 'positif']['stemmed_text'])
negatif_text = " ".join(data_test[data_test['label_text'] == 'negatif']['stemmed_text'])


def generate_tfidf_wordcloud(texts, title, colormap, filename, ngram_range=(1, 2)):
    """Save a word cloud whose word sizes follow summed TF-IDF scores.

    Args:
        texts: List of documents to vectorize.
        title: Title drawn above the word cloud.
        colormap: Matplotlib colormap name passed to WordCloud.
        filename: Output path without extension; ".png" is appended.
        ngram_range: n-gram range for the TfidfVectorizer (defaults to the
            unigram + bigram setting this cell used before).

    Returns:
        None. The image is written to ``{filename}.png``; nothing is written
        when every document is blank.
    """
    # With no text at all the vectorizer would raise an empty-vocabulary
    # error, so skip the plot instead of aborting the whole cell.
    if not any(text.strip() for text in texts):
        print(f"Tidak ada teks untuk '{title}', word cloud dilewati")
        return

    vectorizer = TfidfVectorizer(max_features=10000, ngram_range=ngram_range)
    tfidf_matrix = vectorizer.fit_transform(texts)
    # Sum each term's score over all documents to get one weight per term.
    tfidf_scores = dict(zip(vectorizer.get_feature_names_out(), tfidf_matrix.sum(axis=0).tolist()[0]))
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        colormap=colormap,
        collocations=False,
    ).generate_from_frequencies(tfidf_scores)

    # Draw on an explicit figure and close it afterwards; plt.clf() left an
    # empty figure open that was then rendered as "0 Axes" cell output.
    fig, ax = plt.subplots()
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title)
    fig.savefig(f"{filename}.png")
    plt.close(fig)


# Save one word cloud per sentiment.
generate_tfidf_wordcloud([netral_text], 'Word Cloud untuk Sentimen Netral', 'viridis', 'wordcloud_netral')
generate_tfidf_wordcloud([positif_text], 'Word Cloud untuk Sentimen Positif', 'plasma', 'wordcloud_positif')
generate_tfidf_wordcloud([negatif_text], 'Word Cloud untuk Sentimen Negatif', 'inferno', 'wordcloud_negatif')
"output_type": "stream", "text": [ "Confusion Matrix:\n", "[[138 27 16]\n", " [ 62 39 23]\n", " [ 40 13 34]]\n", "Validation Accuracy: 0.5382653061224489\n", "Prediksi untuk data uji telah disimpan dalam 'data_test_with_predictions_higher_accuracy.csv'\n", "Fitur TF-IDF telah disimpan dalam 'tfidf_features.csv'\n" ] }, { "data": { "text/plain": [ "
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# ---------------------------------------------------------------------------
# Unigram-only variant of the Naive Bayes run, plus TF-IDF feature export.
# ---------------------------------------------------------------------------
data_train = pd.read_csv('labeled_data_updated.csv', index_col=False)
data_test = pd.read_csv('data_test_fix.csv', index_col=False)

# A text that is empty is read back from CSV as NaN, which TfidfVectorizer
# rejects and which breaks " ".join below; treat those rows as empty strings.
data_train['stemmed_text'] = data_train['stemmed_text'].fillna('')
data_test['stemmed_text'] = data_test['stemmed_text'].fillna('')

# Naive Bayes with smoothing alpha=0.5 and uniform class priors.
model = MultinomialNB(alpha=0.5, fit_prior=False)

# Vectorize with unigrams only (capped at 10000 features), then classify.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 1))),
    ('clf', model),
])

# Hold out 30% of the labeled data for validation.
X_train, X_val, y_train, y_val = train_test_split(
    data_train['stemmed_text'],
    data_train['label_text'],
    test_size=0.3,
    random_state=42,
)

# Fit on the training split and evaluate on the validation split.
pipeline.fit(X_train, y_train)
val_predictions = pipeline.predict(X_val)

print("Confusion Matrix:")
print(confusion_matrix(y_val, val_predictions))

accuracy = (val_predictions == y_val).mean()
print(f'Validation Accuracy: {accuracy}')

# Refit on all labeled data before predicting the unlabeled tweets.
pipeline.fit(data_train['stemmed_text'], data_train['label_text'])
predictions = pipeline.predict(data_test['stemmed_text'])

# Store the predictions next to the tweets and save them.
data_test['label_text'] = predictions
data_test.to_csv('data_test_with_predictions_higher_accuracy.csv', index=False)

print("Prediksi untuk data uji telah disimpan dalam 'data_test_with_predictions_higher_accuracy.csv'")

# Export every vocabulary term with its TF-IDF score summed over the training documents.
vectorizer = pipeline.named_steps['tfidf']
feature_names = vectorizer.get_feature_names_out()
tfidf_matrix = vectorizer.transform(data_train['stemmed_text'])
tfidf_scores = tfidf_matrix.sum(axis=0).A1  # flatten the 1 x n_terms result to a 1-D array

tfidf_df = pd.DataFrame({'Term': feature_names, 'TF-IDF': tfidf_scores})
tfidf_df.to_csv('tfidf_features.csv', index=False)

print("Fitur TF-IDF telah disimpan dalam 'tfidf_features.csv'")

# Concatenate the stemmed tweets of each predicted sentiment into one document.
netral_text = " ".join(data_test[data_test['label_text'] == 'netral']['stemmed_text'])
positif_text = " ".join(data_test[data_test['label_text'] == 'positif']['stemmed_text'])
negatif_text = " ".join(data_test[data_test['label_text'] == 'negatif']['stemmed_text'])


def generate_tfidf_wordcloud(texts, title, colormap, filename, ngram_range=(1, 1)):
    """Save a word cloud whose word sizes follow summed TF-IDF scores.

    Args:
        texts: List of documents to vectorize.
        title: Title drawn above the word cloud.
        colormap: Matplotlib colormap name passed to WordCloud.
        filename: Output path without extension; ".png" is appended.
        ngram_range: n-gram range for the TfidfVectorizer (defaults to the
            unigram-only setting this cell used before).

    Returns:
        None. The image is written to ``{filename}.png``; nothing is written
        when every document is blank.
    """
    # With no text at all the vectorizer would raise an empty-vocabulary
    # error, so skip the plot instead of aborting the whole cell.
    if not any(text.strip() for text in texts):
        print(f"Tidak ada teks untuk '{title}', word cloud dilewati")
        return

    vectorizer = TfidfVectorizer(max_features=10000, ngram_range=ngram_range)
    tfidf_matrix = vectorizer.fit_transform(texts)
    # Sum each term's score over all documents to get one weight per term.
    tfidf_scores = dict(zip(vectorizer.get_feature_names_out(), tfidf_matrix.sum(axis=0).tolist()[0]))
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        colormap=colormap,
        collocations=False,
    ).generate_from_frequencies(tfidf_scores)

    # Draw on an explicit figure and close it afterwards; plt.clf() left an
    # empty figure open that was then rendered as "0 Axes" cell output.
    fig, ax = plt.subplots()
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title)
    fig.savefig(f"{filename}.png")
    plt.close(fig)


# Save one word cloud per sentiment.
generate_tfidf_wordcloud([netral_text], 'Word Cloud untuk Sentimen Netral', 'viridis', 'wordcloud_netral')
generate_tfidf_wordcloud([positif_text], 'Word Cloud untuk Sentimen Positif', 'plasma', 'wordcloud_positif')
generate_tfidf_wordcloud([negatif_text], 'Word Cloud untuk Sentimen Negatif', 'inferno', 'wordcloud_negatif')

# ---------------------------------------------------------------------------
# Count the predicted sentiments.
# ---------------------------------------------------------------------------
data = pd.read_csv("data_test_with_predictions_higher_accuracy.csv")

sentiment_counts = data['label_text'].value_counts()

# .get(..., 0) keeps the report working when a class was never predicted;
# plain indexing would raise KeyError in that case.
print("Jumlah data dengan sentimen:")
print("Netral:", sentiment_counts.get('netral', 0))
print("Positif:", sentiment_counts.get('positif', 0))
print("Negatif:", sentiment_counts.get('negatif', 0))

# ---------------------------------------------------------------------------
# Stack the hand-labeled rows on top of the model-labeled rows.
# ---------------------------------------------------------------------------
labeled_data = pd.read_csv("labeled_data_updated.csv")
predicted_data = pd.read_csv("data_test_with_predictions_higher_accuracy.csv")

# NOTE(review): after this step manual labels and model predictions are no
# longer distinguishable in the combined file.
combined_data = pd.concat([labeled_data, predicted_data], ignore_index=True)

combined_data.to_csv("combined_data.csv", index=False)

print("Data telah digabungkan dan disimpan ke dalam 'combined_data.csv'")
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "", "text/plain": [ "
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the combined (hand-labeled + predicted) data set.
data = pd.read_csv("combined_data.csv")

# Number of rows per sentiment label.
sentiment_counts = data['label_text'].value_counts()

# Fixed display order; a class that is missing from the data counts as 0
# instead of raising KeyError.
sentiments = ['netral', 'positif', 'negatif']
counts = [int(sentiment_counts.get(label, 0)) for label in sentiments]

# Bar chart. Passing `palette` without `hue` is deprecated in seaborn, so the
# x variable is also used as hue and the redundant legend is switched off.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=sentiments, y=counts, hue=sentiments, palette='viridis', legend=False, ax=ax)
ax.set_title('Distribusi Prediksi Sentimen - Bar Chart')
ax.set_xlabel('Sentimen')
ax.set_ylabel('Jumlah')
plt.show()

# Pie chart of the same counts.
fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(counts, labels=sentiments, autopct='%1.1f%%', startangle=140, colors=sns.color_palette('viridis'))
ax.set_title('Distribusi Prediksi Sentimen - Pie Chart')
plt.show()

# Histogram over the raw label column.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data['label_text'], bins=3, kde=False, color='purple', ax=ax)
ax.set_title('Distribusi Prediksi Sentimen - Histogram')
ax.set_xlabel('Sentimen')
ax.set_ylabel('Jumlah')
plt.show()
import matplotlib.pyplot as plt
from wordcloud import WordCloud


def _join_texts(frame, label):
    """Concatenate the non-missing ``full_text`` values of rows whose label equals ``label``."""
    texts = frame.loc[frame['label_text'] == label, 'full_text']
    # Missing texts are floats (NaN) and would make " ".join raise TypeError.
    return " ".join(texts.dropna().astype(str))


# One long document per sentiment (`data` is the combined frame loaded by an earlier cell).
netral_text = _join_texts(data, 'netral')
positif_text = _join_texts(data, 'positif')
negatif_text = _join_texts(data, 'negatif')


def generate_wordcloud(text, title, colormap, filename):
    """Save a frequency-based word cloud of ``text`` as a PNG file.

    Args:
        text: The document to visualize.
        title: Title drawn above the word cloud.
        colormap: Matplotlib colormap name passed to WordCloud.
        filename: Output path without extension; ".png" is appended.

    Returns:
        None. The image is written to ``{filename}.png``; nothing is written
        when ``text`` is blank.
    """
    # WordCloud cannot build a cloud from blank input; skip the plot rather
    # than aborting the whole cell.
    if not text.strip():
        print(f"Tidak ada teks untuk '{title}', word cloud dilewati")
        return

    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        colormap=colormap,
        collocations=False,
    ).generate(text)

    # Draw on an explicit figure and close it afterwards; plt.clf() left an
    # empty figure open that was then rendered as cell output.
    fig, ax = plt.subplots()
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title)
    fig.savefig(f"{filename}.png")
    plt.close(fig)


# Save one word cloud per sentiment.
# NOTE(review): these file names are the same ones the TF-IDF word-cloud cells
# write to, so whichever cell runs last overwrites the images -- confirm which
# version is meant to be kept.
generate_wordcloud(netral_text, 'Word Cloud untuk Sentimen Netral', 'viridis', 'wordcloud_netral')
generate_wordcloud(positif_text, 'Word Cloud untuk Sentimen Positif', 'plasma', 'wordcloud_positif')
generate_wordcloud(negatif_text, 'Word Cloud untuk Sentimen Negatif', 'inferno', 'wordcloud_negatif')
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
created_atfull_textuser_id_strstemmed_textlabel_text
0Sat Feb 10 23:59:55 +0000 2024tahu partai nya gabener di pilih hadeh1.489140e+18tahu partai nya gabener di pilih hadehnegatif
1Sat Feb 10 23:59:51 +0000 2024bukan tahun 2000an awal orang2 Partai Keadilan...1.358430e+18bukan tahun 2000an awal orang2 partai adil sej...negatif
2Sat Feb 10 23:59:46 +0000 2024imagine invalidating someones fear and calling...1.334680e+18imagine invalidating someones fear and calling...netral
3Sat Feb 10 23:59:34 +0000 2024jangan lupa yah teman-teman 2024 pilih partai ...1.679190e+18jangan lupa yah teman 2024 pilih partai ummat ...positif
4Sat Feb 10 23:59:28 +0000 2024h3 pemilihan umum heran seakan2 jokowi dosanya...1.710507e+08h3 pilih umum heran seakan2 jokowi dosa paling...positif
..................
1812Fri Feb 09 23:11:32 +0000 2024bukan hebat culas licik angkat nol smpai berku...1.538716e+18bukan hebat culas licik angkat nol smpai kuasa...negatif
1813Fri Feb 09 23:11:21 +0000 2024kalau melalui proses pemilihan umum langsung s...1.466980e+18kalau lalu proses pilih umum langsung saya per...negatif
1814Fri Feb 09 23:10:00 +0000 2024kenalan yuk calon legislatif dewan perwakilan ...2.848309e+08kenal yuk calon legislatif dewan wakil rakyat ...positif
1815Fri Feb 09 23:09:17 +0000 2024aku tempelin sticker besar sticker partai1.742568e+18aku tempelin sticker besar sticker partainetral
1816Fri Feb 09 23:08:42 +0000 2024hebat jokowi bener2 serius buat bangsa beliyau...1.492511e+18hebat jokowi bener2 serius buat bangsa beliyau...negatif
\n", "

1817 rows × 5 columns

\n", "
" ], "text/plain": [ " created_at \\\n", "0 Sat Feb 10 23:59:55 +0000 2024 \n", "1 Sat Feb 10 23:59:51 +0000 2024 \n", "2 Sat Feb 10 23:59:46 +0000 2024 \n", "3 Sat Feb 10 23:59:34 +0000 2024 \n", "4 Sat Feb 10 23:59:28 +0000 2024 \n", "... ... \n", "1812 Fri Feb 09 23:11:32 +0000 2024 \n", "1813 Fri Feb 09 23:11:21 +0000 2024 \n", "1814 Fri Feb 09 23:10:00 +0000 2024 \n", "1815 Fri Feb 09 23:09:17 +0000 2024 \n", "1816 Fri Feb 09 23:08:42 +0000 2024 \n", "\n", " full_text user_id_str \\\n", "0 tahu partai nya gabener di pilih hadeh 1.489140e+18 \n", "1 bukan tahun 2000an awal orang2 Partai Keadilan... 1.358430e+18 \n", "2 imagine invalidating someones fear and calling... 1.334680e+18 \n", "3 jangan lupa yah teman-teman 2024 pilih partai ... 1.679190e+18 \n", "4 h3 pemilihan umum heran seakan2 jokowi dosanya... 1.710507e+08 \n", "... ... ... \n", "1812 bukan hebat culas licik angkat nol smpai berku... 1.538716e+18 \n", "1813 kalau melalui proses pemilihan umum langsung s... 1.466980e+18 \n", "1814 kenalan yuk calon legislatif dewan perwakilan ... 2.848309e+08 \n", "1815 aku tempelin sticker besar sticker partai 1.742568e+18 \n", "1816 hebat jokowi bener2 serius buat bangsa beliyau... 1.492511e+18 \n", "\n", " stemmed_text label_text \n", "0 tahu partai nya gabener di pilih hadeh negatif \n", "1 bukan tahun 2000an awal orang2 partai adil sej... negatif \n", "2 imagine invalidating someones fear and calling... netral \n", "3 jangan lupa yah teman 2024 pilih partai ummat ... positif \n", "4 h3 pilih umum heran seakan2 jokowi dosa paling... positif \n", "... ... ... \n", "1812 bukan hebat culas licik angkat nol smpai kuasa... negatif \n", "1813 kalau lalu proses pilih umum langsung saya per... negatif \n", "1814 kenal yuk calon legislatif dewan wakil rakyat ... positif \n", "1815 aku tempelin sticker besar sticker partai netral \n", "1816 hebat jokowi bener2 serius buat bangsa beliyau... 
# Minimum number of same-sentiment tweets from one account for it to be
# treated as a buzzer.
BUZZER_MIN_TWEETS = 4

# Sentiment label -> buzzer category. Labels missing from this mapping
# (e.g. 'netral') never produce a buzzer, however often they repeat.
BUZZER_CATEGORIES = {
    'negatif': 'buzzer negatif',
    'positif': 'buzzer positif',
}


def label_buzzer_types(frame, min_tweets=BUZZER_MIN_TWEETS):
    """Return a copy of ``frame`` with a 'buzzer_type' column added.

    A row is tagged 'buzzer negatif' / 'buzzer positif' when its
    (user_id_str, label_text) pair occurs at least ``min_tweets`` times in
    ``frame`` and the label is 'negatif' / 'positif'. Every other row is
    tagged 'non-buzzer'.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'user_id_str' and 'label_text'.
    min_tweets : int, default 4
        Occurrence threshold for a (user, label) pair.

    Returns
    -------
    pandas.DataFrame
        New frame; the input is not modified.
    """
    # One grouped pass instead of re-filtering the whole frame for every row.
    # Rows with a missing user id or label get NaN here, which fails the
    # threshold comparison below and therefore ends up as 'non-buzzer'.
    pair_counts = (
        frame.groupby(['user_id_str', 'label_text'])['label_text']
        .transform('size')
    )
    is_frequent = pair_counts >= min_tweets

    # Start from an explicit 'non-buzzer' default so that a frequent label
    # without a buzzer category (e.g. 'netral') can never pick up the value
    # computed for some other row.
    buzzer_type = (
        frame['label_text']
        .map(BUZZER_CATEGORIES)
        .where(is_frequent)
        .fillna('non-buzzer')
    )
    return frame.assign(buzzer_type=buzzer_type)


# `__name__` is "__main__" inside a notebook kernel, so this always runs in
# the notebook; the guard only keeps the helper importable without file I/O.
# pandas (`pd`) comes from the notebook's import cell.
if __name__ == "__main__":
    # Read the account ids as text: parsed as float64 they cannot be
    # represented exactly (they have ~19 digits), so distinct accounts
    # could be merged when grouping.
    df = label_buzzer_types(
        pd.read_csv("combined_data.csv", dtype={'user_id_str': str})
    )

    # Persist the labelled data for the visualisation cells.
    df.to_csv('combined_data_with_buzzer_type.csv', index=False)

    # Kept for backward compatibility with code that expects this name.
    labeled_df = df

    # Count each category straight from the in-memory frame.
    buzzer_count = df['buzzer_type'].value_counts()

    print("Jumlah Buzzer Positif:", buzzer_count.get('buzzer positif', 0))
    print("Jumlah Buzzer Negatif:", buzzer_count.get('buzzer negatif', 0))
    print("Jumlah Non Buzzer:", buzzer_count.get('non-buzzer', 0))
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
created_atfull_textuser_id_strstemmed_textlabel_textbuzzer_type
0Sat Feb 10 23:59:55 +0000 2024tahu partai nya gabener di pilih hadeh1.489140e+18tahu partai nya gabener di pilih hadehnegatifnon-buzzer
1Sat Feb 10 23:59:51 +0000 2024bukan tahun 2000an awal orang2 Partai Keadilan...1.358430e+18bukan tahun 2000an awal orang2 partai adil sej...negatifnon-buzzer
2Sat Feb 10 23:59:46 +0000 2024imagine invalidating someones fear and calling...1.334680e+18imagine invalidating someones fear and calling...netralnon-buzzer
3Sat Feb 10 23:59:34 +0000 2024jangan lupa yah teman-teman 2024 pilih partai ...1.679190e+18jangan lupa yah teman 2024 pilih partai ummat ...positifnon-buzzer
4Sat Feb 10 23:59:28 +0000 2024h3 pemilihan umum heran seakan2 jokowi dosanya...1.710507e+08h3 pilih umum heran seakan2 jokowi dosa paling...positifnon-buzzer
.....................
1812Fri Feb 09 23:11:32 +0000 2024bukan hebat culas licik angkat nol smpai berku...1.538716e+18bukan hebat culas licik angkat nol smpai kuasa...negatifnon-buzzer
1813Fri Feb 09 23:11:21 +0000 2024kalau melalui proses pemilihan umum langsung s...1.466980e+18kalau lalu proses pilih umum langsung saya per...negatifnon-buzzer
1814Fri Feb 09 23:10:00 +0000 2024kenalan yuk calon legislatif dewan perwakilan ...2.848309e+08kenal yuk calon legislatif dewan wakil rakyat ...positifbuzzer positif
1815Fri Feb 09 23:09:17 +0000 2024aku tempelin sticker besar sticker partai1.742568e+18aku tempelin sticker besar sticker partainetralnon-buzzer
1816Fri Feb 09 23:08:42 +0000 2024hebat jokowi bener2 serius buat bangsa beliyau...1.492511e+18hebat jokowi bener2 serius buat bangsa beliyau...negatifnon-buzzer
\n", "

1817 rows × 6 columns

\n", "
" ], "text/plain": [ " created_at \\\n", "0 Sat Feb 10 23:59:55 +0000 2024 \n", "1 Sat Feb 10 23:59:51 +0000 2024 \n", "2 Sat Feb 10 23:59:46 +0000 2024 \n", "3 Sat Feb 10 23:59:34 +0000 2024 \n", "4 Sat Feb 10 23:59:28 +0000 2024 \n", "... ... \n", "1812 Fri Feb 09 23:11:32 +0000 2024 \n", "1813 Fri Feb 09 23:11:21 +0000 2024 \n", "1814 Fri Feb 09 23:10:00 +0000 2024 \n", "1815 Fri Feb 09 23:09:17 +0000 2024 \n", "1816 Fri Feb 09 23:08:42 +0000 2024 \n", "\n", " full_text user_id_str \\\n", "0 tahu partai nya gabener di pilih hadeh 1.489140e+18 \n", "1 bukan tahun 2000an awal orang2 Partai Keadilan... 1.358430e+18 \n", "2 imagine invalidating someones fear and calling... 1.334680e+18 \n", "3 jangan lupa yah teman-teman 2024 pilih partai ... 1.679190e+18 \n", "4 h3 pemilihan umum heran seakan2 jokowi dosanya... 1.710507e+08 \n", "... ... ... \n", "1812 bukan hebat culas licik angkat nol smpai berku... 1.538716e+18 \n", "1813 kalau melalui proses pemilihan umum langsung s... 1.466980e+18 \n", "1814 kenalan yuk calon legislatif dewan perwakilan ... 2.848309e+08 \n", "1815 aku tempelin sticker besar sticker partai 1.742568e+18 \n", "1816 hebat jokowi bener2 serius buat bangsa beliyau... 1.492511e+18 \n", "\n", " stemmed_text label_text \\\n", "0 tahu partai nya gabener di pilih hadeh negatif \n", "1 bukan tahun 2000an awal orang2 partai adil sej... negatif \n", "2 imagine invalidating someones fear and calling... netral \n", "3 jangan lupa yah teman 2024 pilih partai ummat ... positif \n", "4 h3 pilih umum heran seakan2 jokowi dosa paling... positif \n", "... ... ... \n", "1812 bukan hebat culas licik angkat nol smpai kuasa... negatif \n", "1813 kalau lalu proses pilih umum langsung saya per... negatif \n", "1814 kenal yuk calon legislatif dewan wakil rakyat ... positif \n", "1815 aku tempelin sticker besar sticker partai netral \n", "1816 hebat jokowi bener2 serius buat bangsa beliyau... 
def plot_count_overview(labels, counts, colors, axis_label, count_title, share_title):
    """Show one set of category counts as a bar, a pie and a line chart.

    The three charts are drawn side by side in a single figure, which is
    displayed with ``plt.show()``. Nothing is returned, so ending a cell
    with this call does not echo an extra figure repr.

    Parameters
    ----------
    labels : list of str
        Category names, in plotting order.
    counts : list of int
        Count per category, aligned with ``labels``.
    colors : list of str
        Colours used for the bars and the pie wedges.
    axis_label : str
        X-axis label of the bar and line charts.
    count_title : str
        Title of the bar and line charts.
    share_title : str
        Title of the pie chart.
    """
    fig, (bar_ax, pie_ax, line_ax) = plt.subplots(1, 3, figsize=(18, 6))

    # Bar chart
    bar_ax.bar(labels, counts, color=colors)
    bar_ax.set_xlabel(axis_label)
    bar_ax.set_ylabel('Jumlah')
    bar_ax.set_title(count_title)
    bar_ax.tick_params(axis='x', rotation=45)

    # Pie chart
    pie_ax.pie(counts, labels=labels, autopct='%1.1f%%', colors=colors)
    pie_ax.set_title(share_title)

    # Line chart
    line_ax.plot(labels, counts, marker='o', linestyle='-', color='#85c1e9')
    line_ax.set_xlabel(axis_label)
    line_ax.set_ylabel('Jumlah')
    line_ax.set_title(count_title)
    line_ax.grid(True)

    fig.tight_layout()
    plt.show()


# `pd` and `plt` come from the notebook's import cell.
# Load the labelled data once; both figures below are built from it.
labeled_df = pd.read_csv("combined_data_with_buzzer_type.csv")

# --- Figure 1: buzzer categories -------------------------------------------
buzzer_count = labeled_df['buzzer_type'].value_counts()
plot_count_overview(
    labels=['Buzzer Positif', 'Buzzer Negatif', 'Non-Buzzer'],
    # Fixed order, with 0 for a category that does not occur in the data.
    counts=[
        buzzer_count.get(category, 0)
        for category in ('buzzer positif', 'buzzer negatif', 'non-buzzer')
    ],
    colors=['#a3e4d7', '#f5b7b1', '#d6eaf8'],
    axis_label='Jenis Buzzer',
    count_title='Jumlah Buzzer Positif, Negatif, dan Non-Buzzer',
    share_title='Proporsi Buzzer Positif, Negatif, dan Non-Buzzer',
)

# --- Figure 2: sentiment labels --------------------------------------------
label_count = labeled_df['label_text'].value_counts()
plot_count_overview(
    labels=label_count.index.tolist(),
    counts=label_count.values.tolist(),
    colors=['#a3e4d7', '#f5b7b1', '#d6eaf8', '#f7d6e0'],
    axis_label='Label Text',
    count_title='Jumlah Tiap Label Text',
    share_title='Proporsi Tiap Label Text',
)