{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Collection"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#@title Twitter Auth Token\n",
"\n",
"# Do not commit a real credential; fill this in locally before running.\n",
"twitter_auth_token = 'YOUR_TWITTER_AUTH_TOKEN'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Install Node.js (shell commands, so use the ! prefix in Jupyter)\n",
"!sudo apt-get update\n",
"!sudo apt-get install -y ca-certificates curl gnupg\n",
"!sudo mkdir -p /etc/apt/keyrings\n",
"!curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg\n",
"\n",
"!NODE_MAJOR=20 && echo \"deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_$NODE_MAJOR.x nodistro main\" | sudo tee /etc/apt/sources.list.d/nodesource.list\n",
"\n",
"!sudo apt-get update\n",
"!sudo apt-get install nodejs -y\n",
"\n",
"!node -v"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Crawl data\n",
"import os\n",
"\n",
"# Search queries to crawl\n",
"keywords = [\n",
"    'gaji gen z lang:id until:2024-12-31 since:2020-01-01',\n",
"    'kesehatan mental generasi z lang:id until:2024-12-31 since:2020-01-01',\n",
"    'finansial gen z lang:id until:2024-12-31 since:2020-01-01'\n",
"]\n",
"\n",
"# Output filename for each keyword\n",
"filenames = [\n",
"    'dataset-gaji-gen-z.csv',\n",
"    'dataset_kesehatan_mental_generasi_z.csv',\n",
"    'dataset_finansial_gen_z.csv'\n",
"]\n",
"\n",
"# Maximum number of tweets to collect per keyword\n",
"limit = 50000\n",
"\n",
"# Run the crawler for each keyword\n",
"for keyword, filename in zip(keywords, filenames):\n",
"    os.system(f'npx -y tweet-harvest@2.6.1 -o \"{filename}\" -s \"{keyword}\" --tab \"LATEST\" -l {limit} --token {twitter_auth_token}')"
]
},
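{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check after crawling: load each CSV and report its row count. This is a minimal sketch that assumes tweet-harvest writes its output into a `tweets-data/` folder, which may differ between versions; adjust `out_dir` to wherever the files actually land."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: confirm each crawl produced a readable, non-empty CSV.\n",
"# Assumption: tweet-harvest writes into 'tweets-data/'; adjust if needed.\n",
"import os\n",
"import pandas as pd\n",
"\n",
"out_dir = 'tweets-data'\n",
"for filename in filenames:\n",
"    path = os.path.join(out_dir, filename)\n",
"    if os.path.exists(path):\n",
"        print(f'{filename}: {len(pd.read_csv(path))} rows')\n",
"    else:\n",
"        print(f'{filename}: not found, crawl may have failed')"
]
},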
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Pre-processing"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Read the datasets\n",
"df1 = pd.read_csv('REAL-DATA/REAL-dataset_finansial_gen_z.csv')\n",
"df2 = pd.read_csv('REAL-DATA/REAL-dataset_gaji_gen_z.csv')\n",
"df3 = pd.read_csv('REAL-DATA/REAL-dataset_kesehatan_mental_generasi_z.csv')\n",
"df4 = pd.read_csv('data matang/dataset_finansial_gen_z.csv')\n",
"df5 = pd.read_csv('data matang/dataset_gaji_gen_z.csv')\n",
"df6 = pd.read_csv('data matang/dataset_kesehatan_mental_generasi_z-TGL.csv')\n",
"\n",
"# Add a keyword column\n",
"df1['keyword'] = 'finansial gen z'\n",
"df2['keyword'] = 'gaji gen z'\n",
"df3['keyword'] = 'kesehatan mental generasi z'\n",
"df4['keyword'] = 'finansial gen z'\n",
"df5['keyword'] = 'gaji gen z'\n",
"df6['keyword'] = 'kesehatan mental generasi z'\n",
"\n",
"# Combine the datasets per keyword\n",
"df_finansial = pd.concat([df1, df4], ignore_index=True)\n",
"df_gaji = pd.concat([df2, df5], ignore_index=True)\n",
"df_kesehatan_mental = pd.concat([df3, df6], ignore_index=True)\n",
"\n",
"# Merge all dataframes into one\n",
"merge_df = pd.concat([df_finansial, df_gaji, df_kesehatan_mental], ignore_index=True)\n",
"\n",
"# Save to CSV\n",
"merge_df.to_csv('data-analisis/datasets-kotor.csv', index=False)"
]
},
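{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before cleaning, it helps to see how the merged rows are distributed across the three keywords; a minimal check using `merge_df` from the cell above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Row distribution per keyword in the merged dataset\n",
"print(merge_df['keyword'].value_counts())\n",
"print(f'Total rows: {len(merge_df)}')"
]
},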
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"df = pd.read_csv('data-analisis/datasets-kotor.csv')\n",
"df.count()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Drop columns that are not needed\n",
"df.drop(df.columns[[0, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]], axis=1, inplace=True)\n",
"\n",
"df.to_csv('data-analisis/datasets.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"df.count()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"# Drop rows with duplicate text\n",
"df.drop_duplicates(subset=['full_text'], inplace=True)\n",
"\n",
"# Drop rows with missing text\n",
"df.dropna(subset=['full_text'], inplace=True)\n",
"\n",
"df.to_csv('data-analisis/datasets.csv', index=False)\n",
"df.count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## data cleansing"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"df = pd.read_csv('data-analisis/datasets.csv')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"# Data cleansing\n",
"import re\n",
"import string\n",
"\n",
"def clean_text(text):\n",
"    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # delete mentions\n",
"    text = re.sub(r'#', '', text)  # delete hashtag symbols\n",
"    text = re.sub(r'RT[\\s]+', '', text)  # delete RT\n",
"    text = re.sub(r'https?:\\/\\/\\S+', '', text)  # delete hyperlinks\n",
"    text = re.sub(r'\\n', '', text)  # delete newlines\n",
"    text = re.sub(r'\\d+', '', text)  # delete numbers\n",
"    text = re.sub(r'[^A-Za-z ]+', '', text)  # delete non-alphabetic characters\n",
"\n",
"    text = text.replace('…', '')  # delete ellipsis characters\n",
"    text = text.translate(str.maketrans('', '', string.punctuation))  # delete punctuation\n",
"    text = text.strip()  # trim surrounding whitespace\n",
"    return text\n",
"\n",
"df['cleaning_text'] = df['full_text'].apply(clean_text)\n",
"df.head()"
]
},
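{
"cell_type": "markdown",
"metadata": {},
"source": [
"A before/after example of `clean_text` on a made-up tweet (illustrative only, not a row from the dataset):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative before/after run of clean_text on a made-up tweet\n",
"sample = 'RT @user123: gaji gen z cuma 3jt?? cek https://t.co/xyz #genz'\n",
"print('before:', sample)\n",
"print('after :', clean_text(sample))"
]
},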
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## case folding"
]
},
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": []
|
|
}
|
|
],
|
|
"source": [
|
|
"# case folding\n",
|
|
"def case_folding(text):\n",
|
|
" text = text.lower() # change to lower case\n",
|
|
" return text\n",
|
|
"\n",
|
|
"df['case_folding'] = df['cleanning_text'].apply(case_folding)\n",
|
|
"df"
|
|
]
|
|
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## convert slang words"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"# Convert slang words using the colloquial Indonesian lexicon\n",
"slang_words = pd.read_csv('https://raw.githubusercontent.com/nasalsabila/kamus-alay/refs/heads/master/colloquial-indonesian-lexicon.csv')\n",
"slang_words_dict = dict(zip(slang_words['slang'], slang_words['formal']))\n",
"\n",
"# Replace each slang word with its formal form\n",
"def convert_slang_word(text):\n",
"    return ' '.join([slang_words_dict.get(word, word) for word in text.split()])\n",
"\n",
"# Apply the function to the 'case_folding' column\n",
"df['convert_slang_word'] = df['case_folding'].apply(convert_slang_word)\n",
"df"
]
},
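{
"cell_type": "markdown",
"metadata": {},
"source": [
"An illustrative check of the slang dictionary on a made-up sentence; the actual replacements depend on what the colloquial lexicon contains:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Word-by-word view of the slang lookup (made-up example sentence)\n",
"sample = 'gimana gaji lo udah naik belum'\n",
"for word in sample.split():\n",
"    print(word, '->', slang_words_dict.get(word, word))\n",
"print(convert_slang_word(sample))"
]
},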
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## stop word removal"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"# Stop word removal (an earlier NLTK-based variant is kept for reference)\n",
"# from nltk.corpus import stopwords\n",
"\n",
"# def filtering(text):\n",
"#     stop_words = set(stopwords.words('indonesian'))\n",
"#     word_tokens = text.split()\n",
"#     text = [word for word in word_tokens if word not in stop_words]\n",
"#     text = ' '.join(text)\n",
"#     return text\n",
"\n",
"# df['filtering'] = df['convert_slang_word'].apply(filtering)\n",
"# df\n",
"\n",
"# Stop word removal with Sastrawi\n",
"from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory\n",
"\n",
"# Build the stop word set once instead of on every call\n",
"factory = StopWordRemoverFactory()\n",
"stop_words = set(factory.get_stop_words())\n",
"\n",
"def filtering(text):\n",
"    word_tokens = text.split()\n",
"    text = [word for word in word_tokens if word not in stop_words]\n",
"    text = ' '.join(text)\n",
"    return text\n",
"\n",
"df['filtering'] = df['convert_slang_word'].apply(filtering)\n",
"df"
]
},
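{
"cell_type": "markdown",
"metadata": {},
"source": [
"To see what the filter actually removes, a small peek at the Sastrawi stop word list and its effect on a made-up sentence:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sample of the Sastrawi stop word list and its effect on one sentence\n",
"print(sorted(stop_words)[:10])\n",
"print(filtering('saya sangat suka dengan pekerjaan ini'))"
]
},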
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## tokenizing"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"# nltk.download('punkt')  # uncomment on the first run to fetch the tokenizer data"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"# Tokenizing\n",
"from nltk.tokenize import word_tokenize\n",
"\n",
"def tokenizing(text):\n",
"    text = word_tokenize(text)\n",
"    return text\n",
"\n",
"df['tokenizing'] = df['filtering'].apply(tokenizing)\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## stemming"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"# Stemming\n",
"from Sastrawi.Stemmer.StemmerFactory import StemmerFactory\n",
"\n",
"# Create the stemmer\n",
"factory = StemmerFactory()\n",
"stemmer = factory.create_stemmer()\n",
"\n",
"def stem_text(tokens):\n",
"    text = ' '.join(tokens)  # join the tokens back into a single string\n",
"    return stemmer.stem(text)\n",
"\n",
"df['stemming'] = df['tokenizing'].apply(stem_text)\n",
"df"
]
},
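{
"cell_type": "markdown",
"metadata": {},
"source": [
"An illustrative run of the Sastrawi stemmer on a few made-up tokens; the exact output depends on Sastrawi's root-word dictionary:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative stemming of made-up tokens (output depends on Sastrawi's dictionary)\n",
"print(stem_text(['kesehatan', 'bekerja', 'gajinya']))"
]
},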
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Save to CSV\n",
"df.to_csv('data-analisis/datasets.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the final text column: drop the intermediate processing columns\n",
"df.drop(df.columns[[1, 2, 3, 4, 5, 6]], axis=1, inplace=True)\n",
"\n",
"# Rename column 'stemming' to 'full_text'\n",
"df.rename(columns={'stemming': 'full_text'}, inplace=True)\n",
"\n",
"df.to_csv('data-analisis/datasets-clean.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Labeling"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"import pandas as pd\n",
"df = pd.read_csv('data-analisis/datasets-clean.csv')\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Download the InSet positive and negative lexicons\n",
"positive_url = \"https://raw.githubusercontent.com/fajri91/InSet/master/positive.tsv\"\n",
"negative_url = \"https://raw.githubusercontent.com/fajri91/InSet/master/negative.tsv\"\n",
"\n",
"positive_lexicon = set(pd.read_csv(positive_url, sep='\\t', header=None)[0])\n",
"negative_lexicon = set(pd.read_csv(negative_url, sep='\\t', header=None)[0])\n",
"\n",
"# Compute a sentiment score by counting lexicon hits\n",
"def determine_sentiment(text):\n",
"    if isinstance(text, str):\n",
"        positive_count = sum(1 for word in text.split() if word in positive_lexicon)\n",
"        negative_count = sum(1 for word in text.split() if word in negative_lexicon)\n",
"        sentiment_score = positive_count - negative_count\n",
"        if sentiment_score > 0:\n",
"            sentiment = 'Positif'\n",
"        elif sentiment_score < 0:\n",
"            sentiment = 'Negatif'\n",
"        else:\n",
"            sentiment = 'Netral'\n",
"        return sentiment_score, sentiment\n",
"    return 0, 'Netral'\n",
"\n",
"# Apply the scoring to the dataset\n",
"df[['score', 'label']] = df['full_text'].apply(lambda x: pd.Series(determine_sentiment(x)))\n",
"\n",
"df.to_csv('data-analisis/datasets-label.csv', index=False)\n"
]
},
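{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick illustration of `determine_sentiment` on made-up sentences; the scores depend entirely on which words appear in the InSet lexicon, so treat this as a smoke test rather than expected output:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Smoke test of the lexicon scoring on made-up sentences\n",
"for text in ['kerja bagus sekali', 'gaji kecil bikin stres', 'hari ini hari senin']:\n",
"    print(text, '->', determine_sentiment(text))"
]
},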
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"# Count the number of rows per label\n",
"df['label'].value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Feature Extraction"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"import pandas as pd\n",
"# Read the labeled dataset\n",
"df = pd.read_csv('data-analisis/datasets-label.csv')\n",
"df['label'].value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"YouTube tutorial version"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"import numpy as np\n",
"\n",
"# Use TfidfVectorizer to compute TF-IDF\n",
"tfidf_vectorizer = TfidfVectorizer()\n",
"tfidf_matrix = tfidf_vectorizer.fit_transform(df['full_text'])\n",
"\n",
"# Compute IDF manually (note: sklearn's own idf_ uses a slightly different smoothing)\n",
"term = tfidf_vectorizer.get_feature_names_out()\n",
"idf = np.log(tfidf_matrix.shape[0] / (np.count_nonzero(tfidf_matrix.toarray(), axis=0) + 1))\n",
"\n",
"# Build a data frame holding each term and its IDF\n",
"tfidf_df = pd.DataFrame({'term': term, 'idf': idf})\n",
"\n",
"# Add one column per document with that document's TF-IDF weights\n",
"for i, doc in enumerate(df['full_text']):\n",
"    tf = tfidf_matrix[i].toarray().flatten()\n",
"    tfidf_df[f'tf_{i}'] = tf\n",
"\n",
"tfidf_df.to_csv('REAL-DATA/datasets-tfidfshow.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"research version"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"import pandas as pd\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"df = pd.read_csv('data-analisis/datasets-label.csv')\n",
"# Convert text to vectors using TF-IDF\n",
"tfidf_vectorizer = TfidfVectorizer()\n",
"x = tfidf_vectorizer.fit_transform(df['full_text'])\n",
"\n",
"tfidf = x.toarray()\n",
"print(tfidf[:1])\n",
"\n",
"# Convert the array to a DataFrame\n",
"tfidf_df = pd.DataFrame(tfidf)\n",
"\n",
"# Save the DataFrame to a CSV file\n",
"tfidf_df.to_csv('data-analisis/datasets-tfidf.csv', index=False)\n",
"\n",
"# Save to pickle\n",
"# tfidf_df.to_pickle('REAL-DATA/datasets-tfidf.pkl')"
]
},
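{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make the TF-IDF matrix easier to read, here is a minimal worked example on a three-document toy corpus, with the learned vocabulary as column names (illustrative only):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal TF-IDF example on a toy corpus (illustrative only)\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"import pandas as pd\n",
"\n",
"toy_corpus = ['gaji gen z kecil', 'gaji naik', 'gen z stres']\n",
"vec = TfidfVectorizer()\n",
"mat = vec.fit_transform(toy_corpus)\n",
"print(pd.DataFrame(mat.toarray(), columns=vec.get_feature_names_out()).round(3))"
]
},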
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Balancing"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"df.isna().sum()  # check for null values"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"import pandas as pd\n",
"\n",
"tfidf_data = pd.read_csv(\"data-analisis/datasets-tfidf.csv\")\n",
"labels_data = pd.read_csv(\"data-analisis/datasets-label.csv\")\n",
"\n",
"# # Check for null values in both datasets\n",
"# print(\"Null values in tfidf_data:\")\n",
"# print(tfidf_data.isna().sum())\n",
"# print(\"\\nNull values in labels_data:\")\n",
"# print(labels_data.isna().sum())\n",
"\n",
"# Drop rows with null values in labels_data\n",
"labels_data = labels_data.dropna(subset=['label'])\n",
"\n",
"# Merge the TF-IDF features with the labels (aligned on the row index)\n",
"data = pd.concat([tfidf_data, labels_data['label']], axis=1)\n",
"data.to_csv('data-analisis/datasets-balance.csv', index=False)\n",
"print(data.head())"
]
},
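{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since the TF-IDF matrix and the label file are produced separately and `pd.concat` aligns on the row index, it is worth confirming the two still line up; a minimal check:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Verify row alignment between features and labels\n",
"print(len(tfidf_data), len(labels_data))\n",
"print('NaN labels after merge:', data['label'].isna().sum())"
]
},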
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"df = pd.read_csv('data-analisis/datasets-balance.csv')\n",
"df['label'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"# Import the required libraries\n",
"import pandas as pd\n",
"from imblearn.over_sampling import SMOTE\n",
"\n",
"# Read the merged dataset\n",
"df_combined = pd.read_csv('data-analisis/datasets-balance.csv')\n",
"\n",
"# Separate features and label\n",
"X = df_combined.drop(columns=['label'])  # drop the label column\n",
"y = df_combined['label']  # take the label column\n",
"\n",
"# Use SMOTE to balance the classes\n",
"smote = SMOTE(random_state=42)\n",
"X_resampled, y_resampled = smote.fit_resample(X, y)\n",
"\n",
"# Show the class counts before and after balancing\n",
"print(\"Class counts before SMOTE:\")\n",
"print(y.value_counts())\n",
"print(\"\\nClass counts after SMOTE:\")\n",
"print(y_resampled.value_counts())\n",
"\n",
"# Save the balanced dataset to CSV\n",
"balanced_df = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.DataFrame(y_resampled, columns=['label'])], axis=1)\n",
"balanced_df.to_csv('data-analisis/datasets-balanced.csv', index=False)"
]
},
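{
"cell_type": "markdown",
"metadata": {},
"source": [
"One caveat about the cell above: oversampling before the train/test split lets synthetic points leak into the test set. A safer order, sketched here under the assumption that modeling uses a standard hold-out split, is to split first and apply SMOTE to the training portion only:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: split first, then oversample only the training set\n",
"# (avoids synthetic samples leaking into the test set)\n",
"from sklearn.model_selection import train_test_split\n",
"from imblearn.over_sampling import SMOTE\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, test_size=0.2, random_state=42, stratify=y)\n",
"X_train_res, y_train_res = SMOTE(random_state=42).fit_resample(X_train, y_train)\n",
"print(y_train_res.value_counts())"
]
},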
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# The modeling stage is done in a separate notebook"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Export Data for the Dashboard"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from nltk.tokenize import word_tokenize\n",
"from collections import Counter\n",
"\n",
"df = pd.read_csv('REAL-DATA/datasets-clean.csv')\n",
"\n",
"def create_word_count_table(df, text_column):\n",
"    # Tokenize the text data\n",
"    df['tokens'] = df[text_column].apply(word_tokenize)\n",
"\n",
"    # Flatten the list of tokens and count the occurrences of each word\n",
"    all_tokens = [token for sublist in df['tokens'] for token in sublist]\n",
"    word_counts = Counter(all_tokens)\n",
"\n",
"    # Convert the word counts to a DataFrame\n",
"    word_count_df = pd.DataFrame(word_counts.items(), columns=['word', 'count'])\n",
"\n",
"    return word_count_df\n",
"\n",
"word_tokenize_df = create_word_count_table(df, 'full_text')\n",
"word_tokenize_df.to_csv('word_count_result.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('word_count_result.csv')\n",
"\n",
"df.isnull().sum()\n",
"\n",
"# Show the rows that contain NaN or empty values\n",
"df[df.isnull().any(axis=1)]\n",
"\n",
"df.dropna(inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('word_count_result.csv')\n",
"\n",
"positive_lexicon = pd.read_csv('InSet/positive.tsv', sep='\\t', header=None)\n",
"negative_lexicon = pd.read_csv('InSet/negative.tsv', sep='\\t', header=None)\n",
"\n",
"# Name the columns of the positive and negative lexicons\n",
"positive_lexicon.columns = ['kata', 'polaritas']\n",
"negative_lexicon.columns = ['kata', 'polaritas']\n",
"\n",
"# Make sure the polarity column is numeric\n",
"positive_lexicon['polaritas'] = pd.to_numeric(positive_lexicon['polaritas'], errors='coerce')\n",
"negative_lexicon['polaritas'] = pd.to_numeric(negative_lexicon['polaritas'], errors='coerce')\n",
"\n",
"# Combine the positive and negative lexicons\n",
"lexicon = pd.concat([positive_lexicon, negative_lexicon])\n",
"\n",
"# Convert the lexicon to a dictionary for fast lookup\n",
"lexicon_dict = dict(zip(lexicon['kata'], lexicon['polaritas']))\n",
"\n",
"# Score a text against the lexicon dictionary\n",
"def label(tweet, lexicon_dict):\n",
"    words = tweet.split()  # split the tweet into words\n",
"    sentiment_score = 0  # initialise the sentiment score\n",
"\n",
"    # Sum the polarity of every word found in the lexicon\n",
"    for word in words:\n",
"        sentiment = lexicon_dict.get(word, 0)  # default to 0 when the word is missing\n",
"        sentiment_score += sentiment\n",
"\n",
"    # Assign a label based on the total polarity score\n",
"    if sentiment_score > 0:\n",
"        return 'positif', sentiment_score\n",
"    elif sentiment_score < 0:\n",
"        return 'negatif', sentiment_score\n",
"    else:\n",
"        return 'netral', sentiment_score\n",
"\n",
"# Handle NaN values in the 'word' column\n",
"df['word'] = df['word'].fillna('')\n",
"\n",
"df[['label', 'score']] = df['word'].apply(lambda x: pd.Series(label(x, lexicon_dict)))\n",
"df.to_csv('word_count_labeled.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv('word_count_labeled.csv')\n",
"\n",
"df['label'].value_counts()\n",
"\n",
"# For each label, show the word with the highest count\n",
"df.groupby('label').apply(lambda x: x.loc[x['count'].idxmax()])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# import pandas as pd\n",
"# from wordcloud import WordCloud, get_single_color_func\n",
"# import matplotlib.pyplot as plt\n",
"\n",
"# df = pd.read_csv('word_count_labeled.csv')\n",
"\n",
"# # Show the label counts\n",
"# # print(df['label'].value_counts())\n",
"\n",
"# # Build and show a word cloud coloured by label\n",
"# def plot_word_cloud(label, color):\n",
"#     words = df[df['label'] == label].set_index('word')['count'].to_dict()\n",
"#     wordcloud = WordCloud(width=800, height=400, background_color='white', color_func=get_single_color_func(color)).generate_from_frequencies(words)\n",
"\n",
"#     plt.figure(figsize=(10, 5))\n",
"#     plt.imshow(wordcloud, interpolation='bilinear')\n",
"#     plt.title(f'Word Cloud for {label} words')\n",
"#     plt.axis('off')\n",
"#     plt.show()\n",
"\n",
"# # Show a word cloud for each label with a matching colour\n",
"# label_colors = {\n",
"#     'positif': 'green',\n",
"#     'negatif': 'red',\n",
"#     'netral': 'gray'\n",
"# }\n",
"\n",
"# for label in df['label'].unique():\n",
"#     plot_word_cloud(label, label_colors[label])"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"import pandas as pd\n",
"\n",
"eval_svm = pd.read_csv('HASIL-RISET/evaluation_results_SVM-new.csv')\n",
"eval_nb = pd.read_csv('HASIL-RISET/evaluation_results_nb-new.csv')\n",
"eval_knn = pd.read_csv('HASIL-RISET/evaluation_results_knn-new.csv')\n",
"\n",
"# Combine the evaluation results\n",
"# Add a 'model' column to each DataFrame\n",
"eval_svm['model'] = 'svm'\n",
"eval_nb['model'] = 'nb'\n",
"eval_knn['model'] = 'knn'\n",
"\n",
"# Reorder the columns so that 'model' comes first\n",
"svm = eval_svm[['model'] + [col for col in eval_svm.columns if col != 'model']]\n",
"nb = eval_nb[['model'] + [col for col in eval_nb.columns if col != 'model']]\n",
"knn = eval_knn[['model'] + [col for col in eval_knn.columns if col != 'model']]\n",
"\n",
"# Concatenate all DataFrames\n",
"combined_df = pd.concat([svm, nb, knn], axis=0, ignore_index=True)\n",
"\n",
"# Show the result\n",
"print(combined_df.head())\n",
"\n",
"combined_df.to_csv('HASIL-RISET/evaluation_results_combine.csv', index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}