{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Pengumpulan Data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#@title Twitter Auth Token\n", "\n", "twitter_auth_token = 'ccb0a6dc47226f7ddc27a7a0975107094c8899e3'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Install Node.js\n", "%sudo apt-get update\n", "%sudo apt-get install -y ca-certificates curl gnupg\n", "%sudo mkdir -p /etc/apt/keyrings\n", "%curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg\n", "\n", "%NODE_MAJOR=20 && echo \"deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_$NODE_MAJOR.x nodistro main\" | sudo tee /etc/apt/sources.list.d/nodesource.list\n", "\n", "%sudo apt-get update\n", "%sudo apt-get install nodejs -y\n", "\n", "%node -v" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Crawl Data\n", "import os\n", "\n", "# List keyword untuk digunakan\n", "keywords = [\n", " 'gaji gen z lang:id until:2024-12-31 since:2020-01-01',\n", " 'kesehatan mental generasi z lang:id until:2024-12-31 since:2020-01-01'\n", " 'finansial gen z lang:id until:2024-12-31 since:2020-01-01'\n", "]\n", "\n", "# Filename untuk setiap keyword\n", "filenames = [\n", " 'dataset-gaji-gen-z.csv',\n", " 'dataset_kesehatan_mental_generasi_z.csv',\n", " 'dataset_finansial_gen_z.csv'\n", "]\n", "\n", "# Limit data yang ingin dikumpulkan\n", "limit = 50000\n", "\n", "# Lakukan crawling untuk setiap keyword\n", "for keyword, filename in zip(keywords, filenames):\n", " os.system(f'npx -y tweet-harvest@2.6.1 -o \"{filename}\" -s \"{keyword}\" --tab \"LATEST\" -l {limit} --token {twitter_auth_token}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Pra-pemrosesan data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "# Membaca dataset\n", "df1 = pd.read_csv('REAL-DATA/REAL-dataset_finansial_gen_z.csv')\n", "df2 = pd.read_csv('REAL-DATA/REAL-dataset_gaji_gen_z.csv')\n", "df3 = pd.read_csv('REAL-DATA/REAL-dataset_kesehatan_mental_generasi_z.csv')\n", "df4 = pd.read_csv('data matang/dataset_finansial_gen_z.csv')\n", "df5 = pd.read_csv('data matang/dataset_gaji_gen_z.csv')\n", "df6 = pd.read_csv('data matang/dataset_kesehatan_mental_generasi_z-TGL.csv')\n", "\n", "# Menambahkan kolom keyword\n", "df1['keyword'] = 'finansial gen z'\n", "df2['keyword'] = 'gaji gen z'\n", "df3['keyword'] = 'kesehatan mental generasi z'\n", "df4['keyword'] = 'finansial gen z'\n", "df5['keyword'] = 'gaji gen z'\n", "df6['keyword'] = 'kesehatan mental generasi z'\n", "\n", "# Menggabungkan data set berdasarkan keyword\n", "df_finansial = pd.concat([df1, df4], ignore_index=True)\n", "df_gaji = pd.concat([df2, df5], ignore_index=True)\n", "df_kesehatan_mental = pd.concat([df3, df6], ignore_index=True)\n", "\n", "# Menggabungkan semua dataframe menjadi satu\n", "merge_df = pd.concat([df_finansial, df_gaji, df_kesehatan_mental], ignore_index=True)\n", "\n", "# Menyimpan ke CSV\n", "merge_df.to_csv('data-analisis/datasets-kotor.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [] } ], "source": [ "df = pd.read_csv('data-analisis/datasets-kotor.csv')\n", "df.count()" ] }, { "cell_type": "code", "execution_count": 4, 
"metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [] } ], "source": [ "df = pd.read_csv('data-analisis/datasets-kotor.csv')\n", "df.count()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "#drop column is not needed\n", "df.drop(df.columns[[0,2,4,5,6,7,8,9,10,11,12,11,13,14]], axis=1, inplace=True)\n", "\n", "df.to_csv('data-analisis/datasets.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [] } ], "source": [ "df.count()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [] } ], "source": [ "# drop row have duplicate value\n", "df.drop_duplicates(subset=['full_text'], inplace=True)\n", "\n", "#drop row have missing value\n", "df.dropna(subset=['full_text'], inplace=True)\n", "\n", "df.to_csv('data-analisis/datasets.csv', index=False)\n", "df.count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## cleansing data" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "df = pd.read_csv('data-analisis/datasets.csv')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [] } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [] } ], "source": [ "# cleansing data\n", "import re\n", "import string\n", "\n", "def clean_text(text):\n", " text = re.sub(r'@[A-Za-z0-9]+', '', text) # delete mention\n", " text = re.sub(r'#', '', text) # delete hastag\n", " text = re.sub(r'RT[\\s]+', '', text) # delate RT\n", " text = re.sub(r'https?:\\/\\/\\S+', '', text) # delete hyperlink\n", " text = re.sub(r'\\n', '', text) # delete new line\n", " text = re.sub(r'\\d+', '', text) # delete number\n", " text = re.sub(r'[^A-Za-z ]+', '', text) # delete non alphabet\n", " \n", " text = text.replace('…', '') # delete ellipsis\n", " text = text.translate(str.maketrans('', '', string.punctuation)) # delete punctuation\n", " text = text.strip() # delete space\n", " return text\n", "\n", "df ['cleanning_text'] = df['full_text'].apply(clean_text)\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## case folding" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [] } ], "source": [ "# case folding\n", "def case_folding(text):\n", " text = text.lower() # change to lower case\n", " return text\n", "\n", "df['case_folding'] = df['cleanning_text'].apply(case_folding)\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## convert slang word" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [] } ], "source": [ "# convert slang word\n", "slang_words = pd.read_csv('https://raw.githubusercontent.com/nasalsabila/kamus-alay/refs/heads/master/colloquial-indonesian-lexicon.csv')\n", "slang_words_dict = dict(zip(slang_words['slang'], slang_words['formal']))\n", "\n", "# Fungsi untuk mengonversi slang word\n", "def convert_slang_word(text):\n", " return ' '.join([slang_words_dict.get(word, word) for word in text.split()])\n", "\n", "# Menerapkan fungsi ke kolom 'case_folding'\n", "df['convert_slang_word'] = 
{ "cell_type": "markdown", "metadata": {}, "source": [ "## stop word removal" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# Stop word removal (an NLTK Indonesian stop word list could be used instead of Sastrawi)\n", "from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory\n", "\n", "# Build the Sastrawi stop word list once, outside the function\n", "factory = StopWordRemoverFactory()\n", "stop_words = set(factory.get_stop_words())\n", "\n", "def filtering(text):\n", "    word_tokens = text.split()\n", "    text = [word for word in word_tokens if word not in stop_words]\n", "    text = ' '.join(text)\n", "    return text\n", "\n", "df['filtering'] = df['convert_slang_word'].apply(filtering)\n", "df" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## tokenizing" ] },
{ "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "import nltk\n", "nltk.download('punkt')  # needed once for word_tokenize" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# Tokenizing\n", "from nltk.tokenize import word_tokenize\n", "\n", "def tokenizing(text):\n", "    text = word_tokenize(text)\n", "    return text\n", "\n", "df['tokenizing'] = df['filtering'].apply(tokenizing)\n", "df" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## stemming" ] },
{ "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "# Stemming with Sastrawi\n", "from Sastrawi.Stemmer.StemmerFactory import StemmerFactory\n", "\n", "# Create the stemmer\n", "factory = StemmerFactory()\n", "stemmer = factory.create_stemmer()\n", "\n", "def stem_text(tokens):\n", "    text = ' '.join(tokens)  # join the tokens back into a single string\n", "    return stemmer.stem(text)\n", "\n", "df['stemming'] = df['tokenizing'].apply(stem_text)\n", "df" ] },
{ "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# Save to CSV\n", "df.to_csv('data-analisis/datasets.csv', index=False)" ] },
{ "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "df.info()" ] },
{ "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "df.drop(df.columns[[1, 2, 3, 4, 5, 6]], axis=1, inplace=True)\n", "\n", "# Rename column 'stemming' to 'full_text'\n", "df.rename(columns={'stemming': 'full_text'}, inplace=True)\n", "\n", "df.to_csv('data-analisis/datasets-clean.csv', index=False)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Data Labeling" ] },
{ "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "df = pd.read_csv('data-analisis/datasets-clean.csv')\n", "df.head()" ] },
{ "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "df.info()" ] },
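{ "cell_type": "markdown", "metadata": {}, "source": [ "The labeling below uses the InSet lexicon in a simple counting way: for a text $t$, the score is the number of its tokens found in the positive word list minus the number found in the negative word list,\n", "\n", "$$\\mathrm{score}(t) = n_{pos}(t) - n_{neg}(t),$$\n", "\n", "and the label is 'Positif' when the score is positive, 'Negatif' when it is negative, and 'Netral' otherwise. The InSet polarity weights are not used at this stage; the dashboard export section later sums those weights instead." ] },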
{}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [] } ], "source": [ "import pandas as pd\n", "\n", "#unduh kamus inset lexicon positif dan negatif\n", "positive_url = \"https://raw.githubusercontent.com/fajri91/InSet/master/positive.tsv\"\n", "negative_url = \"https://raw.githubusercontent.com/fajri91/InSet/master/negative.tsv\"\n", "\n", "positive_lexicon = set(pd.read_csv(positive_url, sep='\\t', header=None)[0])\n", "negative_lexicon = set(pd.read_csv(negative_url, sep='\\t', header=None)[0])\n", "\n", "#fungsi menghitung skor sentimen\n", "def determine_sentiment(text):\n", " if isinstance(text, str):\n", " positive_count = sum(1 for word in text.split() if word in positive_lexicon)\n", " negative_count = sum(1 for word in text.split() if word in negative_lexicon)\n", " sentiment_score = positive_count - negative_count\n", " if sentiment_score > 0:\n", " sentiment = 'Positif'\n", " elif sentiment_score < 0:\n", " sentiment = 'Negatif'\n", " else:\n", " sentiment = 'Netral'\n", " return sentiment_score, sentiment\n", " return 0, \"netral\"\n", " \n", "#menerapkan perhitungan ke datasets\n", "df[['score', 'label']] = df['full_text'].apply(lambda x: pd.Series(determine_sentiment(x)))\n", "\n", "df.to_csv('data-analisis/datasets-label.csv', index=False)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [] } ], "source": [ "# Menghitung jumlah label\n", "df['label'].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Ekstraksi Fitur" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [] } ], "source": [ "import pandas as pd\n", "#Membaca dataset yang sudah diberi label\n", "df = pd.read_csv('data-analisis/datasets-label.csv')\n", "df['label'].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "versi yutub" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "import numpy as np\n", "\n", "#menggunakan TFidVectorizer untuk menghitung TF - IDF\n", "tfidf_vectorizer = TfidfVectorizer()\n", "tfidf_matrix = tfidf_vectorizer.fit_transform(df['full_text'])\n", "\n", "#menghitung IDF\n", "term = tfidf_vectorizer.get_feature_names_out()\n", "idf = np.log(tfidf_matrix.shape[0] / (np.count_nonzero(tfidf_matrix.toarray(), axis=0) + 1))\n", "\n", "#membuat data frame untuk menyimpan term dan idf\n", "tfidf_df = pd.DataFrame({'term' : term, 'idf' : idf})\n", "\n", "#tambah ke kolom TF data frame\n", "for i, doc in enumerate(df['full_text']):\n", " tf = tfidf_matrix[i].toarray().flatten()\n", " tfidf_df[f'tf_{i}'] = tf\n", " \n", "tfidf_df.to_csv('REAL-DATA/datasets-tfidfshow.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "versi riset" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [] } ], "source": [ "import pandas as pd\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", "df = pd.read_csv('data-analisis/datasets-label.csv')\n", "# Convert text to vectors using TF-IDF\n", "tfidf_vectorizer = TfidfVectorizer()\n", "x = tfidf_vectorizer.fit_transform(df['full_text'])\n", "\n", "tfidf = x.toarray()\n", "print(tfidf[:1])\n", "\n", "# Convert the array to a DataFrame\n", "tfidf_df = 
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Data Balancing" ] },
{ "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "df.isna().sum()  # check for null values" ] },
{ "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "tfidf_data = pd.read_csv(\"data-analisis/datasets-tfidf.csv\")\n", "labels_data = pd.read_csv(\"data-analisis/datasets-label.csv\")\n", "\n", "# # Check for null values in both datasets\n", "# print(\"Null values in tfidf_data:\")\n", "# print(tfidf_data.isna().sum())\n", "# print(\"\\nNull values in labels_data:\")\n", "# print(labels_data.isna().sum())\n", "\n", "# Drop rows with null values in labels_data\n", "labels_data = labels_data.dropna(subset=['label'])\n", "\n", "# Merge the TF-IDF features with the labels\n", "data = pd.concat([tfidf_data, labels_data['label']], axis=1)\n", "data.to_csv('data-analisis/datasets-balance.csv', index=False)\n", "print(data.head())" ] },
{ "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('data-analisis/datasets-balance.csv')\n", "df['label'].value_counts()" ] },
{ "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "# Import the required libraries\n", "import pandas as pd\n", "from imblearn.over_sampling import SMOTE\n", "\n", "# Read the combined dataset\n", "df_combined = pd.read_csv('data-analisis/datasets-balance.csv')\n", "\n", "# Separate features and label\n", "X = df_combined.drop(columns=['label'])  # drop the label column\n", "y = df_combined['label']  # take the label column\n", "\n", "# Use SMOTE to balance the classes\n", "smote = SMOTE(random_state=42)\n", "X_resampled, y_resampled = smote.fit_resample(X, y)\n", "\n", "# Show the class counts before and after balancing\n", "print(\"Class counts before SMOTE:\")\n", "print(y.value_counts())\n", "print(\"\\nClass counts after SMOTE:\")\n", "print(y_resampled.value_counts())\n", "\n", "# Save the balanced dataset to a CSV file\n", "balanced_df = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.DataFrame(y_resampled, columns=['label'])], axis=1)\n", "balanced_df.to_csv('data-analisis/datasets-balanced.csv', index=False)" ] },
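{ "cell_type": "markdown", "metadata": {}, "source": [ "SMOTE balances the classes by creating synthetic minority-class points rather than duplicating existing rows: for a minority sample $x_i$ it picks one of its $k$ nearest minority-class neighbors $x_{zi}$ (with $k = 5$ by default in imblearn) and interpolates\n", "\n", "$$x_{new} = x_i + \\lambda \\, (x_{zi} - x_i), \\qquad \\lambda \\sim U(0, 1),$$\n", "\n", "so the synthetic TF-IDF vectors lie on line segments between existing minority-class tweets." ] },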
{ "cell_type": "markdown", "metadata": {}, "source": [ "# The modeling stage is done in a separate notebook" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Export Data for the Dashboard" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from nltk.tokenize import word_tokenize\n", "from collections import Counter\n", "\n", "df = pd.read_csv('REAL-DATA/datasets-clean.csv')\n", "\n", "def create_word_count_table(df, text_column):\n", "    # Tokenize the text data\n", "    df['tokens'] = df[text_column].apply(word_tokenize)\n", "\n", "    # Flatten the list of tokens and count the occurrences of each word\n", "    all_tokens = [token for sublist in df['tokens'] for token in sublist]\n", "    word_counts = Counter(all_tokens)\n", "\n", "    # Convert the word counts to a DataFrame\n", "    word_count_df = pd.DataFrame(word_counts.items(), columns=['word', 'count'])\n", "\n", "    return word_count_df\n", "\n", "word_tokenize_df = create_word_count_table(df, 'full_text')\n", "word_tokenize_df.to_csv('word_count_result.csv', index=False)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('word_count_result.csv')\n", "\n", "df.isnull().sum()\n", "\n", "# Find rows that have NaN or empty values\n", "rows_with_nan = df[df.isnull().any(axis=1)]\n", "\n", "# Show the rows that have NaN or empty values\n", "# print(rows_with_nan)\n", "\n", "df.dropna(inplace=True)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('word_count_result.csv')\n", "\n", "positive_lexicon = pd.read_csv('InSet/positive.tsv', sep='\\t', header=None)\n", "negative_lexicon = pd.read_csv('InSet/negative.tsv', sep='\\t', header=None)\n", "\n", "# Name the lexicon columns\n", "positive_lexicon.columns = ['kata', 'polaritas']\n", "negative_lexicon.columns = ['kata', 'polaritas']\n", "\n", "# Make sure the polarity column is numeric\n", "positive_lexicon['polaritas'] = pd.to_numeric(positive_lexicon['polaritas'], errors='coerce')\n", "negative_lexicon['polaritas'] = pd.to_numeric(negative_lexicon['polaritas'], errors='coerce')\n", "\n", "# Combine the positive and negative lexicons\n", "lexicon = pd.concat([positive_lexicon, negative_lexicon])\n", "\n", "# Convert the lexicon to a dictionary for fast lookup\n", "lexicon_dict = dict(zip(lexicon['kata'], lexicon['polaritas']))\n", "\n", "# Function to score a text against the lexicon dictionary\n", "def label(tweet, lexicon_dict):\n", "    words = tweet.split()  # split the tweet into words\n", "    sentiment_score = 0  # initialize the sentiment score\n", "\n", "    # Sum the polarity weights of the words found in the lexicon\n", "    for word in words:\n", "        sentiment = lexicon_dict.get(word, 0)  # polarity from the dictionary, 0 if not found\n", "        sentiment_score += sentiment\n", "\n", "    # Assign a label based on the total polarity score\n", "    if sentiment_score > 0:\n", "        return 'positif', sentiment_score\n", "    elif sentiment_score < 0:\n", "        return 'negatif', sentiment_score\n", "    else:\n", "        return 'netral', sentiment_score\n", "\n", "# Handle NaN values in the 'word' column\n", "df['word'].fillna('', inplace=True)\n", "\n", "df[['label', 'score']] = df['word'].apply(lambda x: pd.Series(label(x, lexicon_dict)))\n", "df.to_csv('word_count_labeled.csv', index=False)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "df = pd.read_csv('word_count_labeled.csv')\n", "\n", "df['label'].value_counts()\n", "\n", "# Most frequent word for each label\n", "df.groupby('label').apply(lambda x: x.loc[x['count'].idxmax()])" ] },
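{ "cell_type": "markdown", "metadata": {}, "source": [ "As a lightweight text alternative to the word clouds in the commented-out cell below, the sketch here lists the ten most frequent words per sentiment label from the exported word counts." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sketch: top 10 words per label from word_count_labeled.csv\n", "import pandas as pd\n", "\n", "wc = pd.read_csv('word_count_labeled.csv')\n", "top_words = (wc.sort_values('count', ascending=False)\n", "               .groupby('label')\n", "               .head(10)\n", "               .sort_values(['label', 'count'], ascending=[True, False]))\n", "print(top_words)" ] },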
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# import pandas as pd\n", "# from wordcloud import WordCloud, get_single_color_func\n", "# import matplotlib.pyplot as plt\n", "\n", "# df = pd.read_csv('word_count_labeled.csv')\n", "\n", "# # Show the label counts\n", "# # print(df['label'].value_counts())\n", "\n", "# # Function to build and display a word cloud colored by label\n", "# def plot_word_cloud(label, color):\n", "#     words = df[df['label'] == label].set_index('word')['count'].to_dict()\n", "#     wordcloud = WordCloud(width=800, height=400, background_color='white', color_func=get_single_color_func(color)).generate_from_frequencies(words)\n", "\n", "#     plt.figure(figsize=(10, 5))\n", "#     plt.imshow(wordcloud, interpolation='bilinear')\n", "#     plt.title(f'Word Cloud for {label} words')\n", "#     plt.axis('off')\n", "#     plt.show()\n", "\n", "# # Display a word cloud for each label with its corresponding color\n", "# label_colors = {\n", "#     'positif': 'green',\n", "#     'negatif': 'red',\n", "#     'netral': 'gray'\n", "# }\n", "\n", "# for label in df['label'].unique():\n", "#     plot_word_cloud(label, label_colors[label])" ] },
{ "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "eval_svm = pd.read_csv('HASIL-RISET/evaluation_results_SVM-new.csv')\n", "eval_nb = pd.read_csv('HASIL-RISET/evaluation_results_nb-new.csv')\n", "eval_knn = pd.read_csv('HASIL-RISET/evaluation_results_knn-new.csv')\n", "\n", "# Combine the evaluation results\n", "# Add a 'model' column to each DataFrame\n", "eval_svm['model'] = 'svm'\n", "eval_nb['model'] = 'nb'\n", "eval_knn['model'] = 'knn'\n", "\n", "# Reorder the columns so that 'model' comes first\n", "svm = eval_svm[['model'] + [col for col in eval_svm.columns if col != 'model']]\n", "nb = eval_nb[['model'] + [col for col in eval_nb.columns if col != 'model']]\n", "knn = eval_knn[['model'] + [col for col in eval_knn.columns if col != 'model']]\n", "\n", "# Concatenate all DataFrames\n", "combined_df = pd.concat([svm, nb, knn], axis=0, ignore_index=True)\n", "\n", "# Show the result\n", "print(combined_df.head())\n", "\n", "combined_df.to_csv('HASIL-RISET/evaluation_results_combine.csv', index=False)" ] }
], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" } }, "nbformat": 4, "nbformat_minor": 2 }