MIF_E31222492/Pengumpulan Data/Tugas Akhir.ipynb

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Pengumpulan Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#@title Twitter Auth Token\n",
"\n",
"twitter_auth_token = 'ccb0a6dc47226f7ddc27a7a0975107094c8899e3'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Install Node.js\n",
"%sudo apt-get update\n",
"%sudo apt-get install -y ca-certificates curl gnupg\n",
"%sudo mkdir -p /etc/apt/keyrings\n",
"%curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg\n",
"\n",
"%NODE_MAJOR=20 && echo \"deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_$NODE_MAJOR.x nodistro main\" | sudo tee /etc/apt/sources.list.d/nodesource.list\n",
"\n",
"%sudo apt-get update\n",
"%sudo apt-get install nodejs -y\n",
"\n",
"%node -v"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Crawl Data\n",
"import os\n",
"\n",
"# List keyword untuk digunakan\n",
"keywords = [\n",
" 'gaji gen z lang:id until:2024-12-31 since:2020-01-01',\n",
" 'kesehatan mental generasi z lang:id until:2024-12-31 since:2020-01-01'\n",
" 'finansial gen z lang:id until:2024-12-31 since:2020-01-01'\n",
"]\n",
"\n",
"# Filename untuk setiap keyword\n",
"filenames = [\n",
" 'dataset-gaji-gen-z.csv',\n",
" 'dataset_kesehatan_mental_generasi_z.csv',\n",
" 'dataset_finansial_gen_z.csv'\n",
"]\n",
"\n",
"# Limit data yang ingin dikumpulkan\n",
"limit = 50000\n",
"\n",
"# Lakukan crawling untuk setiap keyword\n",
"for keyword, filename in zip(keywords, filenames):\n",
" os.system(f'npx -y tweet-harvest@2.6.1 -o \"{filename}\" -s \"{keyword}\" --tab \"LATEST\" -l {limit} --token {twitter_auth_token}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Pra-pemrosesan data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Membaca dataset\n",
"df1 = pd.read_csv('REAL-DATA/REAL-dataset_finansial_gen_z.csv')\n",
"df2 = pd.read_csv('REAL-DATA/REAL-dataset_gaji_gen_z.csv')\n",
"df3 = pd.read_csv('REAL-DATA/REAL-dataset_kesehatan_mental_generasi_z.csv')\n",
"df4 = pd.read_csv('data matang/dataset_finansial_gen_z.csv')\n",
"df5 = pd.read_csv('data matang/dataset_gaji_gen_z.csv')\n",
"df6 = pd.read_csv('data matang/dataset_kesehatan_mental_generasi_z-TGL.csv')\n",
"\n",
"# Menambahkan kolom keyword\n",
"df1['keyword'] = 'finansial gen z'\n",
"df2['keyword'] = 'gaji gen z'\n",
"df3['keyword'] = 'kesehatan mental generasi z'\n",
"df4['keyword'] = 'finansial gen z'\n",
"df5['keyword'] = 'gaji gen z'\n",
"df6['keyword'] = 'kesehatan mental generasi z'\n",
"\n",
"# Menggabungkan data set berdasarkan keyword\n",
"df_finansial = pd.concat([df1, df4], ignore_index=True)\n",
"df_gaji = pd.concat([df2, df5], ignore_index=True)\n",
"df_kesehatan_mental = pd.concat([df3, df6], ignore_index=True)\n",
"\n",
"# Menggabungkan semua dataframe menjadi satu\n",
"merge_df = pd.concat([df_finansial, df_gaji, df_kesehatan_mental], ignore_index=True)\n",
"\n",
"# Menyimpan ke CSV\n",
"merge_df.to_csv('data-analisis/datasets-kotor.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"df = pd.read_csv('data-analisis/datasets-kotor.csv')\n",
"df.count()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"#drop column is not needed\n",
"df.drop(df.columns[[0,2,4,5,6,7,8,9,10,11,12,11,13,14]], axis=1, inplace=True)\n",
"\n",
"df.to_csv('data-analisis/datasets.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"df.count()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"# drop row have duplicate value\n",
"df.drop_duplicates(subset=['full_text'], inplace=True)\n",
"\n",
"#drop row have missing value\n",
"df.dropna(subset=['full_text'], inplace=True)\n",
"\n",
"df.to_csv('data-analisis/datasets.csv', index=False)\n",
"df.count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## cleansing data"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"df = pd.read_csv('data-analisis/datasets.csv')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"# cleansing data\n",
"import re\n",
"import string\n",
"\n",
"def clean_text(text):\n",
" text = re.sub(r'@[A-Za-z0-9]+', '', text) # delete mention\n",
" text = re.sub(r'#', '', text) # delete hastag\n",
" text = re.sub(r'RT[\\s]+', '', text) # delate RT\n",
" text = re.sub(r'https?:\\/\\/\\S+', '', text) # delete hyperlink\n",
" text = re.sub(r'\\n', '', text) # delete new line\n",
" text = re.sub(r'\\d+', '', text) # delete number\n",
" text = re.sub(r'[^A-Za-z ]+', '', text) # delete non alphabet\n",
" \n",
" text = text.replace('…', '') # delete ellipsis\n",
" text = text.translate(str.maketrans('', '', string.punctuation)) # delete punctuation\n",
" text = text.strip() # delete space\n",
" return text\n",
"\n",
"df ['cleanning_text'] = df['full_text'].apply(clean_text)\n",
"df.head()"
]
},
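{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of `clean_text` on a made-up sample tweet (not from the crawled data), to confirm that mentions, retweet markers, links, digits, and symbols are stripped as intended."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustration only: a hypothetical tweet, not taken from the dataset\n",
"sample = 'RT @user123: gaji gen z cuma 5 juta?? cek https://t.co/abc #genz'\n",
"print(clean_text(sample)) # mention, RT, URL, digits, and symbols removed"
]
},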
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## case folding"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"# case folding\n",
"def case_folding(text):\n",
" text = text.lower() # change to lower case\n",
" return text\n",
"\n",
"df['case_folding'] = df['cleanning_text'].apply(case_folding)\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## convert slang word"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"# convert slang word\n",
"slang_words = pd.read_csv('https://raw.githubusercontent.com/nasalsabila/kamus-alay/refs/heads/master/colloquial-indonesian-lexicon.csv')\n",
"slang_words_dict = dict(zip(slang_words['slang'], slang_words['formal']))\n",
"\n",
"# Fungsi untuk mengonversi slang word\n",
"def convert_slang_word(text):\n",
" return ' '.join([slang_words_dict.get(word, word) for word in text.split()])\n",
"\n",
"# Menerapkan fungsi ke kolom 'case_folding'\n",
"df['convert_slang_word'] = df['case_folding'].apply(convert_slang_word)\n",
"df"
]
},
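{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small check of `convert_slang_word` on a made-up sentence. Which words get replaced depends entirely on the entries in the kamus-alay lexicon; unknown words pass through unchanged."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustration only: replacements depend on the lexicon contents\n",
"print(convert_slang_word('gaji gw gk cukup buat hidup'))"
]
},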
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Stop word"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"# Stop word\n",
"# from nltk.corpus import stopwords\n",
"\n",
"# def filtering(text):\n",
"# stop_words = set(stopwords.words('indonesian'))\n",
"# word_tokens = text.split()\n",
"# text = [word for word in word_tokens if word not in stop_words]\n",
"# text = ' '.join(text)\n",
"# return text\n",
"\n",
"# df['filtering'] = df['convert_slang_word'].apply(filtering)\n",
"# df\n",
"\n",
"# Stop word\n",
"from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory\n",
"\n",
"def filtering(text):\n",
" factory = StopWordRemoverFactory()\n",
" stop_words = set(factory.get_stop_words())\n",
" word_tokens = text.split()\n",
" text = [word for word in word_tokens if word not in stop_words]\n",
" text = ' '.join(text)\n",
" return text\n",
"\n",
"df['filtering'] = df['convert_slang_word'].apply(filtering)\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## tokenizing"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"# nltk.download('punkt')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"# tokenizing\n",
"from nltk.tokenize import word_tokenize\n",
"\n",
"def tokenizing(text):\n",
" text = word_tokenize(text)\n",
" return text\n",
"\n",
"df['tokenizing'] = df['filtering'].apply(tokenizing)\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## stemming"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"# stemming\n",
"from Sastrawi.Stemmer.StemmerFactory import StemmerFactory\n",
"\n",
"# create stemmer\n",
"factory = StemmerFactory()\n",
"stemmer = factory.create_stemmer()\n",
"\n",
"def stem_text(tokens):\n",
" text = ' '.join(tokens) # Join the tokens back into a single string\n",
" return stemmer.stem(text)\n",
"\n",
"df['stemming'] = df['tokenizing'].apply(stem_text)\n",
"df"
]
},
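{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick look at what the Sastrawi stemmer does to a few affixed Indonesian words, to make the stemming step concrete. The example words are arbitrary; the outputs depend on the Sastrawi dictionary."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustration only: stem a few sample words with the stemmer created above\n",
"for w in ['menyelesaikan', 'pekerjaan', 'kesehatan']:\n",
" print(w, '->', stemmer.stem(w))"
]
},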
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"#save to csv\n",
"df.to_csv('data-analisis/datasets.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"df.drop(df.columns[[1, 2, 3, 4, 5, 6]], axis=1, inplace=True)\n",
"\n",
"# Rename column 'stemming' to 'full_text'\n",
"df.rename(columns={'stemming': 'full_text'}, inplace=True)\n",
"\n",
"df.to_csv('data-analisis/datasets-clean.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Pelabelan Data"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"import pandas as pd\n",
"df = pd.read_csv('data-analisis/datasets-clean.csv')\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"import pandas as pd\n",
"\n",
"#unduh kamus inset lexicon positif dan negatif\n",
"positive_url = \"https://raw.githubusercontent.com/fajri91/InSet/master/positive.tsv\"\n",
"negative_url = \"https://raw.githubusercontent.com/fajri91/InSet/master/negative.tsv\"\n",
"\n",
"positive_lexicon = set(pd.read_csv(positive_url, sep='\\t', header=None)[0])\n",
"negative_lexicon = set(pd.read_csv(negative_url, sep='\\t', header=None)[0])\n",
"\n",
"#fungsi menghitung skor sentimen\n",
"def determine_sentiment(text):\n",
" if isinstance(text, str):\n",
" positive_count = sum(1 for word in text.split() if word in positive_lexicon)\n",
" negative_count = sum(1 for word in text.split() if word in negative_lexicon)\n",
" sentiment_score = positive_count - negative_count\n",
" if sentiment_score > 0:\n",
" sentiment = 'Positif'\n",
" elif sentiment_score < 0:\n",
" sentiment = 'Negatif'\n",
" else:\n",
" sentiment = 'Netral'\n",
" return sentiment_score, sentiment\n",
" return 0, \"netral\"\n",
" \n",
"#menerapkan perhitungan ke datasets\n",
"df[['score', 'label']] = df['full_text'].apply(lambda x: pd.Series(determine_sentiment(x)))\n",
"\n",
"df.to_csv('data-analisis/datasets-label.csv', index=False)\n"
]
},
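{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick check of `determine_sentiment` on a made-up sentence and on a non-string input. The score for any real sentence depends on which words appear in the InSet lexicons."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustration only: the score depends on the InSet lexicon contents\n",
"print(determine_sentiment('gaji kecil bikin stres'))\n",
"print(determine_sentiment(None)) # non-string input falls back to (0, 'Netral')"
]
},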
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"# Menghitung jumlah label\n",
"df['label'].value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Ekstraksi Fitur"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"import pandas as pd\n",
"#Membaca dataset yang sudah diberi label\n",
"df = pd.read_csv('data-analisis/datasets-label.csv')\n",
"df['label'].value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"versi yutub"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"import numpy as np\n",
"\n",
"#menggunakan TFidVectorizer untuk menghitung TF - IDF\n",
"tfidf_vectorizer = TfidfVectorizer()\n",
"tfidf_matrix = tfidf_vectorizer.fit_transform(df['full_text'])\n",
"\n",
"#menghitung IDF\n",
"term = tfidf_vectorizer.get_feature_names_out()\n",
"idf = np.log(tfidf_matrix.shape[0] / (np.count_nonzero(tfidf_matrix.toarray(), axis=0) + 1))\n",
"\n",
"#membuat data frame untuk menyimpan term dan idf\n",
"tfidf_df = pd.DataFrame({'term' : term, 'idf' : idf})\n",
"\n",
"#tambah ke kolom TF data frame\n",
"for i, doc in enumerate(df['full_text']):\n",
" tf = tfidf_matrix[i].toarray().flatten()\n",
" tfidf_df[f'tf_{i}'] = tf\n",
" \n",
"tfidf_df.to_csv('REAL-DATA/datasets-tfidfshow.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"versi riset"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"import pandas as pd\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"df = pd.read_csv('data-analisis/datasets-label.csv')\n",
"# Convert text to vectors using TF-IDF\n",
"tfidf_vectorizer = TfidfVectorizer()\n",
"x = tfidf_vectorizer.fit_transform(df['full_text'])\n",
"\n",
"tfidf = x.toarray()\n",
"print(tfidf[:1])\n",
"\n",
"# Convert the array to a DataFrame\n",
"tfidf_df = pd.DataFrame(tfidf)\n",
"\n",
"# Save the DataFrame to a CSV file\n",
"tfidf_df.to_csv('data-analisis/datasets-tfidf.csv', index=False)\n",
"\n",
"# save to pickle\n",
"# tfidf_df.to_pickle('REAL-DATA/datasets-tfidf.pkl')"
]
},
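{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a worked example of what the vectorizer produces: a tiny two-document toy corpus (made-up, not the thesis data). With scikit-learn's defaults (`smooth_idf=True`, L2 row normalisation), the weight formula is idf(t) = ln((1 + n) / (1 + df(t))) + 1."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Tiny worked example on a made-up 2-document corpus (illustration only)\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"import numpy as np\n",
"\n",
"toy = ['gaji rendah', 'gaji tinggi']\n",
"vec = TfidfVectorizer()\n",
"m = vec.fit_transform(toy)\n",
"\n",
"print(vec.get_feature_names_out()) # ['gaji' 'rendah' 'tinggi']\n",
"print(np.round(m.toarray(), 3))\n",
"# 'gaji' occurs in both docs: idf = ln(3/3) + 1 = 1\n",
"# 'rendah'/'tinggi' occur in one doc: idf = ln(3/2) + 1 ≈ 1.405\n",
"# each row is then L2-normalised"
]
},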
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Balancing"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"df.isna().sum() # to check null values"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"import pandas as pd\n",
"\n",
"tfidf_data = pd.read_csv(\"data-analisis/datasets-tfidf.csv\")\n",
"labels_data = pd.read_csv(\"data-analisis/datasets-label.csv\")\n",
"\n",
"# # Check for null values in both datasets\n",
"# print(\"Null values in tfidf_data:\")\n",
"# print(tfidf_data.isna().sum())\n",
"# print(\"\\nNull values in labels_data:\")\n",
"# print(labels_data.isna().sum())\n",
"\n",
"# Drop rows with null values in labels_data\n",
"labels_data = labels_data.dropna(subset=['label'])\n",
"\n",
"# Merge the TF-IDF features with the labels\n",
"data = pd.concat([tfidf_data, labels_data['label']], axis=1)\n",
"data.to_csv('data-analisis/datasets-balance.csv', index=False)\n",
"print(data.head())"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"df = pd.read_csv('data-analisis/datasets-balance.csv')\n",
"df['label'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"# Import library yang diperlukan\n",
"import pandas as pd\n",
"from imblearn.over_sampling import SMOTE\n",
"\n",
"# Membaca dataset yang sudah digabungkan\n",
"df_combined = pd.read_csv('data-analisis/datasets-balance.csv')\n",
"\n",
"# Memisahkan fitur dan label\n",
"X = df_combined.drop(columns=['label']) # Menghapus kolom label\n",
"y = df_combined['label'] # Mengambil kolom label\n",
"\n",
"# Menggunakan SMOTE untuk melakukan data balancing\n",
"smote = SMOTE(random_state=42)\n",
"X_resampled, y_resampled = smote.fit_resample(X, y)\n",
"\n",
"# Menampilkan jumlah kelas setelah balancing\n",
"print(\"Jumlah kelas sebelum SMOTE:\")\n",
"print(y.value_counts())\n",
"print(\"\\nJumlah kelas setelah SMOTE:\")\n",
"print(y_resampled.value_counts())\n",
"\n",
"# Menyimpan dataset yang sudah di-balance ke file CSV\n",
"balanced_df = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.DataFrame(y_resampled, columns=['label'])], axis=1)\n",
"balanced_df.to_csv('data-analisis/datasets-balanced.csv', index=False)"
]
},
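{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of what SMOTE does, run on a synthetic toy dataset (made-up via `make_classification`, not the thesis features): each minority class gains interpolated synthetic samples until every class matches the majority count."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustration only: SMOTE on a synthetic imbalanced toy dataset\n",
"from collections import Counter\n",
"from sklearn.datasets import make_classification\n",
"from imblearn.over_sampling import SMOTE\n",
"\n",
"X_toy, y_toy = make_classification(n_samples=100, n_classes=3, n_informative=4,\n",
" weights=[0.7, 0.2, 0.1], random_state=42)\n",
"X_bal, y_bal = SMOTE(random_state=42).fit_resample(X_toy, y_toy)\n",
"\n",
"print('before:', Counter(y_toy))\n",
"print('after :', Counter(y_bal)) # all classes raised to the majority count"
]
},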
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Untuk tahapan modeling, dilakukang pada notebook berbeda"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# eksport Data Untuk Dashboard"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from nltk.tokenize import word_tokenize\n",
"from collections import Counter\n",
"\n",
"df=pd.read_csv('REAL-DATA/datasets-clean.csv')\n",
"\n",
"def create_word_count_table(df, text_column):\n",
" # Tokenizing the text data\n",
" df['tokens'] = df[text_column].apply(word_tokenize)\n",
" \n",
" # Flatten the list of tokens and count the occurrences of each word\n",
" all_tokens = [token for sublist in df['tokens'] for token in sublist]\n",
" word_counts = Counter(all_tokens)\n",
" \n",
" # Convert the word counts to a DataFrame\n",
" word_count_df = pd.DataFrame(word_counts.items(), columns=['word', 'count'])\n",
" \n",
" return word_count_df\n",
"\n",
"word_tokenize_df = create_word_count_table(df, 'full_text')\n",
"word_tokenize_df.to_csv('word_count_result.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('word_count_result.csv')\n",
"\n",
"df.isnull().sum()\n",
"\n",
"# Mencari baris yang memiliki nilai NaN atau kosong\n",
"df[df.isnull().any(axis=1)]\n",
"\n",
"# Menampilkan baris yang memiliki nilai NaN atau kosong\n",
"# print(rows_with_nan)\n",
"\n",
"df.dropna(inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('word_count_result.csv')\n",
"\n",
"positive_lexicon = pd.read_csv('InSet/positive.tsv', sep='\\t', header=None)\n",
"negative_lexicon = pd.read_csv('InSet/negative.tsv', sep='\\t', header=None)\n",
"\n",
"# Gabungkan lexicon positif dan negatif\n",
"positive_lexicon.columns = ['kata', 'polaritas']\n",
"negative_lexicon.columns = ['kata', 'polaritas']\n",
"\n",
"# Pastikan kolom polaritas bertipe numerik\n",
"positive_lexicon['polaritas'] = pd.to_numeric(positive_lexicon['polaritas'], errors='coerce')\n",
"negative_lexicon['polaritas'] = pd.to_numeric(negative_lexicon['polaritas'], errors='coerce')\n",
"\n",
"lexicon = pd.concat([positive_lexicon, negative_lexicon])\n",
"\n",
"# Konversi lexicon ke dictionary untuk lookup cepat\n",
"lexicon_dict = dict(zip(lexicon['kata'], lexicon['polaritas']))\n",
"\n",
"# Fungsi untuk memberikan skor pada teks berdasarkan kamus lexicon\n",
"def label(tweet, lexicon_dict):\n",
" words = tweet.split() # Pisahkan tweet menjadi kata-kata\n",
" sentiment_score = 0 # Inisialisasi skor sentimen\n",
"\n",
" # Hitung skor sentimen berdasarkan kata-kata dalam lexicon\n",
" for word in words:\n",
" sentiment = lexicon_dict.get(word, 0) # Ambil polaritas dari dictionary, default 0 jika tidak ditemukan\n",
" sentiment_score += sentiment\n",
"\n",
" # Berikan label berdasarkan skor polaritas total\n",
" if sentiment_score > 0:\n",
" return 'positif', sentiment_score\n",
" elif sentiment_score < 0:\n",
" return 'negatif', sentiment_score\n",
" else:\n",
" return 'netral', sentiment_score\n",
"\n",
"# Handle NaN values in the 'word' column\n",
"df['word'].fillna('', inplace=True)\n",
"\n",
"df['label', 'score'] = df['word'].apply(lambda x: pd.Series(label(x, lexicon_dict)))\n",
"df.to_csv('word_count_labeled.csv', index=False)"
]
},
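{
"cell_type": "markdown",
"metadata": {},
"source": [
"A one-word check of the `label` scorer. The polarity comes straight from the InSet dictionary, so the exact score depends on the lexicon entry for the word (the example word is arbitrary)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustration only: score a single word; the value depends on the InSet entry\n",
"print(label('senang', lexicon_dict))\n",
"print(label('', lexicon_dict)) # an empty string scores 0 -> 'netral'"
]
},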
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv('word_count_labeled.csv')\n",
"\n",
"df['label'].value_counts()\n",
"\n",
"df.groupby('label').apply(lambda x: x.loc[x['count'].idxmax()])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# import pandas as pd\n",
"# from wordcloud import WordCloud, get_single_color_func\n",
"# import matplotlib.pyplot as plt\n",
"\n",
"# df = pd.read_csv('word_count_labeled.csv')\n",
"\n",
"# # Menampilkan jumlah label\n",
"# # print(df['label'].value_counts())\n",
"\n",
"# # Fungsi untuk membuat dan menampilkan Word Cloud dengan warna berdasarkan label\n",
"# def plot_word_cloud(label, color):\n",
"# words = df[df['label'] == label].set_index('word')['count'].to_dict()\n",
"# wordcloud = WordCloud(width=800, height=400, background_color='white', color_func=get_single_color_func(color)).generate_from_frequencies(words)\n",
" \n",
"# plt.figure(figsize=(10, 5))\n",
"# plt.imshow(wordcloud, interpolation='bilinear')\n",
"# plt.title(f'Word Cloud for {label} words')\n",
"# plt.axis('off')\n",
"# plt.show()\n",
"\n",
"# # Menampilkan Word Cloud untuk setiap label dengan warna yang sesuai\n",
"# label_colors = {\n",
"# 'positif': 'green',\n",
"# 'negatif': 'red',\n",
"# 'netral': 'gray'\n",
"# }\n",
"\n",
"# for label in df['label'].unique():\n",
"# plot_word_cloud(label, label_colors[label])"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": []
}
],
"source": [
"import pandas as pd\n",
"\n",
"eval_svm = pd.read_csv('HASIL-RISET/evaluation_results_SVM-new.csv')\n",
"eval_nb = pd.read_csv('HASIL-RISET/evaluation_results_nb-new.csv')\n",
"eval_knn = pd.read_csv('HASIL-RISET/evaluation_results_knn-new.csv')\n",
"\n",
"# penggabungan data evaluation\n",
"# Menambahkan kolom 'model' ke setiap DataFrame\n",
"eval_svm['model'] = 'svm'\n",
"eval_nb['model'] = 'nb'\n",
"eval_knn['model'] = 'knn'\n",
"\n",
"# Mengatur ulang kolom agar 'model' menjadi kolom pertama\n",
"svm = eval_svm[['model'] + [col for col in eval_svm.columns if col != 'model']]\n",
"nb = eval_nb[['model'] + [col for col in eval_nb.columns if col != 'model']]\n",
"knn = eval_knn[['model'] + [col for col in eval_knn.columns if col != 'model']]\n",
"\n",
"# Menggabungkan semua DataFrame\n",
"combined_df = pd.concat([svm, nb, knn], axis=0, ignore_index=True)\n",
"\n",
"# Menampilkan hasil\n",
"print(combined_df.head())\n",
"\n",
"combined_df.to_csv('HASIL-RISET/evaluation_results_combine.csv', index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}