194 lines
10 KiB
Python
194 lines
10 KiB
Python
import math
|
|
import numpy as np
|
|
|
|
# Data dalam bentuk dictionary
|
|
datas = [
|
|
{'id_product': '35509867', 'sold': 367, 'rating_score': 4.897, 'count_review': 198, 'cluster': 0},
|
|
{'id_product': '35509877', 'sold': 0, 'rating_score': 0, 'count_review': 0, 'cluster': 0},
|
|
{'id_product': '35507877', 'sold': 1250, 'rating_score': 4.567, 'count_review': 987, 'cluster': 0},
|
|
{'id_product': '35589857', 'sold': 20, 'rating_score': 4.9, 'count_review': 18, 'cluster': 0},
|
|
{'id_product': '35309877', 'sold': 128, 'rating_score': 4.708, 'count_review': 99, 'cluster': 0},
|
|
{'id_product': '35509887', 'sold': 2008, 'rating_score': 4.67, 'count_review': 1702, 'cluster': 0},
|
|
{'id_product': '33569877', 'sold': 367, 'rating_score': 4.897, 'count_review': 0, 'cluster': 0},
|
|
{'id_product': '35509877', 'sold': 30, 'rating_score': 4.897, 'count_review': 19, 'cluster': 0},
|
|
{'id_product': '35519807', 'sold': 1, 'rating_score': 0, 'count_review': 0, 'cluster': 0},
|
|
{'id_product': '35539872', 'sold': 188, 'rating_score': 4.755, 'count_review': 184, 'cluster': 0},
|
|
{'id_product': '35509571', 'sold': 0, 'rating_score': 0, 'count_review': 0, 'cluster': 0},
|
|
{'id_product': '35709677', 'sold': 3, 'rating_score': 5, 'count_review': 2, 'cluster': 0},
|
|
{'id_product': '35579822', 'sold': 45, 'rating_score': 4.865, 'count_review': 24, 'cluster': 0},
|
|
{'id_product': '35509867', 'sold': 367, 'rating_score': 4.897, 'count_review': 198, 'cluster': 0},
|
|
{'id_product': '35509877', 'sold': 340, 'rating_score': 4.634, 'count_review': 298, 'cluster': 0},
|
|
{'id_product': '35507877', 'sold': 1098, 'rating_score': 4.497, 'count_review': 1023, 'cluster': 0},
|
|
{'id_product': '35589857', 'sold': 16, 'rating_score': 4.9, 'count_review': 12, 'cluster': 0},
|
|
{'id_product': '35309877', 'sold': 156, 'rating_score': 4.8, 'count_review': 88, 'cluster': 0},
|
|
{'id_product': '35509887', 'sold': 10, 'rating_score': 4.8, 'count_review': 9, 'cluster': 0},
|
|
{'id_product': '33569877', 'sold': 2, 'rating_score': 0, 'count_review': 0, 'cluster': 0},
|
|
{'id_product': '35509877', 'sold': 29, 'rating_score': 4.897, 'count_review': 9, 'cluster': 0},
|
|
{'id_product': '35519807', 'sold': 1, 'rating_score': 0, 'count_review': 0, 'cluster': 0},
|
|
{'id_product': '35539872', 'sold': 2409, 'rating_score': 4.655, 'count_review': 1902, 'cluster': 0},
|
|
{'id_product': '35509571', 'sold': 0, 'rating_score': 0, 'count_review': 0, 'cluster': 0},
|
|
{'id_product': '35709677', 'sold': 15, 'rating_score': 5, 'count_review': 10, 'cluster': 0},
|
|
{'id_product': '35579822', 'sold': 72, 'rating_score': 4.865, 'count_review': 34, 'cluster': 0},
|
|
{'id_product': '35539872', 'sold': 208, 'rating_score': 4.755, 'count_review': 50, 'cluster': 0},
|
|
{'id_product': '35509571', 'sold': 4, 'rating_score': 5, 'count_review': 2, 'cluster': 0},
|
|
{'id_product': '35709677', 'sold': 13, 'rating_score': 4.9, 'count_review': 6, 'cluster': 0},
|
|
{'id_product': '35579822', 'sold': 82, 'rating_score': 4.885, 'count_review': 44, 'cluster': 0},
|
|
{'id_product': '35509867', 'sold': 450, 'rating_score': 4.797, 'count_review': 345, 'cluster': 0},
|
|
{'id_product': '35509877', 'sold': 1, 'rating_score': 5, 'count_review': 1, 'cluster': 0},
|
|
{'id_product': '35507877', 'sold': 1198, 'rating_score': 4.597, 'count_review': 903, 'cluster': 0},
|
|
{'id_product': '35589857', 'sold': 16, 'rating_score': 4.9, 'count_review': 12, 'cluster': 0},
|
|
{'id_product': '35309877', 'sold': 156, 'rating_score': 4.8, 'count_review': 88, 'cluster': 0},
|
|
{'id_product': '35509887', 'sold': 10, 'rating_score': 4.8, 'count_review': 9, 'cluster': 0},
|
|
{'id_product': '33569877', 'sold': 2, 'rating_score': 0, 'count_review': 0, 'cluster': 0},
|
|
{'id_product': '35509877', 'sold': 29, 'rating_score': 4.897, 'count_review': 9, 'cluster': 0},
|
|
{'id_product': '35519807', 'sold': 1, 'rating_score': 0, 'count_review': 0, 'cluster': 0},
|
|
{'id_product': '35539872', 'sold': 1569, 'rating_score': 4.655, 'count_review': 1202, 'cluster': 0},
|
|
{'id_product': '35509571', 'sold': 567, 'rating_score': 4.876, 'count_review': 450, 'cluster': 0},
|
|
{'id_product': '35709677', 'sold': 55, 'rating_score': 4.895, 'count_review': 23, 'cluster': 0},
|
|
{'id_product': '35579822', 'sold': 92, 'rating_score': 4.587, 'count_review': 86, 'cluster': 0},
|
|
# ... tambahkan data lainnya jika ada
|
|
]
|
|
|
|
def cluster(data) :
|
|
# Fitur yang akan digunakan untuk clustering
|
|
features = ['sold', 'rating_score', 'count_review']
|
|
|
|
# Jumlah kluster yang diinginkan
|
|
num_clusters = 3
|
|
|
|
# Pusat kluster awal yang Anda tentukan (dalam bentuk list of dictionaries)
|
|
initial_centers = [
|
|
{'sold': 15, 'rating_score': 4.8, 'count_review': 10},
|
|
{'sold': 50, 'rating_score': 4.9, 'count_review': 50},
|
|
{'sold': 100, 'rating_score': 5, 'count_review': 100}
|
|
]
|
|
# Mengubah initial_centers menjadi satu list yang berisi nilai fitur secara berurutan
|
|
centers = np.array([initial_centers[i][feature] for feature in features for i in range(num_clusters)])
|
|
|
|
# Maksimum iterasi
|
|
max_iters = 100
|
|
|
|
for _ in range(max_iters):
|
|
# Menyimpan hasil kluster untuk setiap data point
|
|
cluster_assignments = []
|
|
|
|
# Assign data point ke kluster terdekat
|
|
for item in data:
|
|
distances = []
|
|
for center_idx in range(num_clusters):
|
|
distance = 0
|
|
for feature in features:
|
|
distance += (item[feature] - centers[center_idx * len(features) + features.index(feature)]) ** 2
|
|
distances.append(math.sqrt(distance))
|
|
cluster_assignments.append(distances.index(min(distances)))
|
|
|
|
# Menghitung pusat baru untuk setiap kluster
|
|
new_centers = np.zeros((num_clusters, len(features)))
|
|
cluster_counts = np.zeros(num_clusters)
|
|
for i, item in enumerate(data):
|
|
cluster_idx = cluster_assignments[i]
|
|
for feature_idx, feature in enumerate(features):
|
|
new_centers[cluster_idx, feature_idx] += item[feature]
|
|
cluster_counts[cluster_idx] += 1
|
|
for cluster_idx in range(num_clusters):
|
|
new_centers[cluster_idx] /= cluster_counts[cluster_idx]
|
|
|
|
# Mengecek apakah pusat kluster sudah konvergen
|
|
if np.all(centers == new_centers):
|
|
break
|
|
|
|
centers = new_centers.flatten()
|
|
|
|
# Memperbarui nilai 'cluster' di setiap item data
|
|
for i, item in enumerate(data):
|
|
item['cluster'] = cluster_assignments[i]
|
|
|
|
# Menampilkan hasil kluster untuk setiap key 'cluster'
|
|
# for cluster_id in range(num_clusters):
|
|
# clustered_data = [item for item in data if item['cluster'] == cluster_id]
|
|
# print(f"Cluster {cluster_id}:")
|
|
# for item in clustered_data:
|
|
# print(item)
|
|
# print("\n")
|
|
|
|
# print (data)
|
|
return data
|
|
|
|
# print(cluster(datas))
|
|
|
|
|
|
def clustering(params) :
|
|
|
|
data = params #ambil array dari parameter
|
|
hasil_cluster = [] # untuk menampung hasil cluster sementara
|
|
|
|
num_cluster = 3 # tentukan jumlah cluster
|
|
|
|
# inisialisasi titik pusat cluster / centroid
|
|
centroids = {
|
|
'c1' : {'sold' : 25, 'rating_score': 4.7, 'count_review' : 25},
|
|
'c2' : {'sold' : 100, 'rating_score': 4.8, 'count_review' : 100},
|
|
'c3' : {'sold' : 250, 'rating_score': 4.9, 'count_review' : 250},
|
|
}
|
|
|
|
max_iteration = 20 # atur iterasi maksimal
|
|
|
|
for _ in range(max_iteration) :
|
|
# centroids = init_centroids
|
|
sum_c1 = {'sold' : 0, 'rating_score' : 0, 'count_review' : 0}
|
|
sum_c2 = {'sold' : 0, 'rating_score' : 0, 'count_review' : 0}
|
|
sum_c3 = {'sold' : 0, 'rating_score' : 0, 'count_review' : 0}
|
|
jml_c1 = 0
|
|
jml_c2 = 0
|
|
jml_c3 = 0
|
|
|
|
for i in range(len(data)) :
|
|
distance = []
|
|
# print(item)
|
|
c1 = math.sqrt((data[i]['sold'] - centroids['c1']['sold']) ** 2 + (data[i]['rating_score'] - centroids['c1']['rating_score']) ** 2 + (data[i]['count_review'] - centroids['c1']['count_review']) ** 2)
|
|
c2 = math.sqrt((data[i]['sold'] - centroids['c2']['sold']) ** 2 + (data[i]['rating_score'] - centroids['c2']['rating_score']) ** 2 + (data[i]['count_review'] - centroids['c2']['count_review']) ** 2)
|
|
c3 = math.sqrt((data[i]['sold'] - centroids['c3']['sold']) ** 2 + (data[i]['rating_score'] - centroids['c3']['rating_score']) ** 2 + (data[i]['count_review'] - centroids['c3']['count_review']) ** 2)
|
|
# print(c1, c2, c3)
|
|
if (c1 < c2 and c1 < c3) :
|
|
sum_c1['sold'] += data[i]['sold']
|
|
sum_c1['rating_score'] += data[i]['rating_score']
|
|
sum_c1['count_review'] += data[i]['count_review']
|
|
jml_c1 += 1
|
|
data[i]['cluster'] = 1
|
|
|
|
if (c2 < c1 and c2 < c3) :
|
|
sum_c2['sold'] += data[i]['sold']
|
|
sum_c2['rating_score'] += data[i]['rating_score']
|
|
sum_c2['count_review'] += data[i]['count_review']
|
|
jml_c2 += 1
|
|
data[i]['cluster'] = 2
|
|
|
|
if (c3 < c1 and c3 < c2) :
|
|
sum_c3['sold'] += data[i]['sold']
|
|
sum_c3['rating_score'] += data[i]['rating_score']
|
|
sum_c3['count_review'] += data[i]['count_review']
|
|
jml_c3 += 1
|
|
data[i]['cluster'] = 3
|
|
|
|
# print(data[i])
|
|
|
|
print(jml_c1, jml_c2, jml_c3)
|
|
|
|
# buat centroid baru
|
|
new_centroid = {
|
|
'c1' : {'sold' : round(sum_c1['sold'] / jml_c1, 2), 'rating_score': round(sum_c1['rating_score'] / jml_c1, 2), 'count_review' : round(sum_c1['count_review'] / jml_c1, 2)},
|
|
'c2' : {'sold' : round(sum_c2['sold'] / jml_c2, 2), 'rating_score': round(sum_c2['rating_score'] / jml_c2, 2), 'count_review' : round(sum_c2['count_review'] / jml_c2, 2)},
|
|
'c3' : {'sold' : round(sum_c3['sold'] / jml_c3, 2), 'rating_score': round(sum_c3['rating_score'] / jml_c3, 2), 'count_review' : round(sum_c3['count_review'] / jml_c3, 2)},
|
|
}
|
|
|
|
if (centroids == new_centroid) :
|
|
break
|
|
|
|
centroids = new_centroid
|
|
|
|
# print(centroids)
|
|
|
|
# print(data)
|
|
return data
|
|
|
|
print(clustering(datas)) |