This is the function I use for clustering with k-medoids:
def kMedoids(D, k, tmax=100):
    """Cluster points into ``k`` groups with the k-medoids algorithm.

    The algorithm alternates between assigning every point to its nearest
    medoid and moving each medoid to the cluster member with the smallest
    mean distance to the other members, until the medoids stop changing or
    ``tmax`` iterations have been performed.

    Parameters
    ----------
    D : numpy.ndarray
        Square 2-D matrix of pairwise distances; ``D[i, j]`` is the distance
        between points ``i`` and ``j``.
    k : int
        Number of clusters (and medoids) to find, ``1 <= k <= n``.
    tmax : int, optional
        Maximum number of assign/update iterations.

    Returns
    -------
    M : numpy.ndarray
        Indices (into ``D``) of the ``k`` medoids.
    C : dict
        Maps each cluster label ``0 .. k-1`` to an array with the indices of
        the points assigned to that cluster.

    Raises
    ------
    ValueError
        If ``D`` is not square or ``k`` is outside ``1 .. n``.
    """
    m, n = D.shape
    if m != n:
        raise ValueError('D must be a square distance matrix')
    if not 1 <= k <= n:
        raise ValueError('k must be between 1 and the number of points')

    # Draw k *distinct* starting medoids. Sampling with replacement could
    # pick the same point twice, which leaves one cluster empty.
    M = np.sort(np.random.choice(n, k, replace=False))
    Mnew = np.copy(M)

    C = {}
    for _ in range(tmax):
        C = _assign_clusters(D, M, k)

        # Move each medoid to the member that minimises the mean distance
        # to all other members of its cluster.
        for kappa, members in C.items():
            if members.size == 0:
                # Ties at distance zero can still empty a cluster; keep its
                # current medoid (Mnew[kappa] == M[kappa] at this point).
                continue
            mean_dist = np.mean(D[np.ix_(members, members)], axis=1)
            Mnew[kappa] = members[np.argmin(mean_dist)]

        # Converged: the medoids did not move, so C already matches M.
        if np.array_equal(M, Mnew):
            break
        M = np.copy(Mnew)
    else:
        # Iteration budget exhausted: M was updated after the last
        # assignment, so refresh the memberships once more.
        C = _assign_clusters(D, M, k)

    return M, C


def _assign_clusters(D, M, k):
    """Map each cluster label to the indices of the points nearest its medoid."""
    # Column j of D[:, M] holds every point's distance to medoid M[j];
    # argmin resolves ties in favour of the lowest label.
    nearest = np.argmin(D[:, M], axis=1)
    return {kappa: np.where(nearest == kappa)[0] for kappa in range(k)}
Then I call the kMedoids function with the program below. I think my program runs slowly at the line D = pairwise_distances(arraydata, metric='euclidean')
# Full pairwise Euclidean distance matrix between all rows of arraydata.
# NOTE(review): this is a dense n_samples x n_samples array, so its memory
# use grows quadratically with the number of rows - the likely source of
# the reported MemoryError on large inputs.
D = pairwise_distances(arraydata, metric='euclidean')

# split into 2 clusters
M, C = kMedoids(D, 2)

print('medoids:')
for point_idx in M:
    print(arraydata[point_idx])
print('')

# cluster label and data index of every point, in the order they are printed
temp = []
indeks = []
print('clustering result:')
for label, members in C.items():
    for point_idx in members:
        print('label {0}: {1}'.format(label, arraydata[point_idx]))
        temp.append(label)
        indeks.append(point_idx)
This is the output of the program:
clustering result:
label 0: [0.00000000e+00 0.00000000e+00 1.00000000e+00 1.00000000e+00
1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
1.00000000e+00 0.00000000e+00 1.00000000e+00 1.00000000e+00
Why is my program slow for large data, and why does it almost always end with a "Memory Error"? I hope someone can review this code and help me improve its performance so that it can produce a result when processing large amounts of data.
`return M, C` looks misindented. Please double-check your indentation.