Commit f6eeea9a authored by Taddeüs Kroes's avatar Taddeüs Kroes

StatRed ass4: Implemented part2 (k-means++).

parent 8abd27f0
from pylab import loadtxt, array, scatter, figure, show, mean, argmin, \ from pylab import loadtxt, array, scatter, figure, show, mean, argmin, \
ones, append, savefig append, savefig
from random import random, seed from random import random, seed, randint
from sys import argv, exit from sys import argv, exit
def init(X, k): def init(X, k):
return X[:k] """Simply use k random datapoints as initial means for the clusters."""
return X[[int(X.shape[0] * random()) for i in range(k)]]
def init_pp(X, k): def init_pp(X, k):
return X[:k] """Use the k-means++ algorithm to find initial means."""
# Choose first center at random
N = X.shape[0]
indices = [int(N * random())]
m = [X[indices[0]]]
D = [((x - m[0])**2).sum() for x in X]
while len(m) < k:
best_sum = new = -1
for i in range(N):
if i not in indices:
Dsum = sum([min(D[j], ((X[j] - X[i])**2).sum()) for j in range(N)])
if best_sum == -1 or best_sum < Dsum:
best_sum, new = Dsum, i
m.append(X[new])
indices.append(new)
D = [min(D[i], ((X[i] - X[new])**2).sum()) for i in range(N)]
return array(m)
if len(argv) == 3: # Parse parameters (600 is a nice example seed)
if argv[2] != 'pp': pp = False
print 'Usage: python %s K [ "pp" ]' % argv[0] if len(argv) > 2:
exit() if argv[2] == 'pp':
print 'Using k-means++' print 'Using k-means++'
initial_means = init_pp initial_means = init_pp
else: pp = True
else:
seed(int(argv[2]))
if len(argv) == 4:
seed(int(argv[3]))
elif len(argv) < 2:
print 'Usage: python %s K [ "pp" ] [ SEED ]' % argv[0]
exit()
if not pp:
print 'Using normal k-means' print 'Using normal k-means'
initial_means = init initial_means = init
k = int(argv[1]) k = int(argv[1])
if k < 1 or k > 6:
print 'K must be a value from 1-6'
exit()
# Generate dataset # Generate dataset, add a multiplication of k so that clusters are formed
seed(700)
n, N = 2, 100 n, N = 2, 100
X = array([[100 * random() for j in range(n)] for i in range(int(N / k + N % k))]) X = array([[100 * random() + 70 for j in range(n)] for i in \
range(int(N / k + N % k))])
for c in range(k - 1): for c in range(k - 1):
d = (k + 1) * 100 * random() d = (c + 2) * 70
X = append(X, [[100 * random() + d for j in range(n)] for i in \ X = append(X, [[100 * random() + d for j in range(n)] for i in \
range(int(N / k))], 0) range(int(N / k))], 0)
# Divide in clusters by applying k-means
M = initial_means(X, k) M = initial_means(X, k)
Mp = M - 1 Mp = M - 1
steps = 0 steps = 0
# Divide in clusters
while (Mp - M).any(): while (Mp - M).any():
Mp = M Mp = M
clusters = [[] for i in range(k)] clusters = [[] for i in range(k)]
...@@ -48,7 +76,7 @@ print 'Completed in %d steps' % steps ...@@ -48,7 +76,7 @@ print 'Completed in %d steps' % steps
# Plot clusters # Plot clusters
figure(1) figure(1)
colors = [[1,0,0], [0,1,0], [0,0,1]] colors = [[1,0,0], [0,1,0], [0,0,1], [0,0,0], [1,1,1], [.5,.5,.5]]
for i in range(k): for i in range(k):
c = array(clusters[i]) c = array(clusters[i])
scatter(c[:,0], c[:,1], c=colors[i]) scatter(c[:,0], c[:,1], c=colors[i])
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment