Commit d7f99ea3 authored by Taddeüs Kroes

StatRed: Updated assignment 1.22 and added some comments.

parent ba3207f2
.PHONY: all clean
all:
clean:
	rm -vf *.pyc q*.pdf
 from pylab import array, eig, diagflat, dot, sqrt, randn, tile, \
-    plot, subplot, axis, figure, clf, savefig
+    plot, subplot, axis, figure, clf, show

 # The used mu (mean vector) and cov (covariance matrix).
 mu = array([[3],
@@ -18,7 +18,8 @@ cov = array(
 samples = 1000
 vector_size = 4

-def dataset():
+def dataset(samples):
+    """Generate a dataset, consisting of a specified number of random vectors."""
     # The covariance matrix is used to transform the generated dataset into a
     # multivariate normal distribution dataset.
     d, U = eig(cov)
@@ -31,12 +32,12 @@ if __name__ == '__main__':
     # Create a n*n grid of subplots and generate a new dataset.
     figure(vector_size**2)
     clf()
-    Y = dataset()
+    Y = dataset(samples)

     for i in range(vector_size):
         for j in range(vector_size):
             # Skip the diagonal subplots since those are irrelevant.
             if i != j:
-                subplot(vector_size, vector_size, (i+1) + j*vector_size)
+                subplot(vector_size, vector_size, (i + 1) + j * vector_size)
                 plot(Y[i], Y[j], 'x')
                 axis('equal')

-    savefig('q21.pdf')
+    show()
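
The body of dataset() is largely elided by the hunk, but the imports (eig, diagflat, sqrt, randn, tile) together with the comment point at the usual coloring transform for drawing multivariate normal samples. A minimal sketch of what the elided body presumably does; the variable names below are assumptions, not the committed code:

# Sketch only: plausible reconstruction of the elided dataset() body.
from pylab import eig, diagflat, dot, sqrt, randn, tile

def dataset(samples):
    """Draw `samples' random vectors with mean mu and covariance cov."""
    # cov = U * diag(d) * U^T, so A = U * diag(sqrt(d)) satisfies
    # A * A^T = cov (assuming cov is positive semi-definite).
    d, U = eig(cov)
    A = dot(U, diagflat(sqrt(d)))
    # Color standard-normal noise with A, then shift it by the mean.
    return dot(A, randn(len(mu), samples)) + tile(mu, samples)

Each column of the result is then one sample vector, which matches the row-wise indexing Y[i], Y[j] in the plotting loop.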
-from q21_multivariate import dataset
+from sys import argv, exit
+from q21_multivariate import mu, cov, dataset
 from pylab import array, mean, tile, newaxis, dot, eigvals, \
-    axis, figure, clf, show, plot
+    axis, figure, clf, show, plot, sum

-def eigenvalues(n):
-    """Return eigenvalues of unbiased estimators for the covariance matrix
-    Sigma (based on a pseudo-random generated dataset)."""
-    Y = array([mean(dataset(), 1) for i in range(n)]).T
+if len(argv) == 3:
+    step = int(argv[2])
+elif len(argv) == 2:
+    step = 100
+else:
+    print 'Usage: python %s SAMPLES [ STEP_SIZE ]' % (argv[0])
+    exit()
+
+# `samples' is the size of the generated dataset.
+samples = int(argv[1])
+Y = dataset(samples)
+
+def estimate(n):
+    """Return unbiased estimates of the mean vector mu and the covariance
+    matrix Sigma, based on the first n vectors of the generated dataset."""
     # Sigma = 1 / (n - 1) * Sum for i=1 to n: (x_i - x_mean) T(x_i - x_mean),
     # where T(x) is the transpose of `x'. Mu = x_mean and
     # Yzm = Sum for i=1 to n: x_i - x_mean.
-    mu = mean(Y, 1)
-    Yzm = Y - tile(mu[:,newaxis], n)
-    S = dot(Yzm, Yzm.T) / (n - 1)
-    return eigvals(S)
+    sliced = [Y[i][:n] for i in range(len(Y))]
+    est_mu = mean(sliced, 1)
+    Yzm = sliced - tile(est_mu[:,newaxis], n)
+    est_cov = dot(Yzm, Yzm.T) / (n - 1)
+    return (est_mu, est_cov)

 figure(1)
 clf()
-max_range = 10000
-# We use steps of 100 for the number of used samples.
-samples = range(2, max_range, 500)
-data = [[] for i in range(4)]
-for n in samples:
-    e = eigenvalues(n)
-    for i in range(4):
-        data[i].append(e[i])
-for i in range(4):
-    plot(samples, data[i], 'x')
-axis([0, max_range, 0., 0.025])
+
+# Part 1 - Estimate mu and cov, experiment with various sizes of datasets.
+X = range(step, samples + 1, step)
+diff_mu = []
+diff_cov = []
+estimated_mu = []
+for n in X:
+    est_mu, est_cov = estimate(n)
+    diff_mu.append(abs(sum(est_mu - mu)))
+    diff_cov.append(abs(sum(est_cov - cov)))
+    estimated_mu.append(est_mu)
+plot(X, diff_mu)
+plot(X, diff_cov)
+
+# Observe in the following graph that the accuracy increases when more
+# vectors from the generated dataset are used. There are some fluctuations
+# due to the randomness of the dataset, but the difference lines should
+# converge to zero.
 show()
+
+# Part 2 - Calculate the covariance of the estimated mu.
+sample_count = len(estimated_mu)
+estimated_mu = array(estimated_mu).T
+m = mean(estimated_mu, 1)
+Yzm = estimated_mu - tile(m[:,newaxis], sample_count)
+S = dot(Yzm, Yzm.T) / (sample_count - 1)
+# When many samples are used to calculate the estimators, the estimators
+# will converge to the original values. The more samples are used each
+# time, the more convergence will occur, which will lead to smaller
+# eigenvalues of the covariance matrix (printed below). For example, call
+# the program with 1000 and 10000 samples and you will see that the
+# eigenvalues are smaller with 10000 samples.
+print eigvals(S)
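
Per the usage string, the script takes SAMPLES and an optional STEP_SIZE on the command line (for instance 10000 and 500). The hand-written estimator in estimate() follows the standard unbiased formula Sigma = 1/(n-1) * Yzm * Yzm^T; a quick way to check the slicing and normalization is to compare against numpy.cov, which uses the same 1/(n-1) convention. The snippet below is an illustration only, not part of the commit:

# Illustration: the manual estimator agrees with numpy.cov (ddof = 1).
from numpy import dot, newaxis, allclose, cov
from numpy.random import randn

X = randn(4, 500)                  # 500 toy sample vectors (arbitrary)
Xzm = X - X.mean(1)[:, newaxis]    # subtract the per-row sample mean
S = dot(Xzm, Xzm.T) / (500 - 1)    # Sigma = 1/(n-1) * Yzm * Yzm^T
assert allclose(S, cov(X))         # numpy.cov also divides by n - 1

As for Part 2: the sample mean of n independent vectors has covariance Sigma/n, so the eigenvalues printed at the end should indeed shrink as the dataset grows, which is what the closing comment predicts.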
-from pylab import loadtxt, figure, plot, subplot, axis, clf, savefig
+from pylab import loadtxt, figure, plot, subplot, axis, clf, show

 # The last column of the data sets is a label, which is used to distinguish the
 # three groups of data in the data sets. This label should be translated to a
@@ -28,4 +28,8 @@ for i in range(4):
         for c in range(3):
             tmp = zip(*graph_data[i + j*4][c])
             plot(tmp[0], tmp[1], 'x' + colors[c])

-savefig('q23.pdf')
+# In the following plot, we can see that the colored areas do not overlap
+# in the same place in all subplots. Thus, using all subplots together, we
+# could probably classify a given data point as one of the iris classes
+# (except for some points in the blue/green areas, which overlap strongly
+# in one small region).
+show()
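
Most of this file is hidden by the hunk; the visible comment only says that the textual label in the last column "should be translated to a" number. One plausible way to do that with loadtxt converters is sketched below; the file name, column index, and label strings are assumptions patterned on the UCI iris data, not the committed code:

# Sketch only: hypothetical label-to-color translation for the iris data.
from pylab import loadtxt

# Map each class label to an index into the plot colors.
labels = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
colors = ['r', 'g', 'b']

# Translate the textual label (column 4) to its numeric index so that
# loadtxt yields a purely numeric array; the index later picks a color.
data = loadtxt('iris.data', delimiter=',',
               converters={4: lambda s: labels[s.strip()]})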