Commit d7f99ea3 authored May 08, 2011 by Taddeüs Kroes
StatRed: Updated assignment 1.22 and added some comments.
parent ba3207f2

Showing 4 changed files with 64 additions and 35 deletions
statred/ass1/Makefile (+0 -6)
statred/ass1/q21_multivariate.py (+6 -5)
statred/ass1/q22_estimate.py (+52 -22)
statred/ass1/q23_iris.py (+6 -2)
statred/ass1/Makefile deleted (100644 → 0)

-.PHONY: all clean
-
-all:
-
-clean:
-	rm -vf *.pyc q*.pdf
statred/ass1/q21_multivariate.py

 from pylab import array, eig, diagflat, dot, sqrt, randn, tile, \
-    plot, subplot, axis, figure, clf, savefig
+    plot, subplot, axis, figure, clf, show

 # The used mu (mean vector) and cov (covariance matrix).
 mu = array([[3],
...
@@ -18,7 +18,8 @@ cov = array(
 samples = 1000
 vector_size = 4

-def dataset():
+def dataset(samples):
+    """Generate a dataset, consisting of a specified number of random vectors."""
     # The covariance matrix is used to transform the generated dataset into a
     # multivariate normal distribution dataset.
     d, U = eig(cov)
...
@@ -31,12 +32,12 @@ if __name__ == '__main__':
     # Create a n*n grid of subplots and generate a new dataset.
     figure(vector_size ** 2)
     clf()
-    Y = dataset()
+    Y = dataset(samples)
     for i in range(vector_size):
         for j in range(vector_size):
             # Skip the diagonal subplots since those are irrelevant.
             if i != j:
-                subplot(vector_size, vector_size, (i + 1) + j * vector_size)
+                subplot(vector_size, vector_size, (i + 1) + j * vector_size)
                 plot(Y[i], Y[j], 'x')
     axis('equal')
-    savefig('q21.pdf')
+    show()
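The `dataset()` function above draws multivariate normal samples by transforming i.i.d. standard normal vectors with the eigendecomposition of the covariance matrix (the `d, U = eig(cov)` step). A minimal modern sketch of that technique, using plain NumPy instead of pylab; the 2-D `mu` and `cov` here are illustrative stand-ins, not the truncated values from the file:

```python
import numpy as np

def dataset(samples, mu, cov):
    """Draw `samples` column vectors from N(mu, cov) via eigendecomposition.

    If cov = U diag(d) U^T, then x = mu + U diag(sqrt(d)) z has
    covariance cov when z is a standard normal vector.
    """
    d, U = np.linalg.eigh(cov)               # eigenvalues d, eigenvectors U
    A = U @ np.diag(np.sqrt(d))              # maps unit normals to N(0, cov)
    z = np.random.randn(len(mu), samples)    # i.i.d. N(0, 1) columns
    return A @ z + mu[:, np.newaxis]         # shape: (dim, samples)

mu = np.array([3.0, 1.0])                    # illustrative parameters
cov = np.array([[2.0, 0.5],
                [0.5, 1.0]])
Y = dataset(100000, mu, cov)
```

With 100000 samples, the sample mean and covariance of `Y` recover `mu` and `cov` to within a couple of decimal places, which is exactly what assignment 1.22 below measures.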
statred/ass1/q22_estimate.py

-from q21_multivariate import dataset
+from sys import argv, exit
+from q21_multivariate import mu, cov, dataset
 from pylab import array, mean, tile, newaxis, dot, eigvals, \
-    axis, figure, clf, show, plot
+    axis, figure, clf, show, plot, sum

-def eigenvalues(n):
-    """Return eigenvalues of unbiased estimators for the covariance matrix
-    Sigma (based on a pseudo-random generated dataset)."""
-    Y = array([mean(dataset(), 1) for i in range(n)]).T
-    mu = mean(Y, 1)
-    Yzm = Y - tile(mu[:, newaxis], n)
-    S = dot(Yzm, Yzm.T) / (n - 1)
-    return eigvals(S)
+if len(argv) == 3:
+    step = int(argv[2])
+elif len(argv) == 2:
+    step = 100
+else:
+    print 'Usage: python %s SAMPLES [ STEP_SIZE ]' % (argv[0])
+    exit()
+
+# `samples' is the size of the generated dataset.
+samples = int(argv[1])
+Y = dataset(samples)
+
+def estimate(n):
+    """Return eigenvalues of unbiased estimators for the covariance matrix
+    Sigma (based on a pseudo-random generated dataset)."""
+    # Sigma = 1 / (n - 1) * Sum for i=1 to n: (x_i - x_mean) T(x_i - x_mean),
+    # where T(x) is the transpose of `x'. Mu = x_mean and
+    # Yzm = Sum for i=1 to n: x_i - x_mean.
+    sliced = [Y[i][:n] for i in range(len(Y))]
+    est_mu = mean(sliced, 1)
+    Yzm = sliced - tile(est_mu[:, newaxis], n)
+    est_cov = dot(Yzm, Yzm.T) / (n - 1)
+    return (est_mu, est_cov)

-figure(1)
-clf()
-max_range = 10000
-samples = range(2, max_range, 500)
-data = [[] for i in range(4)]
-for n in samples:
-    e = eigenvalues(n)
-    for i in range(4):
-        data[i].append(e[i])
-for i in range(4):
-    plot(samples, data[i], 'x')
-axis([0, max_range, 0., 0.025])
+# Part 1 - Estimate mu and cov, experiment with various sizes of datasets.
+# We use steps of 100 for the number of used samples.
+X = range(step, samples + 1, step)
+diff_mu = []
+diff_cov = []
+estimated_mu = []
+for n in X:
+    est_mu, est_cov = estimate(n)
+    diff_mu.append(abs(sum(est_mu - mu)))
+    diff_cov.append(abs(sum(est_cov - cov)))
+    estimated_mu.append(est_mu)
+plot(X, diff_mu)
+plot(X, diff_cov)
+# Observe in the following graph that the accuracy increases when more
+# vectors from the generated dataset are used. There are some fluctuations due
+# to the randomness of the dataset, but the difference lines should converge to
+# zero.
 show()

+# Part 2 - Calculate covariance of estimated mu.
+sample_count = len(estimated_mu)
+estimated_mu = array(estimated_mu).T
+m = mean(estimated_mu, 1)
+Yzm = estimated_mu - tile(m[:, newaxis], sample_count)
+S = dot(Yzm, Yzm.T) / (samples - 1)
+# When many samples are used to calculate the estimators, the estimators will
+# converge to the original values. The more samples are used each time, the more
+# convergence will occur, which will lead to smaller eigenvalues of the
+# covariance matrix (printed below). For example, call the program with 1000 and
+# 10000 samples and you will see that the eigenvalues will be smaller with 10000
+# samples.
+print eigvals(S)
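The new `estimate(n)` implements the unbiased sample estimators over the first n vectors of the dataset: the sample mean, and Sigma_hat = 1/(n-1) * Sum_i (x_i - x_mean)(x_i - x_mean)^T. A sketch of the same computation in current NumPy; the 2-D `mu` and `cov` are illustrative stand-ins for the values imported from `q21_multivariate`:

```python
import numpy as np

def estimate(Y, n):
    """Unbiased mean/covariance estimators from the first n columns of Y."""
    sliced = Y[:, :n]                             # use only the first n vectors
    est_mu = sliced.mean(axis=1)
    Yzm = sliced - est_mu[:, np.newaxis]          # zero-mean data
    est_cov = (Yzm @ Yzm.T) / (n - 1)             # unbiased: divide by n - 1
    return est_mu, est_cov

mu = np.array([3.0, 1.0])                         # illustrative parameters
cov = np.array([[2.0, 0.5],
                [0.5, 1.0]])
rng = np.random.default_rng(0)
Y = rng.multivariate_normal(mu, cov, size=50000).T  # shape (dim, samples)

est_mu, est_cov = estimate(Y, 50000)
```

As the file's comments argue, `est_mu` and `est_cov` drift toward the true `mu` and `cov` as n grows, so the difference curves plotted in Part 1 should trend toward zero.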
statred/ass1/q23_iris.py

-from pylab import loadtxt, figure, plot, subplot, axis, clf, savefig
+from pylab import loadtxt, figure, plot, subplot, axis, clf, show

 # The last column of the data sets is a label, which is used to distinguish the
 # three groups of data in the data sets. This label should be translated to a
...
@@ -28,4 +28,8 @@ for i in range(4):
     for c in range(3):
         tmp = zip(*graph_data[i + j * 4][c])
         plot(tmp[0], tmp[1], 'x' + colors[c])
-savefig('q23.pdf')
+# In the following plot, we can see that the colored areas do not overlap in
+# the same place in all subplots. Thus, using all plots, we could probably
+# classify a given data point to one of the iris classes (except for some points
+# in the blue/green areas, which overlap considerably in a small area).
+show()
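The plotting loop transposes each class's list of (x, y) points with `zip(*...)` so that `plot()` receives a separate x sequence and y sequence. The trick in isolation, with made-up points (in Python 3, `zip` returns an iterator, so the result is materialized here by tuple unpacking):

```python
# zip(*points) transposes a list of (x, y) pairs into an x-tuple and a
# y-tuple, which is the form matplotlib's plot() expects.
points = [(5.1, 3.5), (4.9, 3.0), (4.7, 3.2)]  # illustrative (x, y) pairs
xs, ys = zip(*points)                          # xs = x values, ys = y values
```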