1 """
2 generate toy datasets
3 based on code by Mark Rogers
4 """
5
6 from math import sin, pi
7 from random import gauss
8 import numpy
9 import datafunc
10
11
12
13
def sineClass(xlim=(0, 1), ylim=(0, 1), n=20, sigma=0.04):
    """
    Generates a 2-D noisy sine wave

    Parameters:
      xlim - sequence of length 2 that delimits the x value range
      ylim - sequence of length 2 that delimits the y value range
      n - number of data points
      sigma - standard deviation of the Gaussian noise added
              independently to the x and y coordinate of every point

    Returns a list of n [x, y] pairs (an empty list when n <= 0).

    The noise-free x values start at min(xlim) and advance in n equal
    steps, so the last one stops one step short of max(xlim).  The
    noise-free y values oscillate between min(ylim) and max(ylim),
    completing one full period per unit of x.

    Note: for use with PyML demo2d, only use x and y values
    between -1 and 1
    """
    # Nothing to generate; also avoids dividing by zero below.
    if n <= 0:
        return []

    minx = min(xlim)
    dx = float(max(xlim) - minx) / n

    miny = min(ylim)
    # Amplitude is half the y range; the wave is centred on the middle of
    # ylim so that the noise-free curve stays inside [min(ylim), max(ylim)].
    gamma = float(max(ylim) - miny) / 2.0
    midy = miny + gamma

    points = []
    for i in range(n):
        xval = i * dx
        # x noise is drawn before y noise for every point.
        newx = minx + xval + gauss(0, sigma)
        newy = midy + gamma * sin(xval * pi * 2) + gauss(0, sigma)
        points.append([newx, newy])

    return points
37
def multivariate_normal(mu, sigma, n):
    """
    A wrapper around numpy's random.multivariate_normal function.
    Generates data from a Gaussian distribution with mean mu and a
    covariance matrix built from sigma.

    Parameters:
      mu - mean (a sequence; its length d is the dimension of the data)
      sigma - variance, given as one of
                * a scalar: the same variance on every axis
                * a sequence of length d: one variance per axis
                * a d x d matrix: used directly as the covariance
      n - number of points to generate

    Returns whatever numpy.random.multivariate_normal(mu, cov, n)
    returns (an n x d array of samples).

    Raises ValueError if sigma cannot be turned into a d x d matrix.

    Note: for use with PyML demo2d, only use mu values that keep
    populations between -1 and 1
    """
    dim = len(mu)

    # asarray + ndim handles Python numbers and numpy scalars alike;
    # an exact type() comparison would miss the numpy scalar types.
    cov = numpy.asarray(sigma, dtype=float)
    if cov.ndim == 0:
        cov = numpy.diag([float(cov)] * dim)
    elif cov.ndim == 1:
        cov = numpy.diag(cov)

    # Explicit check instead of assert, which is stripped under -O.
    if cov.shape != (dim, dim):
        raise ValueError(
            "sigma must be a scalar, a length-%d sequence or a %dx%d "
            "matrix; got shape %s" % (dim, dim, dim, cov.shape))

    return numpy.random.multivariate_normal(mu, cov, n)
63
65
66 numClasses = len(mu)
67 if len(sigma) == 1 :
68 sigma = [sigma for i in range(numClasses)]
69 if len(n) == 1 :
70 n = [n for i in range(numClasses)]
71
72 Y = []
73 for i in range(numClasses) :
74 Y.extend([str(i) for j in range(n[i])])
75
76 X = []
77 for i in range(numClasses) :
78 print mu[i], sigma[i], n[i]
79 X.extend(multivariate_normal(mu[i], sigma[i], n[i]).tolist())
80
81 return datafunc.VectorDataSet(X, L = Y)
82
def noisyData():
    """
    Creates two populations, usually linearly-separable, but with
    vastly different variance.  Simulates a problem where one
    population has significantly more noise than another.  Data are
    printed to standard output in a CSV format suitable for creating
    a PyML VectorDataSet (labelsColumn=1): one line per point holding
    a running id, the label (-1 or 1), x and y.
    """
    # (label, first gaussCloud argument, sigma) for each population.
    populations = ((-1, -0.5, 0.05), (1, 0.3, 0.25))

    pid = 0
    for label, centre, spread in populations:
        # NOTE(review): gaussCloud is not defined in the visible part of
        # this file; it is assumed to return two parallel sequences of
        # x and y values -- confirm it is available at run time.
        X, Y = gaussCloud(centre, 0.0, sigma=spread, n=20)
        for x, y in zip(X, Y):
            pid += 1
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("%(p)d,%(l)d,%(x)f,%(y)f" % {'p': pid, 'l': label, 'x': x, 'y': y})
100
102 """
Uses sine-wave populations to create two class populations that
meander close to each other. The points are returned as a
datafunc.VectorDataSet whose labels are the strings '-1' and '1'.
106 """
107 pid = 0
108 lim = 0.8
109 X = []
110 Y = []
111 for label in [-1,1] :
112 if label > 0 :
113 X.extend(sineClass([-lim,lim], [0, 0.6], n))
114 else :
115 X.extend(sineClass([-lim,lim], [-0.4, 0.2], n))
116 Y.extend([str(label) for i in range(n)])
117
118 return datafunc.VectorDataSet(X, L = Y)
119
def separableData():
    """
    Creates two linearly-separable populations, one centered
    at (-.5,0) and the other at (0.5,0).  Data are printed to
    standard output in a CSV format suitable for creating a PyML
    VectorDataSet (labelsColumn=1): one line per point holding a
    running id, the label (-1 or 1), x and y.
    """
    # (label, first gaussCloud argument) for each population; both
    # populations share the same sigma.
    populations = ((-1, -0.5), (1, 0.5))

    pid = 0
    for label, centre in populations:
        # NOTE(review): gaussCloud is not defined in the visible part of
        # this file; it is assumed to return two parallel sequences of
        # x and y values -- confirm it is available at run time.
        X, Y = gaussCloud(centre, 0.0, sigma=0.2, n=20)
        for x, y in zip(X, Y):
            pid += 1
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("%(p)d,%(l)d,%(x)f,%(y)f" % {'p': pid, 'l': label, 'x': x, 'y': y})
136
137
138 USAGE = """
139 Usage: python generate.py type
140 Where 'type' is one of:
141 l - two similar, linearly-separable populations
142 n - two linearly-separable populations, one with more
143 noise than the other
144 s - two populations generated by sine waves (with some noise)
145 """
146
# Command-line entry point: pick a generator from the single argument.
if __name__ == '__main__':
    import sys

    if len(sys.argv) != 2:
        print(USAGE)
        sys.exit(1)

    # Called `kind` rather than `type` so the builtin type() is not
    # shadowed at module scope for code that runs afterwards.
    kind = sys.argv[1]
    if kind == 'l':
        separableData()
    elif kind == 'n':
        noisyData()
    elif kind == 's':
        # NOTE(review): the return value of curvyData() is discarded here,
        # so this branch may produce no output -- confirm that is intended.
        curvyData()
    else:
        print("Unrecognized data generation type: %s" % kind)
        # Non-zero status, matching the wrong-argument-count path above.
        sys.exit(1)
161