Package PyML :: Package datagen :: Module toydata
[frames | no frames]

Source Code for Module PyML.datagen.toydata

  1  """
 
  2  generate toy datasets
 
  3  based on code by Mark Rogers
 
  4  """ 
  5  
 
  6  from math   import sin, pi 
  7  from random import gauss 
  8  import numpy 
  9  import datafunc 
 10  
 
 11  ##
 
 12  # Class generators:
 
 13  ##
 
def sineClass(xlim=(0, 1), ylim=(0, 1), n=20, sigma=0.04):
    """
    Generates a 2-D noisy sine wave

    Parameters:
    xlim - sequence of length 2 that delimits the x value range
    ylim - sequence of length 2 that sets the vertical placement:
           the wave is centred on min(ylim) and its amplitude is
           half of max(ylim) - min(ylim)
    n - number of data points
    sigma - standard deviation of the Gaussian noise that is added
            independently to the x and y value of every point

    Returns a list of n [x, y] pairs (an empty list when n <= 0).

    Note: for use with PyML demo2d, only use x and y values
    between -1 and 1
    """
    if n <= 0:
        # Nothing to generate; this also avoids dividing by zero below.
        return []

    minx = min(xlim)
    miny = min(ylim)
    dx = float(max(xlim) - minx) / n
    gamma = float(max(ylim) - miny) / 2.0

    X = []
    for i in range(n):
        # The phase depends on the offset from min(xlim), so the wave
        # completes one full period per unit of x.
        xval = i * dx
        # Draw the x noise before the y noise so that seeded runs produce
        # the same sequence of points as before.
        newx = minx + xval + gauss(0, sigma)
        newy = miny + gamma * sin(xval * pi * 2) + gauss(0, sigma)
        X.append([newx, newy])

    return X
37
def multivariate_normal(mu, sigma=0.1, n=20):
    """
    a wrapper around numpy's random.multivariate_normal function
    Generates data from a Gaussian distribution with mean mu
    and covariance derived from sigma

    Parameters:
    mu - mean (a sequence with one entry per dimension)
    sigma - variance, given as one of:
            a scalar: the same variance for every dimension
            a 1-D sequence: the per-dimension variances
            a square matrix: the full covariance matrix
    n - number of points to generate

    Returns the array produced by numpy.random.multivariate_normal
    (one row per generated point).

    Raises ValueError if sigma has more than one dimension and is not
    a square matrix.

    Note: for use with PyML demo2d, only use mu1 and mu2
    values that keep populations between -1 and 1
    """
    dim = len(mu)
    if numpy.ndim(sigma) == 0:
        # Covers Python ints/floats as well as numpy scalars and 0-d arrays.
        cov = numpy.diag([float(sigma)] * dim)
    else:
        cov = numpy.array(sigma)
        if cov.ndim == 1:
            cov = numpy.diag(cov)
        elif cov.ndim != 2 or cov.shape[0] != cov.shape[1]:
            # Raised explicitly because an assert disappears under "python -O".
            raise ValueError(
                "sigma must be a scalar, a 1-D sequence or a square matrix; "
                "got an array of shape %s" % (cov.shape,))

    return numpy.random.multivariate_normal(mu, cov, n)
63
def gaussianData(mu, sigma, n):
    """
    Generates a labeled dataset with one Gaussian population per class

    Parameters:
    mu - sequence holding one mean vector per class
    sigma - variance specification(s) as accepted by multivariate_normal;
            a single value (or a sequence of length 1) is shared by all
            classes, otherwise one entry per class is used
    n - number of points per class; a single integer (or a sequence of
        length 1) is shared by all classes, otherwise one entry per
        class is used

    Returns a datafunc.VectorDataSet built from the generated points;
    the points of class i carry the label str(i).
    """
    numClasses = len(mu)
    sigma = _perClass(sigma, numClasses)
    n = _perClass(n, numClasses)

    X = []
    Y = []
    for i in range(numClasses):
        X.extend(multivariate_normal(mu[i], sigma[i], n[i]).tolist())
        Y.extend([str(i)] * n[i])

    return datafunc.VectorDataSet(X, L=Y)


def _perClass(value, numClasses):
    """
    Expands a scalar or a length-1 sequence to one entry per class;
    any other sequence is returned unchanged.
    """
    try:
        count = len(value)
    except TypeError:
        # A plain scalar has no length: share it between all classes.
        return [value] * numClasses
    if count == 1:
        # Share the single element itself, not the one-element sequence.
        return [value[0]] * numClasses
    return value
82
def noisyData():
    """
    Creates two populations, usually linearly-separable, but with
    vastly different variance.  Simulates a problem where one
    population has significantly more noise than another.

    One line per point is printed in the CSV form "id,label,x,y",
    suitable for creating a PyML VectorDataSet (labelsColumn=1).
    Ids start at 1 and keep counting across both populations.
    """
    # NOTE(review): gaussCloud is not defined in the visible part of this
    # module -- confirm that it is provided elsewhere.  Presumably its
    # first two arguments are the x and y coordinates of the centre.
    # Each entry is (label, first gaussCloud argument, sigma).
    populations = [(-1, -0.5, 0.05), (1, 0.3, 0.25)]

    pid = 0
    for label, centre, spread in populations:
        X, Y = gaussCloud(centre, 0.0, sigma=spread, n=20)
        for i, xval in enumerate(X):
            pid += 1
            row = {'p': pid, 'l': label, 'x': xval, 'y': Y[i]}
            print("%(p)d,%(l)d,%(x)f,%(y)f" % row)
100
def sineData(n = 30) :
    """
    Uses sine-wave populations to create two class populations that
    meander close to each other.

    Parameters:
    n - number of points generated for each of the two classes

    Returns a datafunc.VectorDataSet holding the points of class "-1"
    followed by the points of class "1".
    """
    lim = 0.8
    # (label, ylim handed to sineClass), in the order the classes are stored
    bands = [(-1, [-0.4, 0.2]), (1, [0, 0.6])]

    X = []
    Y = []
    for label, ylim in bands :
        X.extend(sineClass([-lim, lim], ylim, n))
        Y.extend([str(label)] * n)

    return datafunc.VectorDataSet(X, L = Y)
119
def separableData():
    """
    Creates two linearly-separable populations, one centered
    at (-.5,0) and the other at (0.5,0).

    One line per point is printed in the CSV form "id,label,x,y",
    suitable for creating a PyML VectorDataSet (labelsColumn=1).
    Ids start at 1 and keep counting across both populations.
    """
    # NOTE(review): gaussCloud is not defined in the visible part of this
    # module -- confirm that it is provided elsewhere.
    # Each entry is (label, first gaussCloud argument).
    populations = [(-1, -0.5), (1, 0.5)]

    pid = 0
    for label, centre in populations:
        X, Y = gaussCloud(centre, 0.0, sigma=0.2, n=20)
        for i, xval in enumerate(X):
            pid += 1
            row = {'p': pid, 'l': label, 'x': xval, 'y': Y[i]}
            print("%(p)d,%(l)d,%(x)f,%(y)f" % row)
## Main:
# Command-line entry point: a one-letter argument selects the generator.
USAGE = """
Usage: python generate.py type
Where 'type' is one of:
   l - two similar, linearly-separable populations
   n - two linearly-separable populations, one with more
       noise than the other
   s - two populations generated by sine waves (with some noise)
"""

if __name__ == '__main__' :
    import sys
    if len(sys.argv) != 2 :
        print(USAGE)
        sys.exit(1)
    # Named "kind" so that the builtin type() is not shadowed.
    kind = sys.argv[1]
    if kind == 'l' :
        separableData()
    elif kind == 'n' :
        noisyData()
    elif kind == 's' :
        # The sine-wave generator in this module is sineData; the name
        # called here before (curvyData) is not defined anywhere in it.
        # NOTE(review): sineData returns the dataset instead of printing
        # CSV, so nothing is written for this option -- confirm the
        # intended output.
        sineData()
    else :
        print("Unrecognized data generation type: %s" % kind)
        # Report the failure to the calling shell as well.
        sys.exit(1)