Package PyML :: Package containers :: Module baseDatasets
[frames] | [no frames]

Source Code for Module PyML.containers.baseDatasets

  1  import copy 
  2  import numpy 
  3   
  4  from PyML.containers import ker 
  5  from PyML.containers import parsers 
  6  from PyML.utils import misc 
  7  from PyML.containers.labels import Labels 
  8   
class BaseDataSet (object) :
    """
    A base class for PyML dataset containers.

    Provides the training/testing hooks, registered-attribute machinery and
    the generic copy constructor shared by the concrete containers.
    """

    type = 'dataset'
    isVector = False    # whether the dataset is Euclidean (vector) data

    def __init__(self) :

        self.isTrained = False
        self.isTested = False

    def setTrainingFunc(self, func) :
        """set the function invoked by train(); called as func(self, **args)"""

        assert func is None or type(func).__name__ == 'function'
        self._trainingFunc = func

    def getTrainingFunc(self) :
        """return the training function, or None if none was set"""

        return getattr(self, '_trainingFunc', None)

    trainingFunc = property(getTrainingFunc, setTrainingFunc,
                            None, '_trainingFunc')

    def setTestingFunc(self, func) :
        """set the function invoked by test(); called as func(self, trainingData, **args)"""

        assert func is None or type(func).__name__ == 'function'
        self._testingFunc = func

    def getTestingFunc(self) :
        """return the testing function, or None if none was set"""

        return getattr(self, '_testingFunc', None)

    testingFunc = property(getTestingFunc, setTestingFunc,
                           None, '_testingFunc')

    def train(self, **args) :
        """run the attached training function once; a second call is a no-op"""

        if self.trainingFunc is not None and not self.isTrained :
            self.trainingFunc(self, **args)
            self.isTrained = True

    def test(self, trainingData, **args) :
        """run the attached testing function once; a second call is a no-op"""

        if self.testingFunc is not None and not self.isTested :
            self.testingFunc(self, trainingData, **args)
            self.isTested = True

    def registerAttribute(self, attributeName, attributeValue = None, action = None) :
        """register an attribute so that copyConstruct propagates it;
        optionally set its value and associate an action with it"""

        if not hasattr(self, '_registeredAttributes') :
            self._registeredAttributes = [attributeName]
        else :
            self._registeredAttributes.append(attributeName)
        if attributeValue is not None :
            setattr(self, attributeName, attributeValue)
        if not hasattr(self, '_actions') : self._actions = {}
        self._actions[attributeName] = action

    def copyConstruct(self, other, **args) :
        """copy construct from another dataset.

        :Keywords:
          - `patterns` - indices (or pattern IDs, given as strings) to copy
          - `classes` - copy only patterns whose class label is in this list
          - `classID` - copy only patterns whose numeric class is in this list
          - `deepcopy` - whether to deepcopy the data (default = True)
        """

        forgetClassLabels = False
        if "patterns" in args :
            patterns = args['patterns']
            # if the patterns are ids (strings) convert them to indices:
            if isinstance(patterns[0], str) :
                idDict = misc.list2dict(patterns)
                patternsToCopy = [i for i in range(len(other))
                                  if other.labels.patternID[i] in idDict]
            else :
                patternsToCopy = patterns
        elif "classes" in args :
            patternsToCopy = [i for i in range(len(other))
                              if other.labels.L[i] in args["classes"]]
            forgetClassLabels = True
        elif "classID" in args :
            patternsToCopy = [i for i in range(len(other))
                              if other.labels.Y[i] in args["classID"]]
            forgetClassLabels = True
        else :
            patternsToCopy = list(range(len(other)))

        self.setTrainingFunc(other.trainingFunc)
        self.setTestingFunc(other.testingFunc)

        deepcopy = args.get('deepcopy', True)
        # class dependent copying of data:
        self.copy(other, patternsToCopy, deepcopy)

        self.attachKernel(other)
        self.attachLabels(Labels(other.labels,
                                 patterns = patternsToCopy,
                                 forgetClassLabels = forgetClassLabels))

        # copy the registered attributes; list attributes are subset to the
        # copied patterns, dataset attributes are copy-constructed:
        if hasattr(other, '_registeredAttributes') :
            self._registeredAttributes = other._registeredAttributes[:]
            self._actions = copy.deepcopy(other._actions)
            for attr in self._registeredAttributes :
                a = getattr(other, attr)
                if isinstance(a, list) :
                    if len(a) != len(other) :
                        raise ValueError('attribute has bad length')
                    setattr(self, attr, [a[i] for i in patternsToCopy])
                elif hasattr(a, 'type') and a.type == 'dataset' and len(a) == len(self) :
                    setattr(self, attr, a.__class__(a, patterns = patternsToCopy))
                else :
                    setattr(self, attr, a)

    def copy(self, other, patterns, deepcopy) :
        """
        Each class that wants to use the generic copy constructor needs
        to define this function for doing class-specific copying"""

        raise NotImplementedError

    def getKernelMatrix(self) :
        """
        returns the kernel matrix as a numpy array
        """

        kvec = self.getKernelMatrixAsVector()
        return numpy.reshape(kvec, (len(self), len(self)))

    def attachKernel(self, kernel = 'linear', **args) :
        """attach a kernel, given either by name ('linear', 'polynomial'/'poly',
        'rbf'/'gaussian'), by another dataset (whose kernel is copied), or by
        a kernel object"""

        if isinstance(kernel, str) :
            kernel = kernel.lower()
            if kernel in ('linear', 'lin') :
                self.kernel = ker.Linear()
            elif kernel in ('polynomial', 'poly') :
                self.kernel = ker.Polynomial(**args)
            elif kernel in ('rbf', 'gaussian') :
                self.kernel = ker.Gaussian(**args)
            else :
                raise ValueError('unrecognized type of kernel')
        elif hasattr(kernel, 'type') and kernel.type == 'dataset' :
            data = kernel
            self.kernel = data.kernel.__class__(data.kernel)
        elif hasattr(kernel, 'type') and kernel.type == 'kernel' :
            self.kernel = kernel.__class__(kernel)

    def attachLabels(self, labels) :
        """attach a Labels object; a string argument is treated as a file
        name from which a Labels object is constructed"""

        if labels.__class__.__name__ == 'Labels' :
            pass
        elif isinstance(labels, str) :
            labels = Labels(labels)
        else :
            raise ValueError('wrong type of labels object')
        if len(self) != len(labels) :
            raise ValueError('length of labels not equal length of self')
        self.labels = labels
class BaseVectorDataSet (BaseDataSet) :
    """A base class for vector dataset container classes

    Construction::

      DataSet(fileName) - read data from a file
      DataSet(fileName, classes = listOfClasses) - read only the
          classes that are named in listOfClasses
      DataSet(otherDataSet) - copy construction
      DataSet(otherDataSet, patterns = listOfPatterns) - copy construction
          using a list of patterns to copy
      DataSet(otherDataSet, classes = classesToCopy) - copy construction
          using a list of classes to copy

    Keywords::

      deepcopy - whether to deepcopy a dataset (default = True)
      The only container that implements a shallow copy is the SparseDataSet.

    Usage/attributes::

      len(dataset) - the number of patterns
      numFeatures - the number of features in the data (when applicable)
    """

    isVector = True    # the dataset is Euclidean
    verbose = 1

    def __init__(self, arg = None, **args) :
        """construct from another dataset (copy construction), from a file
        name, or from a list / numpy array of patterns"""

        BaseDataSet.__init__(self)
        self.featureID = None
        # copy construction:
        if arg.__class__ == self.__class__ :
            self.copyConstruct(arg, **args)
            return
        # construct from a file:
        elif isinstance(arg, str) :
            self.constructFromFile(arg, **args)
        # construct from a list or numpy array:
        elif isinstance(arg, (list, numpy.ndarray)) :
            self.fromArray(arg, **args)
        else :
            raise ValueError('wrong type of arg')

        if 'kernel' in args :
            # use a local name that does not shadow the ker module:
            kernelArg = args['kernel']
            del args['kernel']
            self.attachKernel(kernelArg, **args)
        else :
            self.attachKernel('linear')

    def constructFromFile(self, fileName, **args) :
        """read the dataset from a file, using the parser appropriate for
        the file format; labels may alternatively come from a `labelsFile`
        keyword argument"""

        parser = parsers.parserDispatcher(fileName, **args)
        # the DataSet container can only be used with a csv type file:
        if parser.__class__.__name__ == 'SparseParser' and \
           self.__class__.__name__ == 'DataSet' :
            raise ValueError('cannot use a DataSet container with a sparse file')
        parser.scan()

        self.initializeDataMatrix(len(parser), len(parser._featureID))

        # read the patterns:
        i = 0
        for x in parser :
            self.addPattern(x, i)
            i += 1
            if i % 100 == 0 :
                print('read %d patterns' % i)

        # postprocessing:
        L = parser._labels
        patternID = parser._patternID
        if patternID is None or len(patternID) == 0 :
            patternID = [str(i) for i in range(1, len(self) + 1)]
        self.featureID, featureKey, featureKeyDict = parser.postProcess()
        if self.__class__.__name__ == 'PySparseDataSet' :
            self.featureKey = featureKey
            self.featureKeyDict = featureKeyDict

        self.updateFeatureDict()

        self.featureIDcompute()
        print('read %d patterns' % len(self))

        if 'labelsFile' in args :
            self.attachLabels(Labels(args['labelsFile'], **args))
        else :
            self.attachLabels(Labels(L, patternID = patternID, **args))

    def fromArray(self, X, **args) :
        """construct from a list of patterns or a numpy array.

        :Keywords:
          - `labels` - a Labels object whose L / patternID are copied
          - `L`, `patternID`, `featureID` - explicit label / id lists
          - `labelsFile` - read the labels from this file instead
        """

        L = None
        patternID = None
        self.featureID = None
        if 'labels' in args :
            L = args['labels'].L[:]
            patternID = args['labels'].patternID[:]
        if 'L' in args :
            L = args['L']
        if 'patternID' in args :
            patternID = args['patternID'][:]
        if 'featureID' in args :
            if self.__class__.__name__ == 'SparseDataSet' :
                # BUG FIX: was "raise vluaeError" (a NameError at runtime)
                raise ValueError('cannot set feature ID for SparseDataSet')
            self.featureID = args['featureID'][:]

        if L is not None : assert len(X) == len(L)
        if self.featureID is None :
            self.featureID = [str(i) for i in range(len(X[0]))]
        if patternID is None :
            patternID = [str(i) for i in range(1, len(X) + 1)]

        self.fromArrayAdd(X)
        self.updateFeatureDict()
        self.featureIDcompute()

        if 'labelsFile' in args :
            self.attachLabels(Labels(args['labelsFile'], **args))
        else :
            self.attachLabels(Labels(L, patternID = patternID))

    def fromArrayAdd(self, X) :
        """fill the data matrix from X; X is either a list of dicts
        (feature name -> value) or a sequence of feature vectors"""

        if isinstance(X[0], dict) :
            # map feature hashes back to feature names, checking for clashes:
            featureHashDict = {}
            for i in range(len(X)) :
                for key in X[i] :
                    if hash(key) in featureHashDict :
                        if featureHashDict[hash(key)] != key :
                            # BUG FIX: was "raise valueError" (a NameError)
                            raise ValueError('hash clash')
                    else :
                        featureHashDict[hash(key)] = key
            featureHashes = sorted(featureHashDict.keys())
            self.featureID = [featureHashDict[key] for key in featureHashes]
            self.initializeDataMatrix(len(X), len(self.featureID))
            for i in range(len(X)) :
                x = {}
                for key in X[i] :
                    x[hash(key)] = X[i][key]
                self.addPattern(x, i)
        else :
            self.initializeDataMatrix(len(X), len(X[0]))
            for i in range(len(X)) :
                self.addPattern(X[i], i)

    def __repr__(self) :

        rep = '<' + self.__class__.__name__ + ' instance>\n'
        rep += 'number of patterns: ' + str(len(self)) + '\n'
        if self.X is not None :
            rep += 'number of features: ' + str(self.numFeatures) + '\n'
        rep += self.labels.__repr__()

        return rep

    def save(self, fileName, **args) :
        """save a dataset to a file (does not use pickle!)

        :Parameters:
          - `fileName` - a file name or a file handle

        :Keywords:
          - `format` - 'csv' or 'sparse'; by default format is chosen by the
            type of the dataset -- sparse containers save in sparse format
            and non-sparse containers in csv format.
          - `delimiter` - which delimiter to use when saving in csv format
          - `patterns` - save only those patterns whose indices are given
          - `ids` - save only those patterns whose pattern ID are given
          - `sortByID` - whether to sort the lines according to the pattern ID
            (default = False)
          - `sortByLabel` - whether to sort the lines according to the class label
            (default = False)
        """

        print('saving to %s' % fileName)
        # only close the handle at the end if we opened it ourselves:
        ownHandle = isinstance(fileName, str)
        if ownHandle :
            fileHandle = open(fileName, 'w')
        else :
            fileHandle = fileName

        L = self.labels.L

        if self.__class__.__name__.lower().find('sparse') >= 0 :
            format = 'sparse'
        else :
            format = 'csv'
        # (removed a stray debug "print format" that was left here)
        if 'format' in args :
            format = args['format']
        delim = args.get('delimiter', ',')
        if 'patterns' in args :
            patterns = args['patterns']
        else :
            patterns = range(len(self))
        if 'ids' in args :
            idDict = misc.list2dict(args['ids'])
            patterns = [i for i in range(len(self))
                        if self.labels.patternID[i] in idDict]
        if 'sortByID' in args and args['sortByID'] :
            ids = sorted(self.labels.patternID)
            idMap = misc.list2dict(self.labels.patternID, range(len(self)))
            idDict = misc.list2dict(patterns)
            patterns = [idMap[id] for id in ids
                        if idMap[id] in idDict]
        if 'sortByLabel' in args and args['sortByLabel'] :
            patterns = numpy.argsort(self.labels.Y)

        if format == 'csv' :
            if L is None :
                labels = ''
            else :
                labels = 'labels' + delim
            fileHandle.write('#' + 'patternID' + delim + labels +
                             delim.join(self.featureID) + '\n')
        for i in patterns :
            x = self.getPattern(i)
            if format == 'sparse' :
                if self.labels.patternID is not None :
                    fileHandle.write(str(self.labels.patternID[i]) + ',')
                if L is not None :
                    if isinstance(L[i], list) :
                        fileHandle.write(';'.join(L[i]) + ' ')
                    else :
                        fileHandle.write(str(L[i]) + ' ')
                if isinstance(x, dict) :
                    tokens = [self.featureID[self.featureKeyDict[key]] + ':' +
                              str(x[key]) for key in x]
                else :
                    # use j so the pattern index i is not clobbered:
                    tokens = [self.featureID[j] + ':' + str(x[j])
                              for j in range(self.numFeatures)
                              if x[j] != 0]
                fileHandle.write(' '.join(tokens) + '\n')
            else :
                if self.labels.patternID is not None :
                    fileHandle.write(str(self.labels.patternID[i]) + delim)
                if L is not None :
                    if isinstance(L[i], list) :
                        fileHandle.write(';'.join(L[i]) + delim)
                    else :
                        # str() for consistency with the sparse branch:
                        fileHandle.write(str(L[i]) + delim)
                if isinstance(x, dict) :
                    tokens = [str(x.get(self.featureKey[j], 0))
                              for j in range(self.numFeatures)]
                else :
                    tokens = [str(val) for val in x]
                fileHandle.write(delim.join(tokens) + '\n')
        if ownHandle :
            fileHandle.close()

    def getMatrix(self) :
        """return the data as a dense numpy array of shape
        (len(self), numFeatures)"""

        X = numpy.zeros((len(self), self.numFeatures), float)
        for i in range(len(self)) :
            X[i] = self.getPattern(i)
        return X

    def extend(self, other, patterns = None) :
        """append the (selected) patterns of another dataset of the same
        class, merging the feature ID bookkeeping"""

        if self.__class__ != other.__class__ :
            raise ValueError('datasets should be the same class')

        if patterns is None : patterns = range(len(other))

        # first check if ids have compatible hash values:
        for id in other.featureID :
            if (hash(id) in self.featureKeyDict and
                id != self.featureID[self.featureKeyDict[hash(id)]]) :
                raise ValueError('bad hash')

        # recompute featureID related stuff:
        self.featureKey = misc.union(self.featureKey, other.featureKey)
        self.featureKey.sort()
        self.featureKeyDict.clear()
        for i in range(len(self.featureKey)) :
            self.featureKeyDict[self.featureKey[i]] = i
        featureIDs = misc.union(self.featureID, other.featureID)
        self.featureID = [None for i in range(len(self.featureKey))]
        for id in featureIDs :
            self.featureID[self.featureKeyDict[hash(id)]] = id

        self.extendX(other, patterns)
        self.labels.extend(other.labels, patterns)

    def keepFeatures(self, features) :
        """eliminate all but the given list of features
        INPUT:
        features - a list of features to keep; these are either numbers
        between 0 and numFeatures-1 (indices of features, not their IDs) or
        featureIDs
        """

        if isinstance(features[0], str) :
            features = self.featureNames2IDs(features)
        self.eliminateFeatures(misc.setminus(range(self.numFeatures), features))

    def featureNames2IDs(self, featureList) :
        """convert a list of feature Names into their numeric IDs"""

        return [self.featureDict[feature] for feature in featureList]

    def addFeatureKeyDict(self) :
        """rebuild the mapping from hash(featureID) to feature index"""

        self.featureKeyDict = {}
        for i in range(len(self.featureID)) :
            self.featureKeyDict[hash(self.featureID[i])] = i
class WrapperDataSet (BaseDataSet) :
    """Base class for containers that wrap an external dataset object.

    The wrapped object supplies size() and setY(); the data matrix X and
    the kernel are exposed as read-only properties.
    """

    isWrapper = True

    def __len__(self) :
        """the number of patterns in the dataset"""

        return self.size()

    def getX(self) :
        """the wrapped container does not expose a data matrix"""

        return None

    def setX(self, value) :
        """X is read-only"""

        raise ValueError('X cannot be set')

    X = property (getX, setX, None, 'X')

    def get_kernel(self) :
        """return the attached kernel object"""

        return self._kernel

    def set_kernel(self, value) :
        """the kernel is read-only"""

        raise ValueError('kernel cannot be set')

    kernel = property (get_kernel, set_kernel, None, 'kernel')

    def attachLabels(self, labels) :
        """attach labels and push the numeric class of each pattern into
        the wrapped object"""

        BaseDataSet.attachLabels(self, labels)
        if hasattr(self.labels, 'Y') and self.labels.Y is not None :
            for idx in range(len(labels)) :
                self.setY(idx, labels.Y[idx])

    attachKernel = ker.attachKernel