import copy
import numpy

from PyML.containers import ker
from PyML.containers import parsers
from PyML.utils import misc
from PyML.containers.labels import Labels

10 """
11 A base class for PyML dataset containers
12
13 """
14 type = 'dataset'
15 isVector = False
16
18
19 self.isTrained = False
20 self.isTested = False
21
    def setTrainingFunc(self, func) :

        assert func is None or type(func).__name__ == 'function'
        self._trainingFunc = func

    def getTrainingFunc(self) :

        if hasattr(self, '_trainingFunc') :
            return self._trainingFunc
        else :
            return None

    trainingFunc = property(getTrainingFunc, setTrainingFunc,
                            None, '_trainingFunc')

    def setTestingFunc(self, func) :

        assert func is None or type(func).__name__ == 'function'
        self._testingFunc = func

    def getTestingFunc(self) :

        if hasattr(self, '_testingFunc') :
            return self._testingFunc
        else :
            return None

    testingFunc = property(getTestingFunc, setTestingFunc,
                           None, '_testingFunc')

    def train(self, **args) :

        # apply the registered training function once (no-op if none is set)
        if self.trainingFunc is not None and not self.isTrained :
            self.trainingFunc(self, **args)
            self.isTrained = True

    def test(self, trainingData, **args) :

        # apply the registered testing function once (no-op if none is set),
        # passing the training data so it can reuse parameters computed there
        if self.testingFunc is not None and not self.isTested :
            self.testingFunc(self, trainingData, **args)
            self.isTested = True

    def registerAttribute(self, attributeName, attributeValue = None, action = None) :

        # record the attribute name so that copy construction knows to carry it over
        if not hasattr(self, '_registeredAttributes') :
            self._registeredAttributes = [attributeName]
        else :
            self._registeredAttributes.append(attributeName)
        if attributeValue is not None :
            setattr(self, attributeName, attributeValue)
        if not hasattr(self, '_actions') : self._actions = {}
        self._actions[attributeName] = action

    def copyConstruct(self, other, **args) :

        # decide which patterns to copy from the other dataset: an explicit list
        # of indices or pattern IDs ('patterns'), a list of class names ('classes'),
        # a list of numeric class IDs ('classID'), or everything by default
        forgetClassLabels = False
        if "patterns" in args :
            patterns = args['patterns']
            # patterns can be given either as pattern IDs or as indices
            if type(patterns[0]) == type('') :
                idDict = misc.list2dict(patterns)
                patternsToCopy = [i for i in range(len(other))
                                  if other.labels.patternID[i] in idDict]
            else :
                patternsToCopy = patterns
        elif "classes" in args :
            patternsToCopy = [i for i in range(len(other))
                              if other.labels.L[i] in args["classes"]]
            forgetClassLabels = True
        elif "classID" in args :
            patternsToCopy = [i for i in range(len(other))
                              if other.labels.Y[i] in args["classID"]]
            forgetClassLabels = True
        else :
            patternsToCopy = range(len(other))

        self.setTrainingFunc(other.trainingFunc)
        self.setTestingFunc(other.testingFunc)

        deepcopy = True
        if 'deepcopy' in args : deepcopy = args['deepcopy']

        self.copy(other, patternsToCopy, deepcopy)

        self.attachKernel(other)
        self.attachLabels(Labels(other.labels,
                                 patterns = patternsToCopy,
                                 forgetClassLabels = forgetClassLabels))

        # carry over attributes added via registerAttribute; list-valued attributes
        # that run parallel to the patterns are subset accordingly
        if hasattr(other, '_registeredAttributes') :
            self._registeredAttributes = other._registeredAttributes[:]
            self._actions = copy.deepcopy(other._actions)
            for attr in self._registeredAttributes :
                a = getattr(other, attr)
                if type(a) == type([]) :
                    if len(a) != len(other) :
                        raise ValueError, 'attribute has bad length'
                    setattr(self, attr, [a[i] for i in patternsToCopy])
                elif hasattr(a, 'type') and a.type == 'dataset' and len(a) == len(self) :
                    acopy = a.__class__(a, patterns = patternsToCopy)
                    setattr(self, attr, acopy)
                else :
                    setattr(self, attr, a)

    def copy(self, other, patterns, deepcopy) :
        """
        Each class that wants to use the generic copy constructor needs
        to override this method to perform the class-specific copying
        """

        raise NotImplementedError
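
    # A subclass override of copy might look roughly like the sketch below; it is
    # illustrative only, and the attribute name 'X' is an assumption rather than
    # part of the base class contract:
    #
    #   def copy(self, other, patterns, deepcopy) :
    #       # keep only the rows listed in 'patterns'
    #       if deepcopy :
    #           self.X = [other.X[i][:] for i in patterns]
    #       else :
    #           self.X = [other.X[i] for i in patterns]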

    def getKernelMatrix(self) :
        """
        returns the kernel matrix as a numpy array
        """

        kvec = self.getKernelMatrixAsVector()
        return numpy.reshape(kvec, (len(self), len(self)))
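
    # Usage sketch ('data' is a hypothetical dataset with an attached kernel):
    #
    #   K = data.getKernelMatrix()
    #   # K is a numpy array of shape (len(data), len(data))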

    def attachLabels(self, labels) :

        # accept either a Labels object or the name of a labels file
        if labels.__class__.__name__ == 'Labels' :
            pass
        elif type(labels) == type('') :
            labels = Labels(labels)
        else :
            raise ValueError, 'wrong type of labels object'
        if len(self) != len(labels) :
            raise ValueError, 'length of labels does not equal length of dataset'
        self.labels = labels


177 """A base class for vector dataset container classes
178
179 Construction::
180
181 DataSet(fileName) - read data from a file
182 DataSet(fileName, classes = listOfClasses) - read only the
183 classes that are named in listOfClasses
184 DataSet(otherDataSet) - copy construction
185 DataSet(otherDataSet, patterns = listOfPatterns) - copy construction
186 using a list of patterns to copy
187 DataSet(otherDataSet, classes = classesToCopy) - copy construction
188 using a list of classes to copy
189
190 Keywords::
191
192 deepcopy - whether to deepcopy a dataset (default = True)
193 The only container that implements a shallow copy is the SparseDataSet.
194
195 Usage/attributes::
196
197 len(dataset) - the number of patterns
198 numFeatures - the number of features in the data (when applicable)
199 """

    isVector = True
    verbose = 1


    def constructFromFile(self, fileName, **args) :

        parser = parsers.parserDispatcher(fileName, **args)
        # a non-sparse DataSet container cannot hold data from a sparse file
        if parser.__class__.__name__ == 'SparseParser' and \
           self.__class__.__name__ == 'DataSet' :
            raise ValueError, \
                  'cannot use a DataSet container with a sparse file'
        parser.scan()

        self.initializeDataMatrix(len(parser), len(parser._featureID))

        # read the patterns one by one, reporting progress every 100 patterns
        i = 0
        for x in parser :
            self.addPattern(x, i)
            i += 1
            if i % 100 == 0 :
                print 'read', i, 'patterns'

        L = parser._labels
        patternID = parser._patternID
        if patternID is None or len(patternID) == 0 :
            patternID = [str(i) for i in range(1, len(self) + 1)]
        self.featureID, featureKey, featureKeyDict = parser.postProcess()
        if self.__class__.__name__ == 'PySparseDataSet' :
            self.featureKey = featureKey
            self.featureKeyDict = featureKeyDict

        self.updateFeatureDict()
        self.featureIDcompute()
        print 'read', len(self), 'patterns'

        # labels can come from a separate file or from the data file itself
        if 'labelsFile' in args :
            self.attachLabels(Labels(args['labelsFile'], **args))
        else :
            self.attachLabels(Labels(L, patternID = patternID, **args))

    def constructFromArray(self, X, **args) :

        # labels and IDs may be supplied via a Labels object ('labels'),
        # or directly as lists ('L', 'patternID', 'featureID')
        L = None
        patternID = None
        self.featureID = None
        if 'labels' in args :
            L = args['labels'].L[:]
            patternID = args['labels'].patternID[:]
        if 'L' in args :
            L = args['L']
        if 'patternID' in args :
            patternID = args['patternID'][:]
        if 'featureID' in args :
            if self.__class__.__name__ == 'SparseDataSet' :
                raise ValueError, 'cannot set feature ID for SparseDataSet'
            self.featureID = args['featureID'][:]

        if L is not None : assert len(X) == len(L)
        # default feature IDs and pattern IDs are just running numbers
        if self.featureID is None :
            self.featureID = [str(i) for i in range(len(X[0]))]
        if patternID is None :
            patternID = [str(i) for i in range(1, len(X) + 1)]

        self.fromArrayAdd(X)
        self.updateFeatureDict()
        self.featureIDcompute()

        if 'labelsFile' in args :
            self.attachLabels(Labels(args['labelsFile'], **args))
        else :
            self.attachLabels(Labels(L, patternID = patternID))
304
305 if type(X[0]) == dict :
306 featureHashDict = {}
307 for i in range(len(X)) :
308 for key in X[i] :
309 if hash(key) in featureHashDict :
310 if featureHashDict[hash(key)] != key :
311 raise valueError, 'hash clash'
312 else :
313 featureHashDict[hash(key)] = key
314 featureHashes = featureHashDict.keys()
315 featureHashes.sort()
316 self.featureID = [featureHashDict[key] for key in featureHashes]
317 self.initializeDataMatrix(len(X), len(self.featureID))
318 for i in range(len(X)) :
319 x = {}
320 for key in X[i] :
321 x[hash(key)] = X[i][key]
322 self.addPattern(x, i)
323 else :
324 self.initializeDataMatrix(len(X), len(X[0]))
325 for i in range(len(X)) :
326 self.addPattern(X[i], i)
327

    def __repr__(self) :

        rep = '<' + self.__class__.__name__ + ' instance>\n'
        rep += 'number of patterns: ' + str(len(self)) + '\n'
        if self.X is not None :
            rep += 'number of features: ' + str(self.numFeatures) + '\n'
        rep += self.labels.__repr__()

        return rep

    def save(self, fileName, **args) :
        """save a dataset to a file (does not use pickle!)

        :Parameters:
          - `fileName` - a file name or a file handle

        :Keywords:
          - `format` - 'csv' or 'sparse'; by default the format is chosen by the
            type of the dataset -- sparse containers save in sparse format
            and non-sparse containers in csv format
          - `delimiter` - which delimiter to use when saving in csv format
          - `patterns` - save only those patterns whose indices are given
          - `ids` - save only those patterns whose pattern IDs are given
          - `sortByID` - whether to sort the lines according to the pattern ID
            (default = False)
          - `sortByLabel` - whether to sort the lines according to the class label
            (default = False)
        """

        print 'saving to', fileName
        if type(fileName) == type('') :
            fileHandle = open(fileName, 'w')
        else :
            fileHandle = fileName

        L = self.labels.L

        # the default format follows the type of the container:
        # sparse containers save in sparse format, others in csv
        if self.__class__.__name__.lower().find('sparse') >= 0 :
            format = 'sparse'
        else :
            format = 'csv'
        print format
        if 'format' in args :
            format = args['format']
        if 'delimiter' in args :
            delim = args['delimiter']
        else :
            delim = ','
        if 'patterns' in args :
            patterns = args['patterns']
        else :
            patterns = range(len(self))
        if 'ids' in args :
            idDict = misc.list2dict(args['ids'])
            patterns = [i for i in range(len(self))
                        if self.labels.patternID[i] in idDict]
        if 'sortByID' in args and args['sortByID'] :
            ids = self.labels.patternID[:]
            ids.sort()
            idMap = misc.list2dict(self.labels.patternID, range(len(self)))
            idDict = misc.list2dict(patterns)
            patterns = [idMap[id] for id in ids
                        if idMap[id] in idDict]
        if 'sortByLabel' in args and args['sortByLabel'] :
            patterns = numpy.argsort(self.labels.Y)

        if format == 'csv' :
            # header line: pattern ID, optional label column, then the feature IDs
            if L is None :
                labels = ''
            else :
                labels = 'labels' + delim
            fileHandle.write('#' + 'patternID' + delim + labels +
                             delim.join(self.featureID) + '\n')
        for i in patterns :
            x = self.getPattern(i)
            if format == 'sparse' :
                # sparse line: patternID,label featureID:value featureID:value ...
                if self.labels.patternID is not None :
                    fileHandle.write(str(self.labels.patternID[i]) + ',')
                if L is not None :
                    if type(L[i]) == type([]) :
                        fileHandle.write(';'.join(L[i]) + ' ')
                    else :
                        fileHandle.write(str(L[i]) + ' ')
                if type(x) == type({}) :
                    tokens = [self.featureID[self.featureKeyDict[key]] + ':' +
                              str(x[key]) for key in x]
                else :
                    tokens = [self.featureID[i] + ':' + str(x[i])
                              for i in range(self.numFeatures)
                              if x[i] != 0]
                fileHandle.write(' '.join(tokens) + '\n')
            else :
                # csv line: patternID, label, then all feature values
                if self.labels.patternID is not None :
                    fileHandle.write(str(self.labels.patternID[i]) + delim)
                if L is not None :
                    if type(L[i]) == type([]) :
                        fileHandle.write(';'.join(L[i]) + delim)
                    else :
                        fileHandle.write(L[i] + delim)
                if type(x) == type({}) :
                    tokens = [str(x.get(self.featureKey[i], 0))
                              for i in range(self.numFeatures)]
                else :
                    tokens = [str(val) for val in x]
                fileHandle.write(delim.join(tokens) + '\n')
        fileHandle.close()

    def getMatrix(self) :
        # the method name is assumed (the original def line is not shown);
        # returns the data as a dense 2d numpy array
        X = numpy.zeros((len(self), self.numFeatures), float)
        for i in range(len(self)) :
            X[i] = self.getPattern(i)
        return X

    def extend(self, other, patterns = None) :

        if self.__class__ != other.__class__ :
            raise ValueError, 'datasets should be the same class'

        if patterns is None : patterns = range(len(other))

        # check for hash clashes between the feature IDs of the two datasets
        for id in other.featureID :
            if (hash(id) in self.featureKeyDict and
                id != self.featureID[self.featureKeyDict[hash(id)]]) :
                raise ValueError, 'bad hash'

        # merge the feature keys and rebuild the key -> index mapping
        self.featureKey = misc.union(self.featureKey, other.featureKey)
        self.featureKey.sort()
        self.featureKeyDict.clear()
        for i in range(len(self.featureKey)) :
            self.featureKeyDict[self.featureKey[i]] = i
        featureIDs = misc.union(self.featureID, other.featureID)
        self.featureID = [None for i in range(len(self.featureKey))]
        for id in featureIDs :
            self.featureID[self.featureKeyDict[hash(id)]] = id

        self.extendX(other, patterns)
        self.labels.extend(other.labels, patterns)

    def keepFeatures(self, features) :
        """eliminate all but the given list of features
        INPUT:
        features - a list of features to keep; these are either numbers
        between 0 and numFeatures - 1 (indices of features, not their IDs) or
        featureIDs
        """

        if type(features[0]) == type('') :
            features = self.featureNames2IDs(features)
        self.eliminateFeatures(misc.setminus(range(self.numFeatures), features))

    def featureNames2IDs(self, featureList) :
        """convert a list of feature names into their numeric IDs"""

        return [self.featureDict[feature] for feature in featureList]

    def updateFeatureKeyDict(self) :
        # the method name is assumed (the original def line is not shown);
        # rebuilds the hash(featureID) -> column index mapping
        self.featureKeyDict = {}
        for i in range(len(self.featureID)) :
            self.featureKeyDict[hash(self.featureID[i])] = i


class WrapperDataSet (BaseDataSet) :

    isWrapper = True

    def __len__(self) :
        """the number of patterns in the dataset"""

        return self.size()

    # (the getX accessor referenced by the X property below is not shown in this listing)

    def setX(self, value) :

        raise ValueError, 'X cannot be set'

    X = property (getX, setX, None, 'X')

    def get_kernel(self) :

        return self._kernel

    def set_kernel(self, value) :

        raise ValueError, 'kernel cannot be set'

    kernel = property (get_kernel, set_kernel, None, 'kernel')

    attachKernel = ker.attachKernel