Package PyML :: Package containers :: Module datafunc
[frames] | [no frames]

Source Code for Module PyML.containers.datafunc

  1   
  2  import arrayWrap 
  3  import ker 
  4  from ext import ckernel 
  5  from ext import caggregate 
  6  from ext import csparsedataset 
  7  from ext import cvectordataset 
  8  from ext import ckerneldata 
  9  from ext import csequencedata 
 10  from ext import cstringkernel 
 11   
 12  import string 
 13  import numpy 
 14  import math 
 15  import copy 
 16  import random 
 17   
 18  import misc,myio 
 19  import parsers 
 20   
 21  """classes for reading and handling a dataset""" 
 22   
 23  __docformat__ = "restructuredtext en" 
 24   
 25   
class PySparseDataSet (BaseVectorDataSet):
    """A sparse dataset container.

    Each pattern is stored as a dictionary mapping a feature key (the hash
    of the feature ID) to its nonzero value; zero entries are not stored.
    Bookkeeping attributes: featureKey (sorted list of keys), featureID
    (feature IDs in the same order), featureKeyDict (key -> position)."""

    def __len__(self) :
        """Return the number of patterns in the dataset."""

        return len(self.X)

    def getNumFeatures(self) :
        """Return the number of features in the dataset."""

        return len(self.featureID)

    def setNumFeatures(self, value) :
        """The number of features is derived from featureID; it cannot be set."""

        raise ValueError('do not call this function!')

    numFeatures = property (getNumFeatures, setNumFeatures,
                            None, 'The number of features in a dataset')

    def copy(self, other, patternsToCopy, deepcopy) :
        """copy the X variable of a sparse dataset
        INPUT:
        other - the other dataset
        patternsToCopy - a list of patterns to copy
        deepcopy - a 0/1 flag telling whether to do deepcopy or not"""

        X = None

        if patternsToCopy is None :
            patternsToCopy = range(len(other))

        featureKeyDict = {}
        if other.X is not None :
            X = []
            for i in patternsToCopy:
                if deepcopy :
                    X.append(copy.deepcopy(other.X[i]))
                else :
                    X.append(other.X[i])
                # when copying a subset, collect the feature keys that
                # actually occur so the feature bookkeeping can be rebuilt:
                if len(patternsToCopy) < len(other) :
                    for featureKey in other.X[i] :
                        featureKeyDict[featureKey] = 1

        if len(patternsToCopy) == len(other) :
            # all patterns copied -- the feature bookkeeping is unchanged
            self.featureKeyDict = copy.deepcopy(other.featureKeyDict)
            self.featureKey = other.featureKey[:]
            self.featureID = other.featureID[:]
        else :
            self.featureKey = sorted(featureKeyDict)
            self.featureKeyDict = {}
            for pos in range(len(self.featureKey)) :
                self.featureKeyDict[self.featureKey[pos]] = pos
            self.featureID = [other.featureID[i] for i in range(other.numFeatures)
                              if other.featureKey[i] in self.featureKeyDict]

        self.X = X

    def initializeDataMatrix(self, numPatterns, numFeatures) :
        """Start an empty list of patterns; the size arguments are ignored
        since patterns are appended one at a time."""

        self.X = []

    def addPattern(self, x, i) :
        """Append pattern x (a dictionary of nonzero values, or a dense
        sequence that is converted to one); the index i is ignored since
        patterns are always appended at the end."""

        if type(x) == type({}) :
            self.X.append(x)
        else :
            # fix: use a loop variable that does not shadow the parameter i
            xDict = {}
            for j in range(len(x)) :
                xDict[j] = x[j]
            self.X.append(xDict)

    def addFeature(self, id, values) :
        """Add a feature with the given ID and one value per pattern
        (zero values are not stored)."""

        hashID = hash(id)
        if hashID in self.featureKeyDict :
            raise ValueError('feature already exists, or hash problem')
        for i in range(len(self)) :
            if values[i] != 0 :
                self.X[i][hashID] = values[i]

        # update the featureKey, featureID attributes:
        pos = numpy.searchsorted(self.featureKey, hashID)
        self.featureKey.insert(pos, hashID)
        self.featureID.insert(pos, id)
        self.featureKeyDict = misc.list2dict(self.featureKey,
                                             range(len(self.featureKey)))

    def getPattern(self, i) :
        """Return pattern i (a dictionary of nonzero feature values)."""

        return self.X[i]

    def featureIDcompute(self) :
        """Nothing to do: feature bookkeeping is maintained incrementally."""

        pass

    def extendX(self, other, patterns) :
        """Append the given patterns of 'other' to the data."""

        for p in patterns :
            self.X.append(other.X[p])

    def eliminateFeatures(self, featureList):
        """eliminate a list of features from a dataset
        INPUT:
        featureList - a list of features to eliminate; these are numbers
        between 0 and numFeatures-1 (indices of features, not their IDs)"""

        if len(featureList) == 0 : return

        if self.verbose :
            print('eliminating features...')

        if type(featureList[0]) == type('') :
            # fix: the original referenced an undefined name 'features' here
            featureList = self.featureNames2IDs(featureList)

        elimDict = {}
        for feature in featureList :
            elimDict[self.featureKey[feature]] = 1

        featureKeyDict = {}
        for i in range(len(self)) :
            if self.verbose and i % 1000 == 0 and i > 0 :
                print(i)
            # take a snapshot of the keys since entries are deleted below:
            featureKeys = list(self.X[i].keys())
            for featureKey in featureKeys :
                if featureKey in elimDict :
                    del self.X[i][featureKey]
                else :
                    featureKeyDict[featureKey] = 1

        # rebuild the feature bookkeeping from the surviving keys:
        oldFeatureKey = self.featureKey
        self.featureKey = sorted(featureKeyDict)
        self.featureKeyDict = {}
        for pos in range(len(self.featureKey)) :
            self.featureKeyDict[self.featureKey[pos]] = pos
        self.featureID = [self.featureID[i] for i in range(len(self.featureID))
                          if oldFeatureKey[i] in self.featureKeyDict]

    def featureView(self) :
        """F is a list where F[i] is a dictionary whose entries are the non
        zero entries of feature number i:
        F[self.featureKeyDict[f]][i] = X[i][f]
        """

        F = [{} for i in range(self.numFeatures)]

        for i in range(len(self)) :
            for f in self.X[i] :
                F[self.featureKeyDict[f]][i] = self.X[i][f]

        self.F = F

    def getFeature(self, feature, patterns = None) :
        """Return an array with the values of the given feature for the
        given patterns (all patterns by default)."""

        if patterns is None :
            patterns = range(len(self))
        values = numpy.zeros(len(patterns), numpy.float_)
        for i in range(len(patterns)) :
            if self.featureKey[feature] in self.X[patterns[i]] :
                values[i] = self.X[patterns[i]][self.featureKey[feature]]

        return values

    def dotProduct(self, x, y, other = None) :
        """Dot product between patterns x and y; integer arguments are
        treated as pattern indices (y taken from 'other' when given),
        dictionary arguments are used directly."""

        if type(x) == type(1) :
            x = self.X[x]
            if other is not None :
                y = other.X[y]
            else :
                y = self.X[y]
        # iterate over the (sparse) keys of x, accumulating matching entries
        total = 0.0
        for xKey in x :
            if xKey in y :
                total += y[xKey] * x[xKey]
        return total

    def norm(self, pattern, p = 1) :
        """Return the p-norm (p = 1 or 2) of the given pattern."""

        total = 0.0
        for xKey in self.X[pattern] :
            if p == 1 :
                total += abs(self.X[pattern][xKey])
            elif p == 2 :
                total += self.X[pattern][xKey] * self.X[pattern][xKey]
            else :
                raise ValueError('wrong value for p')

        if p == 1 :
            return total
        else :
            return math.sqrt(total)

    def normalize(self, p = 1) :
        """normalize dataset according to the p-norm, p=1,2"""

        for i in range(len(self)) :
            norm = self.norm(i, p)
            if norm == 0 : continue
            for xKey in self.X[i] :
                self.X[i][xKey] /= norm

    def scale(self, w) :
        """rescale the columns of the data matrix by a weight vector w:
        set X[i][j] = X[i][j] * w[j]
        w is either a dictionary or an array
        """

        if type(w) != type({}) :
            # convert an array weight vector into a key -> weight dictionary
            wDict = {}
            for i in range(self.numFeatures) :
                wDict[self.featureKey[i]] = w[i]
            w = wDict
        for i in range(len(self)) :
            for featureKey in self.X[i] :
                if featureKey in w :
                    self.X[i][featureKey] *= w[featureKey]
                else :
                    self.X[i][featureKey] = 0.0

    def mean(self, patterns = None) :
        """Return an array with the mean of each feature over the given
        patterns (all patterns by default)."""

        if patterns is None : patterns = range(len(self))

        featureMean = numpy.zeros(self.numFeatures, numpy.float_)

        for i in patterns :
            for featureKey in self.X[i] :
                featureMean[self.featureKeyDict[featureKey]] += self.X[i][featureKey]

        return featureMean / len(patterns)

    def translate(self, translation) :
        """subtract the input array from the data.
        the sparsity of the data is not altered, ie, zero entries are not
        made nonzero by the translation
        """
        for i in range(len(self)) :
            for featureKey in self.X[i] :
                self.X[i][featureKey] -= translation[self.featureKeyDict[featureKey]]

    def std(self, patterns = None) :
        """Return an array with the standard deviation of each feature over
        the given patterns (all patterns by default)."""

        if patterns is None : patterns = range(len(self))

        featureSq = numpy.zeros(self.numFeatures, numpy.float_)

        for i in patterns :
            for featureKey in self.X[i] :
                featureSq[self.featureKeyDict[featureKey]] += self.X[i][featureKey]**2

        # population variance via E[x^2] - E[x]^2; clip guards against
        # small negative values caused by floating point error
        featureVar = featureSq / float(len(patterns)) - self.mean(patterns)**2

        return numpy.sqrt(numpy.clip(featureVar, 0, 1e10))

    def featureCount(self, feature, patterns = None) :
        """Return the number of patterns in which the given feature is nonzero."""

        if patterns is None :
            patterns = range(len(self))

        count = 0
        featureKey = self.featureKey[feature]
        for i in patterns :
            # fix: the original referenced an undefined global 'data'
            if featureKey in self.X[i] and self.X[i][featureKey] != 0 :
                count += 1

        return count

    def featureCounts(self, patterns = None) :
        """Return an array with, for each feature, the number of patterns
        in which it is nonzero."""

        if patterns is None :
            patterns = range(len(self))

        counts = numpy.zeros(self.numFeatures, numpy.float_)
        for i in patterns :
            # fix: the original referenced an undefined global 'data'
            for featureKey in self.X[i] :
                if self.X[i][featureKey] != 0 :
                    counts[self.featureKeyDict[featureKey]] += 1

        return counts
class PyVectorDataSet (BaseVectorDataSet) :
    """A non-sparse dataset container; uses a numpy array.

    The data matrix X has one row per pattern and one column per feature;
    featureID / featureKey / featureKeyDict hold the feature bookkeeping."""

    def __len__(self) :
        """the number of patterns in the dataset"""

        if self.X is not None :
            return len(self.X)
        else :
            raise ValueError("no data here!")

    def getNumFeatures(self) :
        """Return the number of features in the dataset."""

        return len(self.featureID)

    def setNumFeatures(self, value) :
        """The number of features is derived from featureID; it cannot be set."""

        raise ValueError('do not call this function!')

    numFeatures = property (getNumFeatures, setNumFeatures,
                            None, 'The number of features in a dataset')

    def fromArrayAdd(self, X) :
        """Set the data matrix directly from an existing array."""

        self.X = X

    def dotProduct(self, x, y, other = None) :
        """Dot product between patterns x and y; integer arguments are
        treated as pattern indices (y taken from 'other' when given),
        vector arguments are used directly."""

        if type(x) == type(1) :
            x = self.X[x]
            if other is not None :
                y = other.X[y]
            else :
                y = self.X[y]

        return numpy.dot(x, y)

    def initializeDataMatrix(self, numPatterns, numFeatures) :
        """Allocate a zeroed numPatterns x numFeatures data matrix."""

        self.X = numpy.zeros((numPatterns, numFeatures), numpy.float_)

    def addPattern(self, x, i) :
        """Copy the values of pattern x into row i of the data matrix."""

        for j in range(len(x)) :
            self.X[i][j] = x[j]

    def getPattern(self, i) :
        """Return row i of the data matrix."""

        return self.X[i]

    def extendX(self, other, patterns) :
        """Append the given patterns of 'other' to the data matrix."""

        X = self.X
        # fix: the original called len() on the integer numFeatures, which
        # raises a TypeError
        self.X = numpy.zeros((len(X) + len(patterns), self.numFeatures),
                             numpy.float_)
        for i in range(len(X)) :
            self.X[i] = X[i]
        # fix: the original wrote row p + len(X) for each pattern p, which
        # is out of range whenever p >= len(patterns); append sequentially
        for j, p in enumerate(patterns) :
            self.X[len(X) + j] = other.X[p]

    def featureIDcompute(self) :
        """Nothing to do: feature bookkeeping is maintained incrementally."""

        pass

    def copy(self, other, patternsToCopy, deepcopy) :
        """deepcopy is performed by default, so the deepcopy flag is ignored"""

        X = None
        if patternsToCopy is None :
            patternsToCopy = range(len(other))
        else :
            # keep track of the original IDs of the patterns:
            if hasattr(other, 'origID') :
                self.origID = [other.origID[p] for p in patternsToCopy]
            else :
                self.origID = patternsToCopy[:]

        if other.X is not None :
            # fix: take rows (axis 0); numpy.take's default axis=None
            # operates on the flattened matrix
            X = numpy.take(other.X, patternsToCopy, axis = 0)

        self.X = X
        self.featureID = other.featureID[:]
        self.featureKey = other.featureKey[:]
        self.featureKeyDict = copy.deepcopy(other.featureKeyDict)

    def eliminateFeatures(self, featureList) :
        """eliminate a list of features from a dataset
        Input:
        featureList - a list of features to eliminate; these are numbers
        between 0 and numFeatures-1 (indices of features, not their IDs)"""

        if len(featureList) == 0 : return
        if type(featureList[0]) == type('') :
            # fix: the original referenced an undefined name 'features' here
            featureList = self.featureNames2IDs(featureList)
        featuresToTake = misc.setminus(range(self.numFeatures), featureList)
        featuresToTake.sort()
        self.featureID = [self.featureID[i] for i in featuresToTake]
        self.featureKey = [self.featureKey[i] for i in featuresToTake]
        self.featureKeyDict = {}
        for pos in range(len(self.featureKey)) :
            self.featureKeyDict[self.featureKey[pos]] = pos

        self.X = numpy.take(self.X, featuresToTake, 1)

    def getFeature(self, feature, patterns = None) :
        """Return an array with the values of the given feature for the
        given patterns (all patterns by default)."""

        if patterns is None :
            patterns = range(len(self))
        values = numpy.zeros(len(patterns), numpy.float_)
        for i in range(len(patterns)) :
            # fix: index the row by patterns[i], not i, so that a subset
            # of patterns is handled correctly
            values[i] = self.X[patterns[i]][feature]

        return values

    def norm(self, pattern, p = 1) :
        """Return the p-norm (p = 1 or 2) of the given pattern."""

        if p == 1 :
            return numpy.sum(numpy.absolute(self.X[pattern]))
        elif p == 2 :
            # fix: numpy.dot requires two arguments; the original passed one
            return math.sqrt(numpy.dot(self.X[pattern], self.X[pattern]))
        else :
            raise ValueError('wrong value of p')

    def normalize(self, p = 1) :
        """normalize dataset according to the p-norm, p=1,2"""

        for i in range(len(self)) :
            norm = self.norm(i, p)
            if norm == 0 : continue
            self.X[i] = self.X[i] / norm

    def scale(self, w) :
        """rescale the columns of the data matrix by a weight vector w:
        set X[i][j] = X[i][j] * w[j]
        """

        self.X = self.X * w

    def translate(self, c) :
        """Subtract the vector c from every row of the data matrix."""

        self.X = self.X - numpy.resize(c, (len(self), len(c)))

    def mean(self, patterns = None) :
        """Return an array with the mean of each feature over the given
        patterns (all patterns by default)."""

        if patterns is None or len(patterns) == len(self) :
            # fix: average over patterns (axis 0) to get a per-feature
            # vector; the default axis averages the flattened matrix
            return numpy.mean(self.X, axis = 0)

        featureMean = numpy.zeros(self.numFeatures, numpy.float_)

        for i in patterns :
            featureMean += self.X[i]

        return featureMean / len(patterns)

    def std(self, patterns = None) :
        """Return an array with the standard deviation of each feature over
        the given patterns (all patterns by default)."""

        if patterns is None or len(patterns) == len(self) :
            # fix: per-feature std (axis 0) rather than std of the
            # flattened matrix.
            # NOTE(review): the len/(len-1) factor looks like an attempted
            # sample-size correction; kept as in the original
            return numpy.std(self.X, axis = 0) * len(self) / (len(self) - 1)

        featureSq = numpy.zeros(self.numFeatures, numpy.float_)

        for i in patterns :
            featureSq += self.X[i]**2

        # population variance via E[x^2] - E[x]^2; clip guards against
        # small negative values caused by floating point error
        featureVar = featureSq / float(len(patterns)) - self.mean(patterns)**2

        return numpy.sqrt(numpy.clip(featureVar, 0, 1e10))

    def featureCount(self, feature, patterns = None) :
        """Return the number of patterns in which the given feature is nonzero."""

        if patterns is None :
            patterns = range(len(self))

        count = 0
        for p in patterns :
            # fix: the original referenced an undefined global 'data'
            if self.X[p][feature] != 0 : count += 1

        return count

    def featureCounts(self, patterns = None) :
        """Return an array with, for each feature, the number of patterns
        in which it is nonzero."""

        if patterns is None :
            patterns = range(len(self))

        counts = numpy.zeros(self.numFeatures)
        for i in patterns :
            # fix: the original referenced an undefined global 'data'
            counts += numpy.not_equal(self.X[i], 0)

        return counts

    def csvwrite(self, fileName, delim = ' ', idCol = -1) :
        """Write the data matrix and labels to a delimited text file; for
        two-class data the labels are written as -1/+1.
        The idCol parameter is currently unused (kept for interface
        compatibility)."""

        fileHandle = open(fileName, 'w')
        if self.labels.numClasses == 2 :
            Y = [self.labels.Y[i] * 2 - 1 for i in range(len(self))]
        else :
            Y = self.labels.Y

        for i in range(len(self)) :
            outstr = ''
            for j in range(self.numFeatures) :
                outstr += str(self.X[i][j]) + delim
            fileHandle.write(outstr + str(Y[i]) + '\n')
        fileHandle.close()