
Source Code for Module PyML.feature_selection.featsel

from PyML.containers import labels
from PyML.classifiers import svm
from PyML.containers import ker
from PyML.utils import myio, misc
from PyML.evaluators import assess

import numpy
import random

'''classes for performing feature selection'''

__docformat__ = "restructuredtext en"
class FeatureSelector (object) :
    '''api for feature selection objects'''

    type = 'featureSelector'

    def select(self, data, *options, **args) :
        """
        invokes ``selectFeatures`` to find predictive features and eliminates
        the rest of the features from the input dataset
        """

        features = self.selectFeatures(data, *options, **args)
        print '*** number of features: *** ', len(features)
        data.keepFeatures(features)

    def selectFeatures(self, data, *options, **args) :
        """
        :Returns:
          a list of predictive features
        """
        raise NotImplementedError

    def score(self, data, **args) :
        """
        :Returns:
          a score for each feature in the input dataset
        """
        raise NotImplementedError

    def rank(self, data, **args) :
        """
        :Returns:
          a ranking of the features in the dataset by converting the scores
          to ranks
        """
        scores = self.score(data, **args)

        return weights2ranks(scores, data)

    def test(self, data, *options, **args) :

        pass

    train = select
class OneAgainstRestSelect (FeatureSelector) :
    '''Use a two-class feature selection method for a multi-class problem
    by doing feature selection in a one-against-the-rest manner, and
    returning the union of all the features selected.

    Construction::

      OneAgainstRestSelect(featureSelector) -- featureSelector is either
      a OneAgainstRestSelect object for copy construction, or a featureSelector
      object
    '''

    def __init__(self, featureSelector) :

        if (not hasattr(featureSelector, 'type') or
            featureSelector.type != 'featureSelector') :
            raise ValueError, 'need a feature selector as input'

        if featureSelector.__class__ == self.__class__ :
            self.featureSelector = featureSelector.featureSelector.__class__(
                featureSelector.featureSelector)
        else :
            self.featureSelector = featureSelector.__class__(featureSelector)

    def selectFeatures(self, data, *options, **args) :

        labels = data.labels

        features = []
        for k in range(data.labels.numClasses) :
            data2 = labels.oneAgainstRest(data, k)
            features2 = self.featureSelector.selectFeatures(data2)
            features = misc.union(features, features2)
            data.attachLabels(labels)

        return features
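# Usage sketch (not part of the original source): how OneAgainstRestSelect might
# be driven.  The dataset name 'data' and the choice of the wrapped selector are
# illustrative assumptions only.
#
#   selector = OneAgainstRestSelect(Filter(FeatureScore('golub'), numFeatures = 50))
#   selector.select(data)   # keeps the union of the per-class feature sets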
class RFE (FeatureSelector) :

    '''
    RFE (Recursive Feature Elimination) uses the vector *w* of an SVM for
    feature selection.

    The method alternates between training a linear SVM and removing the features
    with the smallest value of the weight vector.

    You can either choose the number of features or let RFE choose the number
    of features automatically; this is chosen as the minimal number of features
    such that the number of support vectors is within one standard deviation
    from the minimum number of support vectors.

    Reference:

    I. Guyon and J. Weston and S. Barnhill and V. Vapnik.
    Gene selection for cancer classification using support vector machines.
    Machine Learning 46:389-422, 2002.
    '''
    def initialize(self, data) :

        self.data = data.__class__(data, deepcopy = 1)
        if self.selectNumFeatures :
            self.featureLists = [data.featureID]
        #self.features = range(data.numFeatures)

        #self.featureLists = []
        self.wList = []
        self.numSV = []
    def __init__(self, arg = None, **settings) :

        """
        :Keywords:
          - `numFeatures` - perform backward elimination until this many features
            are left (default: 20)
          - `mode` - values: 'byFraction' or 'byNum' (default: 'byFraction')
          - `numToEliminate` - specifies the number of features to eliminate at each
            iteration in the byNum mode
          - `fractionToEliminate` - the fraction of features to eliminate at each
            iteration in the byFraction mode (default: 0.05)
          - `autoSelect` [True] - whether the number of features should be chosen
            automatically
          - `useScore` - whether to modulate the vector w by the golub coefficient
            as in RSVM
        """

        self.selectNumFeatures = True
        self.fractionToEliminate = 0.05
        self.numToEliminate = 10
        self.mode = 'byFraction'   # values: 'byFraction' or 'byNum'
        self.numFeatures = 20
        self.featureScore = FeatureScore('golub')
        self.useScore = False
        self.rankFeatures = False

        if arg is None :
            self.svm = svm.SVM()
        elif arg.__class__ == self.__class__ :
            other = arg
            self.fractionToEliminate = other.fractionToEliminate
            self.numToEliminate = other.numToEliminate
            self.mode = other.mode
            self.numFeatures = other.numFeatures
            self.selectNumFeatures = other.selectNumFeatures
            self.useScore = other.useScore
            self.svm = other.svm.__class__(other.svm)
        elif arg.__class__.__name__ == 'SVM' :
            self.svm = arg.__class__(arg)
        else :
            raise ValueError, 'unknown type of argument for RFE ' + str(arg)

        if 'mode' in settings :
            self.mode = settings['mode']
        if 'numToEliminate' in settings :
            self.numToEliminate = settings['numToEliminate']
        if 'numFeatures' in settings :
            self.numFeatures = settings['numFeatures']
        if 'fractionToEliminate' in settings :
            self.fractionToEliminate = settings['fractionToEliminate']
        if 'autoSelect' in settings :
            self.selectNumFeatures = settings['autoSelect']
        if 'useScore' in settings :
            self.useScore = settings['useScore']
    def __repr__(self) :

        rep = '<' + self.__class__.__name__ + ' instance>\n'
        rep += 'mode: ' + self.mode + '\n'
        if self.mode == "byNum" :
            rep += 'number of features to eliminate each iteration : %d\n' \
                   % self.numToEliminate
        elif self.mode == "byFraction" :
            rep += 'Fraction to eliminate each iteration : %f\n' \
                   % self.fractionToEliminate
        rep += 'target number of features : %d\n' % self.numFeatures
        rep += 'automatic selection of the number of features : %d' % \
               self.selectNumFeatures

        return rep

    def __iter__(self) :

        return self
    def getFeatures(self, w, numFeatures) :

        if self.mode == 'byNum' :
            numToElim = min(self.numToEliminate,
                            numFeatures - self.numFeatures)
        elif self.mode == 'byFraction' :
            numToElim = min(int(self.fractionToEliminate * len(w)),
                            numFeatures - self.numFeatures)
        else :
            raise ValueError, 'invalid elimination mode'

        if numToElim == 0 : numToElim = 1
        print 'numFeaturesToEliminate: ', numToElim

        # convert a sparse (dictionary) weight vector into an array :
        if type(w) == type({}) :
            w2 = numpy.zeros(numFeatures, numpy.float)
            for wKey in w.keys() :
                w2[wKey] = w[wKey]
            w = w2

        w = numpy.absolute(w)

        if self.useScore :
            w = w * self.featureScore.score(self.data)

        numZero = numpy.sum(numpy.equal(w, 0))
        if numZero > numToElim : numToElim = numZero

        I = numpy.argsort(w)
        featuresToEliminate = I[:numToElim]

        self.features = I[numToElim:]
        self.w = w

        return featuresToEliminate
    def next(self) :

        data = self.data

        if data.numFeatures <= self.numFeatures :
            raise StopIteration

        self.svm.train(data)

        #self.wList.append(self.svm.model.w)
        self.numSV.append(self.svm.model.numSV)

        featuresToEliminate = self.getFeatures(self.svm.model.warray,
                                               data.numFeatures)
        if self.rankFeatures :
            if len(self.weights) == 0 :
                maxWeight = 0
            else :
                maxWeight = max(self.weights.values())
            for feature in featuresToEliminate :
                self.weights[data.featureID[feature]] = self.w[feature] + maxWeight

        data.eliminateFeatures(featuresToEliminate)
        print '** numFeatures: ', data.numFeatures

        if self.selectNumFeatures :
            self.featureLists.append(data.featureID)

    def run(self, data, *options, **args) :

        if data.labels.numClasses != 2 :
            raise ValueError, 'RFE supports only two class problems'

        self.initialize(data)
        features = data.featureID[:]

        rfeIter = iter(self)
        for f in rfeIter : pass

        if self.selectNumFeatures :
            #minNumSV = min(self.numSV) + \
            #    MLab.std(numpy.take(self.numSV, numpy.nonzero(
            #        numpy.less(self.numSV, numpy.average(self.numSV)))))
            #minNumSV = min(self.numSV) + 2
            minNumSV = len(self.data) + 1
            #for i in range(len(self.numSV)-1, -1, -1) :
            for i in range(len(self.numSV)) :
                print 'numSV', self.numSV[i], minNumSV
                if self.numSV[i] < minNumSV :
                    minNumSV = self.numSV[i]
                    features = self.featureLists[i]
            #print 'optimal number of features', len(features)
            #print features
        self.features = data.featureNames2IDs(features)

    def selectFeatures(self, data, *options, **args) :

        self.run(data, *options, **args)

        return self.features

    def rank(self, data, *options, **args) :

        self.rankFeatures = True
        self.weights = {}

        self.run(data, *options, **args)

        # add the weights from the features that remain:
        if len(self.weights) == 0 :
            maxWeight = 0
        else :
            maxWeight = max(self.weights.values())
        print data.numFeatures
        for feature in range(self.data.numFeatures) :
            self.weights[self.data.featureID[feature]] = self.w[feature] + maxWeight

        weights = [self.weights[data.featureID[i]]
                   for i in range(data.numFeatures)]
        I = numpy.argsort(weights)

        #ranks = [data.featureID[i] for i in I]

        return weights2ranks(weights, data)
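# Usage sketch (not part of the original source): typical RFE invocations.
# 'data' stands for a two-class PyML dataset; the parameter values are
# illustrative assumptions, not recommendations.
#
#   rfe = RFE(numFeatures = 20, mode = 'byFraction', fractionToEliminate = 0.1)
#   rfe.select(data)            # eliminates features from data in place
#   ranked = RFE().rank(data)   # feature IDs ordered by importance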
class MultiplicativeUpdate (FeatureSelector) :
    '''Multiplicative update uses the vector w of an SVM to do feature selection.
    At each iteration an svm is trained and the data is multiplied by the
    weight vector of the classifier.

    Reference:

    J. Weston, A. Elisseeff, M. Tipping and B. Scholkopf.
    Use of the zero norm with linear models and kernel methods.
    JMLR special Issue on Variable and Feature selection, 2002.
    '''

    def __init__(self, arg = None, **settings) :

        self.eps = 0.01
        self.rankFeatures = False

        if arg.__class__ == self.__class__ :
            other = arg
            self.eps = other.eps
            self.rankFeatures = other.rankFeatures
        elif arg.__class__.__name__ == 'SVM' :
            self.svm = arg.__class__(arg)

        if 'eps' in settings :
            self.eps = settings['eps']
    def __repr__(self) :

        rep = '<' + self.__class__.__name__ + ' instance>\n'
        rep += 'epsilon : %f\n' % self.eps

        return rep
    def __iter__(self) :

        return self

    def initialize(self, data) :

        self.scaleData = data.__class__(data, deepcopy = True)
        if not linearlySeparable(data) :
            print 'not linearly separable!!!!!!!!!!!!!!!!!!!!!!'
            self.svm = svm.SVM(ker.LinearRidge())
        else :
            self.svm = svm.SVM()
            print 'linearly separable**************************'
        self.svm.C = 1000

    def next(self) :

        data = self.scaleData
        self.svm.train(data)
        #w = self.svm.model.w
        w = self.svm.model.warray
        if self.svm.kernel.__class__.__name__ == "LinearRidge" :
            wRidge = 0.0
            for i in range(self.svm.model.numSV) :
                wRidge += self.svm.model.alpha[i] * \
                          self.svm.ridge[self.svm.model.svID[i]]
            wRidge = abs(wRidge)
            for i in range(len(data)) :
                self.svm.ridge[i] *= wRidge

        data.scale(w)
        self.w = w
        print 'scaled'
        wc = numpy.compress(numpy.greater(w, 1e-3), w)

        if numpy.allclose(wc, numpy.ones(len(wc), numpy.float), 0.3) :
            raise StopIteration

    def selectFeatures(self, data, *options, **args) :
        '''XXX for multi-class -- do one against the rest
        and use the absolute value of the average/maximum value of w to rescale
        multi-class
        '''

        if data.labels.numClasses != 2 :
            raise ValueError, 'MU supports only two class problems'

        self.initialize(data)

        muIter = iter(self)
        for f in muIter : pass

        featuresToKeep = numpy.nonzero(numpy.greater(self.w, 1e-3))[0]

        print 'numFeatures', len(featuresToKeep)

        return featuresToKeep
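# Usage sketch (illustrative assumption): the multiplicative-update selector
# needs no parameters beyond the default SVM; 'data' stands for a two-class
# PyML dataset.
#
#   kept = MultiplicativeUpdate().selectFeatures(data)   # indices of surviving features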
class Random (FeatureSelector) :
    '''
    A feature selection method that keeps a random set of features

    Construction::

      Random(numFeatures)
    '''

    def __init__(self, arg1, *options, **settings) :

        if arg1.__class__ == self.__class__ :
            other = arg1
            self.numFeatures = other.numFeatures
        elif type(arg1) == type(1) :
            self.numFeatures = arg1
        else :
            raise ValueError, 'bad argument for Random constructor'

    def __repr__(self) :

        rep = '<' + self.__class__.__name__ + ' instance>\n'
        rep += 'number of features to keep : %d\n' % self.numFeatures

        return rep
    def selectFeatures(self, data, *options, **args) :

        if data.numFeatures <= self.numFeatures :
            # nothing to eliminate -- keep all the features
            return range(data.numFeatures)

        return misc.randsubset(data.numFeatures, self.numFeatures)
class Filter (FeatureSelector) :
    '''
    A simple feature selection method that filters features according
    to a feature score.
    It uses a feature score (instance of FeatureScore) to eliminate
    features in one of three possible modes:

    - keep a specified number of features [default]
    - eliminate all features whose score is below some threshold
    - keep all features whose score is a certain number of standard deviations
      above that obtained using random labels
    '''
    def __init__(self, arg1, *options, **settings) :
        """
        :Keywords:
          - `numFeatures` - keep ``numFeatures`` features with the highest score
          - `threshold` - keep all features with score above the threshold
          - `sigma` - keep features whose score is above the average by this many
            standard deviations
        """

        self.sigma = 2.5
        if arg1.__class__ == self.__class__ :
            other = arg1
            self.featureScore = other.featureScore.__class__(other.featureScore)
            self.numFeatures = other.numFeatures
            self.mode = other.mode
            self.numRand = other.numRand
            self.sigma = other.sigma
            try :
                self.threshold = other.threshold
            except :
                pass
            try :
                self.significance = other.significance
            except :
                pass
            try :
                self.numFeatures = other.numFeatures
            except :
                pass
        elif hasattr(arg1, 'score') :
            self.featureScore = arg1
            self.mode = "byNum"
            self.numFeatures = 20
            self.numRand = 20
            if 'numFeatures' in settings :
                self.numFeatures = settings['numFeatures']
                self.mode = "byNum"
            if 'sigma' in settings :
                self.sigma = settings['sigma']
                self.mode = "bySignificance"
            if 'threshold' in settings :
                self.threshold = settings['threshold']
                self.mode = "byThreshold"
        else :
            raise ValueError, 'bad argument for Filter constructor'
    def __repr__(self) :

        rep = '<' + self.__class__.__name__ + ' instance>\n'
        rep += 'mode: ' + self.mode + '\n'
        if self.mode == "byNum" :
            rep += 'number of features to keep : %d\n' % self.numFeatures
        elif self.mode == "bySignificance" :
            rep += 'sigma : %f\n' % self.sigma
        elif self.mode == "byThreshold" :
            rep += 'score threshold for keeping features : %f\n' % self.threshold
        rep += self.featureScore.__repr__()

        return rep

    def selectFeatures(self, data, targetClass = None, otherClass = None,
                       *options, **args) :

        s = self.featureScore.score(data, targetClass, otherClass, **args)

        if self.mode == "byNum" :
            featuresToEliminate = numpy.argsort(s)\
                [:data.numFeatures - self.numFeatures]
        elif self.mode == "byThreshold" :
            featuresToEliminate = numpy.nonzero(numpy.less(s, self.threshold))[0]
        elif self.mode == "bySignificance" :
            t = self.significanceThreshold(data)
            self.thresholds = t
            featuresToEliminate = numpy.nonzero(numpy.less(s, t))[0]
        else :
            raise ValueError, 'unknown elimination mode in filter'

        print 'eliminating ', len(featuresToEliminate), ' features'

        return misc.setminus(range(data.numFeatures), featuresToEliminate)

    def significanceThreshold(self, data) :

        s = numpy.zeros((self.numRand, data.numFeatures), numpy.float)

        for i in range(self.numRand) :
            Y = labels.randomLabels(data.labels.Y)
            s[i] = self.featureScore.score(data, Y = Y)

        #t = [misc.inverseCumulative(s[:,j], self.significance)
        #     for j in range(data.numFeatures)]
        #print t
        #print max(t)
        t = s.mean() + self.sigma * s.std()
        return t
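# Usage sketch (illustrative, not from the original source): three ways of
# constructing a Filter around a FeatureScore; 'data' is an assumed PyML dataset.
#
#   Filter(FeatureScore('golub'), numFeatures = 100).select(data)   # keep the top 100
#   Filter(FeatureScore('roc'), threshold = 0.7).select(data)       # score cutoff
#   Filter(FeatureScore('golub'), sigma = 2.5).select(data)         # random-label test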
def parseArgs(data, targetClass, otherClass = None, **args) :
    '''parse arguments for a feature scoring function'''

    if 'feature' in args :
        feature = args['feature']
    else :
        feature = None
    if 'Y' in args :
        Y = args['Y']
        if otherClass is None :
            otherI = numpy.nonzero(numpy.not_equal(Y, targetClass))[0]
        else :
            otherI = numpy.nonzero(numpy.equal(Y, otherClass))[0]
        targetClassSize = numpy.sum(numpy.equal(Y, targetClass))
    else :
        Y = None
        if otherClass is None :
            otherI = numpy.nonzero(numpy.not_equal(data.labels.Y, targetClass))[0]
        else :
            otherI = data.labels.classes[otherClass]
        targetClassSize = len(data.labels.classes[targetClass])

    otherClassSize = len(otherI)

    return Y, targetClassSize, otherClassSize, otherI, feature
def singleFeatureSuccRate(data, targetClass, otherClass = None, **args) :

    Y, targetClassSize, otherClassSize, otherI, feature = parseArgs(
        data, targetClass, otherClass, **args)

    if Y is None : Y = data.labels.Y
    if data.__class__.__name__ != 'DataSet' :
        raise ValueError, 'data should be of type DataSet'

    Xsort = numpy.sort(data.X, 0)
    d = data.numFeatures
    n = len(data)
    Isort = numpy.argsort(data.X, 0)
    print Isort
    print Y
    succRate = numpy.zeros(d, numpy.float)
    threshold = numpy.zeros(d, numpy.float)
    num1 = numpy.sum(numpy.equal(Y, 1))
    num0 = n - num1

    for i in range(d) :
        succRate[i] = 0
        num0below = 0
        num1below = 0
        for j in range(0, n - 1) :
            if Y[Isort[j][i]] == 1 :
                num1below += 1
            else :
                num0below += 1
            num0above = num0 - num0below
            num1above = num1 - num1below
            currSuccRate = float(max(num0above + num1below, num0below + num1above)) / \
                           float(n)
            if currSuccRate > succRate[i] :
                succRate[i] = currSuccRate
                threshold[i] = (Xsort[j][i] + Xsort[j + 1][i]) / 2

    return succRate, threshold
def predictivity(data, targetClass, otherClass = None, **args) :
    '''A feature score for discrete data; the score for feature i is:
    s_i = P(Fi | C1) - P(Fi | C2),
    where P(Fi | C) is the estimated probability of Feature i being nonzero given
    the class variable.
    This is estimated as:
    s_i = # of patterns in target class that have feature i /
          no. of patterns in target class
          -
          # of patterns in other class that have feature i /
          no. of patterns in other class
    '''

    Y, targetClassSize, otherClassSize, otherI, feature = parseArgs(
        data, targetClass, otherClass, **args)

    s1 = numpy.array(featureCount(data, targetClass = targetClass, Y = Y,
                                  feature = feature)) / float(targetClassSize)

    s2 = numpy.array(featureCount(data, I = otherI, Y = Y,
                                  feature = feature)) / float(otherClassSize)

    return (s1 - s2)
def countDiff(data, targetClass, otherClass = None, **args) :
    '''A feature score for discrete data; the score for feature i is:
    s_i = (#(Fi | C) - #(Fi | not C)) / #(C)
    '''

    Y, targetClassSize, otherClassSize, otherI, feature = parseArgs(
        data, targetClass, otherClass, **args)

    s1 = featureCount(data, targetClass = targetClass, Y = Y,
                      feature = feature)

    s2 = featureCount(data, I = otherI, Y = Y,
                      feature = feature)

    s = (s1 - s2) / float(targetClassSize)

    return s
def sensitivity(data, targetClass, otherClass = None, **args) :
    '''A feature score for discrete data
    (alternatively, with a threshold it could be used for continuous data)
    s_i = #(Fi | C) / #(C)
    '''

    Y, targetClassSize, otherClassSize, otherI, feature = parseArgs(
        data, targetClass, otherClass, **args)

    return (featureCount(data, targetClass = targetClass, Y = Y, feature = feature) /
            float(targetClassSize))
def ppv(data, targetClass, otherClass = None, **args) :
    '''A feature score for discrete data
    s_i = #(Fi | C) / #(Fi)
    '''

    Y, targetClassSize, otherClassSize, otherI, feature = parseArgs(
        data, targetClass, otherClass, **args)

    s1 = featureCount(data, targetClass = targetClass, Y = Y, feature = feature)

    s2 = featureCount(data, feature = feature)

    numpy.putmask(s2, numpy.equal(s2, 0), 1)

    if type(s1) == type(1) :
        return float(s1) / float(s2)
    else :
        return numpy.array(s1, numpy.float) / s2
def ppvThreshold(data, targetClass, otherClass = None, **args) :
    '''A feature score for discrete data
    s_i = #(Fi | C) / #(Fi) if #(Fi | C) > threshold and 0 otherwise
    '''

    Y, targetClassSize, otherClassSize, otherI, feature = parseArgs(
        data, targetClass, otherClass, **args)
    if 'threshold' in args :
        threshold = args['threshold']
    else :
        threshold = 2

    s1 = featureCount(data, targetClass = targetClass, Y = Y, feature = feature)

    numpy.putmask(s1, numpy.less_equal(s1, threshold), 0)

    s2 = featureCount(data, feature = feature)
    # avoid division by 0 :
    numpy.putmask(s2, numpy.equal(s2, 0), 1)

    if type(s1) == type(1) :
        return float(s1) / float(s2)
    else :
        return numpy.array(s1, numpy.float) / s2
def specificity(data, targetClass, otherClass = None, **args) :
    '''A feature score for discrete data
    s_i = #(Fi | C) / #(Fi)

    or perhaps: 1 - #(Fi | not C) / #(not C)
    '''

    Y, targetClassSize, otherClassSize, otherI, feature = parseArgs(
        data, targetClass, otherClass, **args)

    s1 = featureCount(data, targetClass = targetClass, Y = Y, feature = feature)

    s2 = featureCount(data, feature = feature)

    numpy.putmask(s2, numpy.equal(s2, 0), 1)

    if type(s1) == type(1) :
        return float(s1) / float(s2)
    else :
        return numpy.array(s1, numpy.float) / s2
def usefullness(data, targetClass, otherClass = None, **args) :
    '''A feature score for discrete data
    optional arguments:
    threshold
    fraction
    '''

    if 'threshold' in args :
        threshold = args['threshold']
    else :
        threshold = 5
    if 'fraction' in args :
        fraction = args['fraction']
    else :
        fraction = 0.0

    Y, targetClassSize, otherClassSize, otherI, feature = parseArgs(
        data, targetClass, otherClass, **args)

    threshold = max(threshold, fraction * float(targetClassSize))

    s1 = featureCount(data, targetClass = targetClass, Y = Y, feature = feature)

    s2 = featureCount(data, I = otherI, Y = Y,
                      feature = feature) / float(otherClassSize)

    s2 = 1 - s2

    numpy.putmask(s2, numpy.less(s1, threshold), 0.0)

    return s2
def abundance(data, targetClass, otherClass = None, **args) :

    '''Fraction of patterns that have a feature: A(F,C) = #(F | C) / #(C)'''

    Y, targetClassSize, otherClassSize, otherI, feature = parseArgs(
        data, targetClass, otherClass, **args)

    s = featureCount(data, targetClass = targetClass, Y = Y, feature = feature) / \
        float(targetClassSize)

    return s
def oddsRatio(data, targetClass, otherClass = None, **args) :
    '''The odds ratio feature score for discrete data:
    s_i = (#(Fi | C) * #(not Fi | not C)) / (#(Fi | not C) * #(not Fi | C)),
    with pseudo-counts used so that the ratio is always defined.
    '''

    Y, targetClassSize, otherClassSize, otherI, feature = parseArgs(
        data, targetClass, otherClass, **args)

    count1 = numpy.array(featureCount(data, targetClass = targetClass, Y = Y,
                                      feature = feature), numpy.float)
    count2 = numpy.array(featureCount(data, I = otherI, Y = Y,
                                      feature = feature), numpy.float)

    # replace counts of 0 (or counts equal to the class size) by pseudo-counts :
    pseudoCount1 = 1.0 / float(targetClassSize)
    pseudoCount2 = 1.0 / float(otherClassSize)
    numpy.putmask(count1, numpy.equal(count1, 0), pseudoCount1)
    numpy.putmask(count2, numpy.equal(count2, 0), pseudoCount2)
    numpy.putmask(count1, numpy.equal(count1, targetClassSize),
                  targetClassSize - pseudoCount1)
    numpy.putmask(count2, numpy.equal(count2, len(otherI)),
                  len(otherI) - pseudoCount2)

    s = (count1 * (otherClassSize - count2)) / (count2 * (targetClassSize - count1))

    return s
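# Worked example (illustrative assumption, not from the source): with
# targetClassSize = 10, otherClassSize = 20, and a feature present in 8 target
# patterns and 2 'other' patterns, the score is
#   (8 * (20 - 2)) / (2 * (10 - 8)) = 144 / 4 = 36.
# A zero count would instead be replaced by the pseudo-count 1/classSize.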
def logOddsRatio(data, targetClass, otherClass = None, **args) :

    return numpy.log(oddsRatio(data, targetClass, otherClass, **args))
def relief(data) :

    if type(data.X[0]) == type({}) :
        raise ValueError, "Wrong type of dataset"
    if data.labels.numClasses != 2 :
        raise ValueError, 'not a two class problem'

    K = numpy.dot(data.X, numpy.transpose(data.X))

    w = numpy.zeros(data.numFeatures, numpy.float)
    for i in range(len(data)) :
        bestInClass = 0
        simInClass = -1e10
        bestOutOfClass = 0
        simOutOfClass = -1e10
        for j in range(len(data)) :
            if j == i : continue
            if data.labels.Y[i] == data.labels.Y[j] :
                if K[i][j] > simInClass :
                    bestInClass = j
                    simInClass = K[i][j]
            else :
                if K[i][j] > simOutOfClass :
                    bestOutOfClass = j
                    simOutOfClass = K[i][j]
        w += data.X[bestInClass] - data.X[bestOutOfClass]

    return w / len(data)
def golub(data, targetClass, otherClass, **args) :
    '''The Golub feature score:
    s = (mu1 - mu2) / sqrt(sigma1^2 + sigma2^2)
    '''

    if 'Y' in args :
        Y = args['Y']
        targetClassSize = numpy.sum(numpy.equal(Y, targetClass))
        otherClassSize = numpy.sum(numpy.equal(Y, otherClass))
    else :
        Y = None
        targetClassSize = data.labels.classSize[targetClass]
        otherClassSize = data.labels.classSize[otherClass]

    m1 = numpy.array(featureMean(data, targetClass, Y))
    m2 = numpy.array(featureMean(data, otherClass, Y))
    s1 = numpy.array(featureStd(data, targetClass, Y))
    s2 = numpy.array(featureStd(data, otherClass, Y))

    s = numpy.sqrt(s1**2 + s2**2)
    m = (m1 + m2) / 2.0

    # perfect features will have s[i] = 0, so need to take care of that:
    numpy.putmask(s, numpy.equal(s, 0), m)
    # features that are zero will still have s[i] = 0 so :
    numpy.putmask(s, numpy.equal(s, 0), 1)

    g = (m1 - m2) / s

    return g
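# Worked example (illustrative assumption): for a feature with class means
# mu1 = 2.0, mu2 = 1.0 and standard deviations sigma1 = 0.6, sigma2 = 0.8,
# the Golub score is (2.0 - 1.0) / sqrt(0.36 + 0.64) = 1.0.  Larger absolute
# values indicate better separation between the two classes.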
def succ(data, targetClass, otherClass, **args) :
    """the score of feature j is the success rate of a classifier that
    classifies into the target class all points whose value of the feature
    is higher than some threshold (linear 1-d classifier).
    """

    Y = data.labels.Y
    numPos = float(data.labels.classSize[targetClass])
    numNeg = len(data) - numPos
    s = numpy.zeros(data.numFeatures, numpy.float_)
    values = numpy.zeros(data.numFeatures, numpy.float_)
    balanced = False
    if 'balanced' in args :
        balanced = args['balanced']
    #negFrac = float(numNeg) / float(len(data))
    #posFrac = float(numPos) / float(len(data))
    for j in range(data.numFeatures) :
        feat = data.getFeature(j)
        I = numpy.argsort(feat)
        feat = numpy.sort(feat)
        posBelow = 0
        negBelow = 0
        for i in range(len(data)) :
            if Y[I[i]] == targetClass :
                posBelow += 1
            else :
                negBelow += 1
            # the following if statement takes into account
            # discrete data. in that case the decision is made only
            # when the feature changes its value
            if i < len(data) - 1 and feat[i] != feat[i + 1] :
                if balanced :
                    succRate = max(posBelow / numPos + (numNeg - negBelow) / numNeg,
                                   (numPos - posBelow) / numPos + negBelow / numNeg)
                else :
                    succRate = max(posBelow + (numNeg - negBelow),
                                   (numPos - posBelow) + negBelow)
                if succRate > s[j] :
                    s[j] = succRate
                    values[j] = feat[i]

    if not balanced :
        s = s / len(data)
    else :
        s = s / 2.0

    if 'getValues' in args and args['getValues'] :
        return s, values
    else :
        return s
def balancedSucc(data, targetClass, otherClass, **args) :
    """the same as ``succ``, but the success rate of the 1-d classifier is
    balanced, i.e. averaged over the two classes rather than computed over
    all patterns.
    """

    return succ(data, targetClass, otherClass, **{'balanced' : True})
def roc(data, targetClass, otherClass, **args) :

    rocN = None
    if 'rocN' in args :
        rocN = args['rocN']
    s = numpy.zeros(data.numFeatures, numpy.float_)
    for i in range(data.numFeatures) :
        featureValues = data.getFeature(i)
        s[i] = assess.roc(None, data.labels.Y, featureValues, rocN, targetClass)[2]
        #featureValues = - numpy.array(featureValues)
        #s[i] = max(s[i], assess.roc(None, data.labels.Y, featureValues, rocN)[2])
        # note: taking the maximum of the roc score and its negative is the same
        # as exchanging the labels, which is done by the FeatureScore anyhow.

    return s
def featureCount(data, *options, **args) :
    '''
    returns a vector where component i gives the number of patterns where
    feature i is nonzero
    INPUTS:
    data - a dataset
    targetClass - class for which to count (optional, default behavior is
      to look at all patterns)
    Y - alternative label vector (optional)
    feature - either a feature or list of features - counts the number of
      patterns for which the feature or list of features is non-zero
    I - a list of indices on which to do feature count
    OPTIONS:
    "complement" - look at the complement of the target class
    '''

    singleFeature = 0
    if 'feature' in args and args['feature'] is not None :
        feature = args['feature']
        singleFeature = 1
        featureCount = 0
    else :
        featureCount = numpy.zeros(data.numFeatures)

    if 'Y' in args and args['Y'] is not None :
        Y = args['Y']
    elif 'labels' in args :
        Y = args['labels'].Y
    elif data.labels.L is not None :
        Y = data.labels.Y

    if "targetClass" in args :
        targetClass = args['targetClass']
        if "complement" in options :
            I = numpy.nonzero(numpy.not_equal(Y, targetClass))[0]
        else :
            I = numpy.nonzero(numpy.equal(Y, targetClass))[0]
    else :
        I = range(len(data))

    if 'I' in args :
        I = args['I']

    if singleFeature :
        featureCount = data.featureCount(feature, I)
    else :
        featureCount = data.featureCounts(I)

    return featureCount
def featureMean(data, targetClass = None, Y = None) :
    '''returns a vector where component i is the mean of feature i
    INPUT:
    data - a dataset
    targetClass - class for which to take the mean (optional)
    Y - alternative label vector (optional)
    '''

    if targetClass is None :
        I = range(len(data))
    elif Y is None :
        I = numpy.nonzero(numpy.equal(data.labels.Y, targetClass))[0]
    else :
        I = numpy.nonzero(numpy.equal(Y, targetClass))[0]

    return data.mean(I)
def featureStd(data, targetClass = None, Y = None) :
    '''returns a vector where component i is the standard deviation of feature i
    INPUT:
    data - a dataset
    targetClass - class for which to take the standard deviation (optional)
    Y - alternative label vector (optional)
    '''

    if targetClass is None :
        I = range(len(data))
    elif Y is None :
        I = numpy.nonzero(numpy.equal(data.labels.Y, targetClass))[0]
    else :
        I = numpy.nonzero(numpy.equal(Y, targetClass))[0]

    if len(I) == 0 :
        return numpy.zeros(data.numFeatures, numpy.float_)

    return data.std(I)
def eliminateSparseFeatures(data, threshold) :
    '''removes from the data features whose feature count is below a threshold
    data - a dataset
    threshold - number of occurrences of the feature below which it will be
      eliminated
    '''

    fCount = featureCount(data)

    below = numpy.nonzero(numpy.less(fCount, threshold))[0]
    data.eliminateFeatures(below)
def nonredundantFeatures(data, w = None) :
    '''Compute a set of nonredundant features for a 0/1 sparse dataset.
    A feature is defined as redundant if there is another feature which has
    nonzero value for exactly the same patterns, and has a larger weight.
    INPUT: a dataset and a list of weights for each feature in the data
      (weights are optional)
    OUTPUT: a list of nonredundant features
    '''

    #data.featureView()

    bestFeature = {}
    featureWeight = {}

    for f in range(data.numFeatures) :
        if f % 100 == 0 :
            print f
        pattern = ''
        for i in range(len(data)) :
            if data.X[i].has_key(f) :
                pattern += '1'
            else :
                pattern += '0'
        if pattern in bestFeature :
            if w is not None :
                if featureWeight[pattern] < w[f] :
                    featureWeight[pattern] = w[f]
                    bestFeature[pattern] = f
        else :
            if w is not None :
                featureWeight[pattern] = w[f]
            bestFeature[pattern] = f

    nonredundant = bestFeature.values()

    return nonredundant
class FeatureScorer (object) :
    """base class for objects that have a 'score' function
    for scoring the features of a dataset
    """

    type = 'featureScorer'

    def score(self) :

        raise NotImplementedError

    train = score

    def test(self, data, *options, **args) :

        pass
class FeatureScore (FeatureSelector) :
    """
    A class for scoring the features of a dataset
    USAGE:
    construction:
      f = FeatureScore(scoreName, mode = modeValue)
    or using copy construction:
      f = FeatureScore(otherFeatureScore)
    scoreName is the type of filter; available filters are:
      "predictivity", "oddsRatio", "golub"
    mode is one of the following:
      oneAgainstOne (default)
      oneAgainstRest
    """

    scoreFuncs = {"predictivity" : predictivity,
                  "oddsRatio" : oddsRatio, "logOddsRatio" : logOddsRatio,
                  "golub" : golub, "countDiff" : countDiff,
                  "usefullness" : usefullness, "abundance" : abundance,
                  "specificity" : specificity, "ppv" : ppv,
                  "ppvThreshold" : ppvThreshold,
                  "succ" : succ,
                  "balancedSucc" : balancedSucc, "roc" : roc}

    # multiClass tells whether a filter function handles multi-class data
    # otherwise, a feature is scored according to the maximum
    # pairwise score between classes

    multiClass = ["IG"]

    # asym tells whether a two-class filter function satisfies :
    # s(F,C1) = - s(F,C2)
    # for such functions a feature is scored as the absolute
    # value of the score when no class is given

    asym = ["predictivity", "logOddsRatio", "golub"]
    def __init__(self, arg1 = None, *options, **args) :

        self.mode = "oneAgainstOne"
        self.scoreName = "predictivity"
        self.scoreFunc = predictivity
        self.minClassSize = 5
        self.bothSides = True

        if arg1.__class__ == self.__class__ :
            other = arg1
            self.mode = other.mode
            self.scoreName = other.scoreName
            self.scoreFunc = other.scoreFunc
            self.bothSides = other.bothSides
        elif arg1.__class__ == ''.__class__ :
            scoreName = arg1
            if scoreName in self.scoreFuncs :
                self.scoreFunc = self.scoreFuncs[scoreName]
            else :
                raise ValueError, 'unknown filter name'
            self.scoreName = scoreName
        elif arg1.__class__.__base__.__name__ == 'FeatureScorer' :
            self.scoreFunc = arg1.score
            self.scoreName = ''

        if 'mode' in args :
            if args['mode'] == "oneAgainstRest" :
                self.mode = "oneAgainstRest"
        if 'minClassSize' in args :
            self.minClassSize = args['minClassSize']
    def __repr__(self) :

        rep = '<' + self.__class__.__name__ + ' instance>\n'
        rep += 'score name : ' + self.scoreName + '\n'
        rep += 'mode : ' + self.mode + '\n'

        return rep
    def score(self, data, *options, **args) :

        if 'targetClass' in args :
            targetClass = args['targetClass']
        else :
            targetClass = None
        if 'otherClass' in args :
            otherClass = args['otherClass']
        else :
            otherClass = None

        if (targetClass is not None and otherClass is not None) or (
            self.scoreName in self.multiClass) :
            return self.scoreFunc(data, targetClass, otherClass, **args)
        elif data.labels.numClasses == 2 :
            return self._score(data, **args)
        elif self.mode == "oneAgainstRest" :
            if targetClass is not None :
                # use a different name so the labels module is not shadowed :
                restLabels = labels.oneAgainstRest(data.labels, targetClass)
                return self._score(data, 1, 0, Y = restLabels.Y)
            else :
                raise ValueError, 'need to specify a target class'
        elif self.mode == 'oneAgainstOne' :
            return self.oneAgainstOne(data, targetClass, **args)

    train = score
    def _score(self, data, class1 = None, class2 = None, **args) :

        if class1 is None and class2 is None :
            class1 = 0
            class2 = 1

        if self.scoreName in self.asym or not self.bothSides :
            s = numpy.absolute(
                self.scoreFunc(data, class1, class2, **args))
        else :
            s = numpy.maximum(
                self.scoreFunc(data, class1, class2, **args),
                self.scoreFunc(data, class2, class1, **args))

        return s
    def oneAgainstOne(self, data, targetClass, **args) :
        '''XXXX change maximum into average or add this as another option'''

        if 'Y' in args :
            Y = args['Y']
            classSize = misc.count(Y)
        else :
            classSize = data.labels.classSize

        s = numpy.zeros(data.numFeatures, numpy.float_)

        if targetClass is None :
            for class1 in range(data.labels.numClasses - 1) :
                for class2 in range(class1 + 1, data.labels.numClasses) :
                    if (classSize[class1] > self.minClassSize and
                        classSize[class2] > self.minClassSize) :

                        t = self._score(data, class1, class2, **args)
                        s = numpy.maximum(s, t)

        else :
            for class2 in range(data.labels.numClasses) :
                if class2 != targetClass and classSize[class2] > self.minClassSize :
                    t = self._score(data, targetClass, class2, **args)
                    s = numpy.maximum(s, t)

        return s
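# Usage sketch (illustrative, not from the original source): scoring and ranking
# features directly with a FeatureScore; 'data' is an assumed PyML dataset.
#
#   scorer = FeatureScore('golub', mode = 'oneAgainstRest')
#   scores = scorer.score(data, targetClass = 0)   # one score per feature
#   ranked = weights2ranks(scores, data)           # feature IDs, best first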
class BackwardSelector (FeatureSelector) :
    '''greedy backward elimination: at each step the feature whose removal
    gives the best cross-validation performance is eliminated, until
    ``targetNumFeatures`` features remain.
    '''
    def __init__(self, arg, **args) :

        self.measure = 'successRate'
        self.targetNumFeatures = 2
        if arg.__class__ == self.__class__ :
            self.measure = arg.measure
            self.targetNumFeatures = arg.targetNumFeatures
            self.classifier = arg.classifier.__class__(arg.classifier)
        else :
            self.classifier = arg.__class__(arg)
        if 'targetNumFeatures' in args :
            self.targetNumFeatures = args['targetNumFeatures']
        if 'measure' in args :
            self.measure = args['measure']
    def selectFeatures(self, _data, *options, **args) :

        self.eliminated = []
        self.measures = []
        cvArgs = {}
        import re
        rocExp = re.compile(r"roc(?P<rocN>[0-9]+)area")
        match = rocExp.match(self.measure)
        if match is not None :
            measureStr = 'rocNarea'
            cvArgs['rocN'] = match.groupdict()['rocN']
        else :
            measureStr = self.measure

        print cvArgs
        data = _data.__class__(_data, deepcopy = True)
        for i in range(self.targetNumFeatures, _data.numFeatures) :
            maxScore = 0
            # loop over the CURRENT features
            for feature in range(data.numFeatures) :
                featureName = data.featureID[feature]
                data.eliminateFeatures([feature])
                res = self.classifier.stratifiedCV(data, **cvArgs)
                score = getattr(res, measureStr)
                if score > maxScore :
                    maxScore = score
                    bestFeatureName = featureName
                data = _data.__class__(_data, deepcopy = True)
                data.eliminateFeatures(data.featureNames2IDs(self.eliminated))
            data = _data.__class__(_data, deepcopy = True)
            self.eliminated.append(bestFeatureName)
            data.eliminateFeatures(data.featureNames2IDs(self.eliminated))
            self.measures.append(maxScore)

        return misc.setminus(range(_data.numFeatures),
                             _data.featureNames2IDs(self.eliminated))
def linearlySeparable(data) :
    '''returns True if the data is linearly separable and False otherwise.
    More specifically, it trains a soft margin SVM and checks whether all
    training points are correctly classified.
    '''

    s = svm.SVM(C = 1000)
    s.train(data)
    r = s.test(data)
    r.computeStats()

    successRate = r.get('successRate')
    if successRate == 1 :
        return True
    else :
        return False
def extractNumFeatures(resultsFileName) :

    r = myio.load(resultsFileName)

    numFeatures = {}
    if type(r) == type({}) :
        info = misc.extractAttribute(r, 'foldInfo')
        for key in info :
            numFeat = []
            for lines in info[key] :
                for line in lines.split('\n') :
                    if line.find('number of features') == 0 :
                        numFeat.append(float(line.split(':')[1]))
            numFeatures[key] = numpy.average(numFeat)
    return numFeatures
def weights2ranks(weights, data) :

    if type(weights) == type({}) :
        weights = [weights[data.featureID[i]]
                   for i in range(data.numFeatures)]
    weights = numpy.array(weights)
    I = numpy.argsort(-weights)
    ranks = [data.featureID[i] for i in I]

    return ranks
def featureReport(data, score = 'roc', targetClass = 1, otherClass = 0) :

    if score == 'roc' :
        s = roc(data, targetClass, otherClass)
    elif score == 'golub' :
        s = golub(data, targetClass, otherClass)

    for i in range(data.numFeatures) :
        print data.featureID[i], s[i]