Package PyML :: Package utils :: Module salstat_stats
[hide private]
[frames] | [no frames]

Source Code for Module PyML.utils.salstat_stats

   1  # stats.py - reworked module for statistical analysis using OOP 
   2  """ 
   3  The complete code in this file is copyright 2002 Alan James Salmoni, and is 
   4  released under version 2 or later of the GNU General Public Licence (GPL). 
   5  See the enclosed file COPYING for the full text of the licence. 
   6   
   7  Significant parts of the code were taken from stats.py by Gary Strangman of 
   8  Harvard University (c) Not sure what year, Gary Strangman, released under the  
   9  GNU General Public License.""" 
  10   
import copy
import itertools
import math

#import salstat
  16   
  17  # Short routines used in the functional constructs to reduce analysis time 
  18   
def add(a, b):
    """Return the sum of the two values (a reduce()/map() helper)."""
    total = a + b
    return total
20
def squared(a):
    """Return the square of ``a`` as a float (math.pow result)."""
    return math.pow(a, 2)
22
def cubed(a):
    """Return the cube of ``a`` as a float (math.pow result)."""
    return math.pow(a, 3)
24
def quaded(a):
    """Return the fourth power of ``a`` as a float (math.pow result)."""
    return math.pow(a, 4)
26
def multiply(a, b):
    """Return the product of the two values (a reduce()/map() helper)."""
    product = a * b
    return product
28
def obsMinusExp(a, b):
    """Chi-square cell contribution: squared difference between the
    observed value ``a`` and the expected value ``b``, scaled by ``b``."""
    diff = a - b
    return diff * diff / b
30
def diffsquared(a, b):
    """Return the squared difference of the two values."""
    gap = a - b
    return gap * gap
32
def higher(a, b):
    """Return 1 if ``a`` is strictly greater than ``b``, else 0."""
    return 1 if a > b else 0
38
def lower(a, b):
    """Return 1 if ``a`` is strictly less than ``b``, else 0."""
    return 1 if a < b else 0
44 45
def shellsort(inlist):
    """
    Shellsort algorithm.  Sorts a 1D-list.

    Usage:   shellsort(inlist)
    Returns: sorted-inlist, sorting-index-vector (for original list)
    """
    n = len(inlist)
    svec = copy.deepcopy(inlist)
    # ivec must be a real list: it is assigned to below (under Python 3
    # the original's bare range() object is immutable and crashed here)
    ivec = list(range(n))
    gap = n // 2          # floor division: gap must remain an integer
    while gap > 0:
        for i in range(gap, n):
            for j in range(i - gap, -1, -gap):
                while j >= 0 and svec[j] > svec[j + gap]:
                    # swap the values and, in step, their original indices
                    svec[j], svec[j + gap] = svec[j + gap], svec[j]
                    ivec[j], ivec[j + gap] = ivec[j + gap], ivec[j]
        gap = gap // 2
    # svec is the sorted inlist; svec[i] == inlist[ivec[i]]
    return svec, ivec
70 71
def rankdata(inlist):
    """
    Ranks the data in inlist, dealing with ties appropriately.  Assumes
    a 1D inlist.  Adapted from Gary Perlman's |Stat ranksort.

    Usage:   rankdata(inlist)
    Returns: a list of length equal to inlist, containing rank scores
    """
    n = len(inlist)
    svec, ivec = shellsort(inlist)
    sumranks = 0
    dupcount = 0
    newlist = [0] * n
    for i in range(n):
        sumranks = sumranks + i
        dupcount = dupcount + 1
        # end of a run of tied values (Python 3 fix: the original used the
        # removed `<>` operator here)
        if i == n - 1 or svec[i] != svec[i + 1]:
            # average rank for the whole run of ties (ranks are 1-based)
            averank = sumranks / float(dupcount) + 1
            for j in range(i - dupcount + 1, i + 1):
                newlist[ivec[j]] = averank
            sumranks = 0
            dupcount = 0
    return newlist
95 96
def tiecorrect(rankvals):
    """
    Corrects for ties in Mann Whitney U and Kruskal Wallis H tests.  See
    Siegel, S. (1956) Nonparametric Statistics for the Behavioral Sciences.
    New York: McGraw-Hill.  Code adapted from |Stat rankind.c code.

    Usage:   tiecorrect(rankvals)
    Returns: T correction factor for U or H
    """
    # work on a sorted copy; do not shadow the builtin `sorted` as the
    # original did (the unused `posn` list is also dropped)
    ranks = sorted(rankvals)
    n = len(ranks)
    T = 0.0
    i = 0
    while i < n - 1:
        if ranks[i] == ranks[i + 1]:
            # count the length of this run of tied ranks
            nties = 1
            while i < n - 1 and ranks[i] == ranks[i + 1]:
                nties = nties + 1
                i = i + 1
            T = T + nties ** 3 - nties
        i = i + 1
    T = T / float(n ** 3 - n)
    return 1.0 - T
122 123
def sum(inlist):
    """
    Returns the sum of the items in the passed list.

    Usage:   sum(inlist)
    """
    # accumulate left-to-right, starting from integer zero so that an
    # all-int list yields an int
    total = 0
    for value in inlist:
        total = total + value
    return total
134 135 136 # this is used by the single factor anova routines (only I think) & the SS 137 # value may not actually be needed!
def minimaldescriptives(inlist):
    """this function takes a clean list of data and returns the N, sum, mean
    and sum of squares. """
    N = len(inlist)
    total = 0.0
    SS = 0.0
    for value in inlist:
        total = total + value
        SS = SS + value ** 2
    # an empty list raises ZeroDivisionError here, as in the original
    mean = total / float(N)
    return N, total, mean, SS
150 151 152 ########################### 153 ## Probability functions ## 154 ########################### 155
def chisqprob(chisq,df):
    """
    Returns the (1-tailed) probability value associated with the provided
    chi-square value and df.  Adapted from chisq.c in Gary Perlman's |Stat.

    Usage:   chisqprob(chisq,df)
    """
    BIG = 20.0
    def ex(x):
        # exp() that underflows to 0.0 for very negative arguments
        # instead of wasting time (or overflowing) in math.exp
        BIG = 20.0
        if x < -BIG:
            return 0.0
        else:
            return math.exp(x)

    # a non-positive chi-square (or nonsense df) can never be significant
    if chisq <=0 or df < 1:
        return 1.0
    a = 0.5 * chisq
    if df%2 == 0:
        even = 1
    else:
        even = 0
    if df > 1:
        y = ex(-a)
    # seed the series: exp(-chisq/2) for even df, the normal-curve tail
    # probability for odd df
    if even:
        s = y
    else:
        s = 2.0 * zprob(-math.sqrt(chisq))
    if (df > 2):
        chisq = 0.5 * (df - 1.0)
        if even:
            z = 1.0
        else:
            z = 0.5
        if a > BIG:
            # large chi-square: accumulate the series in log space to
            # avoid floating-point overflow
            if even:
                e = 0.0
            else:
                e = math.log(math.sqrt(math.pi))
            c = math.log(a)
            while (z <= chisq):
                e = math.log(z) + e
                s = s + ex(c*z-a-e)
                z = z + 1.0
            return s
        else:
            # moderate chi-square: direct series summation
            if even:
                e = 1.0
            else:
                e = 1.0 / math.sqrt(math.pi) / math.sqrt(a)
            c = 0.0
            while (z <= chisq):
                e = e * (a/float(z))
                c = c + e
                z = z + 1.0
            return (c*y+s)
    else:
        return s
def inversechi(prob, df):
    """This function calculates the inverse of the chi square function. Given
    a p-value and a df, it should approximate the critical value needed to
    achieve these functions. Adapted from Gary Perlmans critchi function in
    C. Apologies if this breaks copyright, but no copyright notice was
    attached to the relevant file."""
    CHI_EPSILON = 0.000001
    CHI_MAX = 99999.0
    # degenerate probabilities map straight to the extremes
    if prob <= 0.0:
        return CHI_MAX
    if prob >= 1.0:
        return 0.0
    # bisection search on chisqprob between 0 and CHI_MAX
    lo = 0.0
    hi = CHI_MAX
    chisqval = df / math.sqrt(prob)   # starting guess
    while (hi - lo) > CHI_EPSILON:
        if chisqprob(chisqval, df) < prob:
            hi = chisqval
        else:
            lo = chisqval
        chisqval = (hi + lo) * 0.5
    return chisqval
236
def erfcc(x):
    """
    Returns the complementary error function erfc(x) with fractional
    error everywhere less than 1.2e-7.  Adapted from Numerical Recipies.

    Usage:   erfcc(x)
    """
    z = abs(x)
    t = 1.0 / (1.0 + 0.5 * z)
    # Chebyshev fit evaluated by Horner's rule, innermost coefficient first
    poly = 0.17087277
    for coeff in (-0.82215223, 1.48851587, -1.13520398, 0.27886807,
                  -0.18628806, 0.09678418, 0.37409196, 1.00002368):
        poly = coeff + t * poly
    ans = t * math.exp(-z * z - 1.26551223 + t * poly)
    # the fit covers x >= 0; use the symmetry erfc(-x) = 2 - erfc(x)
    return ans if x >= 0 else 2.0 - ans
255 256
def zprob(z):
    """
    Returns the area under the normal curve 'to the left of' the given z value.
    Thus, 
        for z<0, zprob(z) = 1-tail probability
        for z>0, 1.0-zprob(z) = 1-tail probability
        for any z, 2.0*(1.0-zprob(abs(z))) = 2-tail probability
    Adapted from z.c in Gary Perlman's |Stat.

    Usage:   zprob(z)
    """
    Z_MAX = 6.0    # maximum meaningful z-value
    if z == 0.0:
        x = 0.0
    else:
        y = 0.5 * math.fabs(z)
        if y >= (Z_MAX*0.5):
            # beyond +/- 6 sd the enclosed area is numerically 1
            x = 1.0
        elif (y < 1.0):
            # polynomial fit for the central region (|z| < 2)
            w = y*y
            x = ((((((((0.000124818987 * w
                -0.001075204047) * w +0.005198775019) * w
              -0.019198292004) * w +0.059054035642) * w
              -0.151968751364) * w +0.319152932694) * w
              -0.531923007300) * w +0.797884560593) * y * 2.0
        else:
            # polynomial fit for the tail region (2 <= |z| < 6)
            y = y - 2.0
            x = (((((((((((((-0.000045255659 * y
                 +0.000152529290) * y -0.000019538132) * y
                 -0.000676904986) * y +0.001390604284) * y
                 -0.000794620820) * y -0.002034254874) * y
                 +0.006549791214) * y -0.010557625006) * y
                 +0.011630447319) * y -0.009279453341) * y
                 +0.005353579108) * y -0.002141268741) * y
                 +0.000535310849) * y +0.999936657524
    # x now holds the two-tailed area between -|z| and |z|; convert it
    # to the left-tail area for the signed z
    if z > 0.0:
        prob = ((x+1.0)*0.5)
    else:
        prob = ((1.0-x)*0.5)
    return prob
297 298
def ksprob(alam):
    """
    Computes a Kolmolgorov-Smirnov t-test significance level.  Adapted from
    Numerical Recipies.

    Usage:   ksprob(alam)
    """
    a2 = -2.0 * alam * alam
    sign = 2.0          # alternating +2 / -2 series factor
    total = 0.0
    prev = 0.0          # |previous term|, used by the convergence test
    for j in range(1, 201):
        term = sign * math.exp(a2 * j * j)
        total = total + term
        # stop once the terms have shrunk enough relative to the last
        # term or to the running sum
        if math.fabs(term) <= (0.001 * prev) or math.fabs(term) < (1.0e-8 * total):
            return total
        sign = -sign
        prev = math.fabs(term)
    return 1.0  # Get here only if fails to converge; was 0.0!!
318 319
def fprob(dfnum, dfden, F):
    """
    Returns the (1-tailed) significance level (p-value) of an F
    statistic given the degrees of freedom for the numerator (dfR-dfF) and
    the degrees of freedom for the denominator (dfF).

    Usage:   fprob(dfnum, dfden, F) where usually dfnum=dfbn, dfden=dfwn
    """
    # the F tail probability is an incomplete beta evaluated at this point
    x = dfden / float(dfden + dfnum * F)
    return betai(0.5 * dfden, 0.5 * dfnum, x)
330
def inversef(prob, df1, df2):
    """This function returns the f value for a given probability and 2 given
    degrees of freedom. It is an approximation using the fprob function.
    Adapted from Gary Perlmans critf function - apologies if copyright is
    broken, but no copyright notice was attached """
    F_EPSILON = 0.000001
    # degenerate probabilities cannot be inverted
    if prob <= 0.0 or prob >= 1.0:
        return 0.0
    # bisection search between 0 and a very large F value
    lo = 0.0
    hi = 9999.0
    fval = 1.0 / prob          # starting guess
    while abs(hi - lo) > F_EPSILON:
        # NOTE(review): fprob is called as fprob(fval, df1, df2) although
        # its signature is fprob(dfnum, dfden, F) - preserved from the
        # original; confirm the intended argument order
        if fprob(fval, df1, df2) < prob:
            hi = fval
        else:
            lo = fval
        fval = (hi + lo) * 0.5
    return fval
349 350
def betacf(a,b,x):
    """
    This function evaluates the continued fraction form of the incomplete
    Beta function, betai.  (Adapted from: Numerical Recipies in C.)

    Usage:   betacf(a,b,x)
    """
    ITMAX = 200     # maximum number of continued-fraction iterations
    EPS = 3.0e-7    # relative accuracy at which to declare convergence

    bm = az = am = 1.0
    qab = a+b
    qap = a+1.0
    qam = a-1.0
    bz = 1.0-qab*x/qap
    for i in range(ITMAX+1):
        em = float(i+1)
        tem = em + em
        # even step of the continued-fraction recurrence
        d = em*(b-em)*x/((qam+tem)*(a+tem))
        ap = az + d*am
        bp = bz+d*bm
        # odd step of the continued-fraction recurrence
        d = -(a+em)*(qab+em)*x/((qap+tem)*(a+tem))
        app = ap+d*az
        bpp = bp+d*bz
        aold = az
        # renormalise so the partial numerators/denominators stay bounded
        am = ap/bpp
        bm = bp/bpp
        az = app/bpp
        bz = 1.0
        if (abs(az-aold)<(EPS*abs(az))):
            return az    # converged
    # NOTE(review): falls through (implicitly returning None) when the
    # fraction fails to converge within ITMAX iterations
    #print 'a or b too big, or ITMAX too small in Betacf.'
def gammln(xx):
    """
    Returns the gamma function of xx.
        Gamma(z) = Integral(0,infinity) of t^(z-1)exp(-t) dt.
    (Adapted from: Numerical Recipies in C.)

    Usage:   gammln(xx)
    """
    x = xx - 1.0
    tmp = x + 5.5
    tmp = tmp - (x + 0.5) * math.log(tmp)
    # Lanczos series accumulated one coefficient at a time
    ser = 1.0
    for coeff in (76.18009173, -86.50532033, 24.01409822,
                  -1.231739516, 0.120858003e-2, -0.536382e-5):
        x = x + 1
        ser = ser + coeff / x
    return -tmp + math.log(2.50662827465 * ser)
404 405
def betai(a, b, x):
    """
    Returns the incomplete beta function:

        I-sub-x(a,b) = 1/B(a,b)*(Integral(0,x) of t^(a-1)(1-t)^(b-1) dt)

    where a,b>0 and B(a,b) = G(a)*G(b)/(G(a+b)) where G(a) is the gamma
    function of a.  The continued fraction formulation is implemented here,
    using the betacf function.  (Adapted from: Numerical Recipies in C.)

    Usage:   betai(a,b,x)
    """
    if x < 0.0 or x > 1.0:
        # Python 3 fix: the original used the Python 2 `raise E, msg` form
        raise ValueError('Bad x in lbetai')
    if x == 0.0 or x == 1.0:
        bt = 0.0
    else:
        # the prefactor B(a,b)^-1 * x^a * (1-x)^b, computed in log space
        bt = math.exp(gammln(a + b) - gammln(a) - gammln(b) +
                      a * math.log(x) + b * math.log(1.0 - x))
    # use the continued fraction directly where it converges fastest,
    # otherwise via the symmetry relation
    if x < (a + 1.0) / (a + b + 2.0):
        return bt * betacf(a, b, x) / float(a)
    else:
        return 1.0 - bt * betacf(b, a, 1.0 - x) / float(b)
429 430 431 432 ########################### 433 ## Test Classes ## 434 ########################### 435 436 437 """ class for continuous descriptive statistics. The variable "inlist" is 438 passed after being cleaned of missing data""" 439
class FullDescriptives:
    """Full univariate descriptive statistics.

    `inlist` must be a non-empty list of numbers already cleaned of
    missing data.  Everything is computed in __init__ and stored as
    attributes (N, sum, mean, median, mode, variance, stddev, stderr,
    skewness, kurtosis, ...).

    Python 3 fixes vs. the original: reduce()/Python-2 map idioms are
    replaced, and all list indices use integer (floor) division.
    """

    def __init__(self, inlist, name='', missing=0):
        """
        inlist  -- list of numeric scores (no missing data)
        name    -- optional label for the variable
        missing -- count of missing values removed upstream (stored only)
        """
        self.Name = name
        self.missing = missing
        self.N = len(inlist)
        self.sum = sum(inlist)
        try:
            self.mean = self.sum / float(self.N)
        except ZeroDivisionError:
            self.mean = 0.0
        # sum of squared raw scores; math.pow keeps the all-float result
        # the original squared() helper produced
        self.sumsquares = sum(math.pow(v, 2) for v in inlist)
        self.sortlist = sorted(inlist)
        self.minimum = self.sortlist[0]
        self.maximum = self.sortlist[-1]
        self.range = self.maximum - self.minimum
        # middle element (odd N) or the upper of the two middle elements
        # (even N).  BUG FIX: the original indexed with (N+1)/2 for odd N,
        # which crashed for N == 1 and was one position too high otherwise.
        self.median = self.sortlist[self.N // 2]
        # median of ranks - useful in comparisons for KW & Friedmans
        ranklist = rankdata(self.sortlist)
        self.medianranks = ranklist[self.N // 2]
        # total absolute deviation about the median.  BUG FIX: the
        # original summed *signed* deviations, which largely cancel.
        self.mad = sum(abs(v - self.median) for v in inlist)
        # frequency table: value -> number of occurrences (replaces the
        # original O(n^2) uniqueness scan)
        freq = {}
        for v in inlist:
            freq[v] = freq.get(v, 0) + 1
        self.numberuniques = sum(1 for c in freq.values() if c == 1)
        # harmonic mean over the non-zero scores; zeros are skipped but N
        # still counts them - preserved from the original
        recip = sum(1.0 / v for v in inlist if v != 0.0)
        self.harmmean = self.N / recip if recip != 0.0 else 0.0
        difflist = [v - self.mean for v in inlist]
        self.ssdevs = sum(math.pow(d, 2) for d in difflist)
        # NOTE(review): product of the deviations, not a true geometric
        # mean - preserved from the original
        prod = 1.0
        for d in difflist:
            prod = prod * d
        self.geomean = prod
        try:
            self.samplevar = self.ssdevs / float(self.N - 1)
        except ZeroDivisionError:   # N == 1
            self.samplevar = 0.0
        try:
            moment2 = self.ssdevs / float(self.N)
            moment3 = sum(math.pow(d, 3) for d in difflist) / float(self.N)
            moment4 = sum(math.pow(d, 4) for d in difflist) / float(self.N)
            self.variance = moment2
            self.stddev = math.sqrt(self.samplevar)
            self.coeffvar = self.stddev / self.mean
            self.skewness = moment3 / (moment2 * math.sqrt(moment2))
            self.kurtosis = (moment4 / math.pow(moment2, 2)) - 3.0
        except ZeroDivisionError:
            # degenerate data (zero mean or zero spread): zero the shape
            # statistics, mirroring the original behaviour
            self.variance = 0.0
            self.stddev = math.sqrt(self.samplevar)
            self.coeffvar = 0.0
            self.skewness = 0.0
            self.kurtosis = 0.0
        self.stderr = self.stddev / math.sqrt(self.N)
        # mode: the most frequent value; ties resolve to the largest value
        # (same tie-break as the original max() over (count, value) pairs)
        self.mode = max((c, v) for v, c in freq.items())[1]
515 516 # class for one sample tests - and it works!!! 517
class OneSampleTests:
    """Tests of one sample against a user-supplied value.

    Each method stores its statistic and p value (self.prob) as
    attributes; prob == -1.0 flags "could not be computed"."""

    def __init__(self, data1, name='', missing=0):
        # descriptives are computed once and shared by all the tests
        self.d1 = FullDescriptives(data1, name, missing)

    def OneSampleTTest(self, usermean):
        """One-sample t test of the sample mean against `usermean`.
        Results: self.t, self.prob (and self.df when N >= 2)."""
        if self.d1.N < 2:
            self.t = 1.0
            self.prob = -1.0
        else:
            self.df = self.d1.N - 1
            svar = (self.df * self.d1.samplevar) / float(self.df)
            self.t = (self.d1.mean - usermean) / math.sqrt(svar * (1.0 / self.d1.N))
            self.prob = betai(0.5 * self.df, 0.5,
                              float(self.df) / (self.df + self.t * self.t))

    def OneSampleSignTest(self, data1, usermean):
        """Sign test of the sample against `usermean`.
        Results: self.nplus, self.nminus, self.ntotal, self.z, self.prob.
        NOTE(review): values *below* usermean are counted as nplus -
        preserved from the original; confirm the intended direction."""
        self.nplus = 0
        self.nminus = 0
        for value in data1:
            if value < usermean:
                self.nplus = self.nplus + 1
            elif value > usermean:
                self.nminus = self.nminus + 1
        self.ntotal = self.nplus + self.nminus
        try:
            # BUG FIX: the original divided only (ntotal/2) by the square
            # root, so z came out as nplus - sqrt(ntotal/2)
            self.z = (self.nplus - self.ntotal / 2.0) / math.sqrt(self.ntotal / 2.0)
        except ZeroDivisionError:    # no score differs from usermean
            self.z = 0
            self.prob = -1.0
        else:
            self.prob = erfcc(abs(self.z) / 1.4142136)

    def ChiSquareVariance(self, usermean):
        """Chi-square test of the sample variance against a hypothesised
        value.  Results: self.df, self.chisquare, self.prob.
        NOTE(review): uses d1.stderr in the numerator; a chi-square
        variance test normally uses the sample variance - confirm."""
        self.df = self.d1.N - 1
        try:
            self.chisquare = (self.d1.stderr / usermean) * self.df
        except ZeroDivisionError:    # usermean == 0
            self.chisquare = 0.0
        self.prob = chisqprob(self.chisquare, self.df)
557 558 559 # class for two sample tests - instantiates descriptives class for both 560 # data sets, then has each test as a method 561
class TwoSampleTests:
    """Two-sample statistical tests (parametric and nonparametric).

    Descriptive statistics for both samples are computed once at
    construction; each test method stores its results (statistic, df
    where relevant, and self.prob) as attributes.  A prob of -1.0 flags
    invalid input, usually unequal sample sizes for a paired test.

    Python 3 fixes vs. the original: the removed `<>` operator and
    reduce()/map() idioms are replaced throughout.
    """

    def __init__(self, data1, data2, name1='', name2='',
                 missing1=0, missing2=0):
        self.d1 = FullDescriptives(data1, name1, missing1)
        self.d2 = FullDescriptives(data2, name2, missing2)

    def TTestUnpaired(self):
        """Independent-samples t test with pooled variance.
        Results: self.df, self.t, self.prob."""
        self.df = (self.d1.N + self.d2.N) - 2
        svar = ((self.d1.N - 1) * self.d1.samplevar +
                (self.d2.N - 1) * self.d2.samplevar) / float(self.df)
        self.t = (self.d1.mean - self.d2.mean) / math.sqrt(
            svar * (1.0 / self.d1.N + 1.0 / self.d2.N))
        self.prob = betai(0.5 * self.df, 0.5,
                          float(self.df) / (self.df + self.t * self.t))

    def TTestPaired(self, data1, data2):
        """Related-samples t test.
        Results: self.df, self.t, self.prob; sets self.p = -1.0 when the
        samples differ in length."""
        if self.d1.N != self.d2.N:
            self.p = -1.0
        else:
            cov = 0.0
            self.df = self.d1.N - 1
            for i in range(self.d1.N):
                cov = cov + ((data1[i] - self.d1.mean) *
                             (data2[i] - self.d2.mean))
            cov = cov / float(self.df)
            sd = math.sqrt((self.d1.samplevar + self.d2.samplevar -
                            2.0 * cov) / float(self.d1.N))
            try:
                self.t = (self.d1.mean - self.d2.mean) / sd
                self.prob = betai(0.5 * self.df, 0.5,
                                  float(self.df) / (self.df + self.t * self.t))
            except ZeroDivisionError:   # identical samples -> sd == 0
                self.t = -1.0
                self.prob = 0.0

    def PearsonsCorrelation(self, data1, data2):
        """Pearson's product-moment correlation.
        Results: self.r, self.df, self.t, self.prob."""
        TINY = 1.0e-60
        if self.d1.N != self.d2.N:
            self.p = -1.0
        else:
            summult = sum(a * b for a, b in zip(data1, data2))
            r_num = self.d1.N * summult - self.d1.sum * self.d2.sum
            r_left = self.d1.N * self.d1.sumsquares - (self.d1.sum ** 2)
            r_right = self.d2.N * self.d2.sumsquares - (self.d2.sum ** 2)
            r_den = math.sqrt(r_left * r_right)
            self.r = r_num / r_den
            self.df = self.d1.N - 2
            self.t = self.r * math.sqrt(self.df / ((1.0 - self.r + TINY) *
                                                   (1.0 + self.r + TINY)))
            self.prob = betai(0.5 * self.df, 0.5,
                              self.df / float(self.df + self.t * self.t))

    def FTest(self, uservar):
        """F ratio of the two sample variances against a user ratio.
        Results: self.f, self.df1, self.df2, self.prob."""
        try:
            self.f = (self.d1.samplevar / self.d2.samplevar) / uservar
        except ZeroDivisionError:
            self.f = 1.0
        self.df1 = self.d1.N - 1
        self.df2 = self.d2.N - 1
        self.prob = fprob(self.df1, self.df2, self.f)

    def TwoSampleSignTest(self, data1, data2):
        """Sign test for two related samples.
        Results: self.ntotal (nplus - nminus), self.z, self.prob."""
        if self.d1.N != self.d2.N:
            self.prob = -1.0
        else:
            nplus = 0
            nminus = 0
            for a, b in zip(data1, data2):
                if a > b:
                    nplus = nplus + 1
                elif a < b:
                    nminus = nminus + 1
            self.ntotal = nplus - nminus
            # floor division preserved from the original Python 2 code;
            # NOTE(review): mean = N/2.0 may have been intended
            mean = self.d1.N // 2
            sd = math.sqrt(mean)
            self.z = (nplus - mean) / sd
            self.prob = erfcc(abs(self.z) / 1.4142136)

    def KendallsTau(self, data1, data2):
        """Kendall's rank-order correlation tau.
        Results: self.tau, self.z, self.prob."""
        n1 = 0
        n2 = 0
        iss = 0
        for j in range(self.d1.N - 1):
            # NOTE(review): k starts at j (not j+1), so each element is
            # also compared with itself - preserved from the original
            for k in range(j, self.d2.N):
                a1 = data1[j] - data1[k]
                a2 = data2[j] - data2[k]
                aa = a1 * a2
                if aa:              # neither list has a tie
                    n1 = n1 + 1
                    n2 = n2 + 1
                    if aa > 0:
                        iss = iss + 1
                    else:
                        iss = iss - 1
                else:               # at least one of the pairs is tied
                    if a1:
                        n1 = n1 + 1
                    else:
                        n2 = n2 + 1
        self.tau = iss / math.sqrt(n1 * n2)
        svar = (4.0 * self.d1.N + 10.0) / (9.0 * self.d1.N * (self.d1.N - 1))
        self.z = self.tau / math.sqrt(svar)
        self.prob = erfcc(abs(self.z) / 1.4142136)

    def KolmogorovSmirnov(self, data1, data2):
        """Kolmogorov-Smirnov two-sample test.  data1/data2 are accepted
        for interface consistency; the sorted lists held by the
        descriptives are what is actually compared.
        Results: self.d (max CDF distance), self.prob."""
        j1 = 0
        j2 = 0
        fn1 = 0.0
        fn2 = 0.0
        self.d = 0.0
        sorted1 = self.d1.sortlist
        sorted2 = self.d2.sortlist
        while j1 < self.d1.N and j2 < self.d2.N:
            v1 = sorted1[j1]
            v2 = sorted2[j2]
            # advance whichever empirical CDF has the smaller value
            if v1 <= v2:
                fn1 = j1 / float(self.d1.N)
                j1 = j1 + 1
            if v2 <= v1:
                fn2 = j2 / float(self.d2.N)
                j2 = j2 + 1
            dt = fn2 - fn1
            if math.fabs(dt) > math.fabs(self.d):
                self.d = dt
        try:
            en = math.sqrt(self.d1.N * self.d2.N / float(self.d1.N + self.d2.N))
            self.prob = ksprob((en + 0.12 + 0.11 / en) * abs(self.d))
        except ZeroDivisionError:   # both samples empty
            self.prob = 1.0

    def SpearmansCorrelation(self, data1, data2):
        """Spearman's rank-order correlation rho.
        Results: self.rho, self.df, self.t, self.prob."""
        TINY = 1e-30
        if self.d1.N != self.d2.N:
            self.prob = -1.0
        else:
            rankx = rankdata(data1)
            ranky = rankdata(data2)
            dsq = sum((rx - ry) ** 2 for rx, ry in zip(rankx, ranky))
            self.rho = 1 - 6 * dsq / float(self.d1.N * (self.d1.N ** 2 - 1))
            self.t = self.rho * math.sqrt(
                (self.d1.N - 2) /
                ((self.rho + 1.0 + TINY) * (1.0 - self.rho + TINY)))
            self.df = self.d1.N - 2
            self.prob = betai(0.5 * self.df, 0.5,
                              self.df / (self.df + self.t * self.t))

    def RankSums(self, data1, data2):
        """Wilcoxon rank-sums test for unpaired samples.
        Results: self.z, self.prob."""
        ranked = rankdata(copy.copy(data1) + copy.copy(data2))
        s = sum(ranked[:self.d1.N])
        expected = self.d1.N * (self.d1.N + self.d2.N + 1) / 2.0
        # BUG FIX: the original's variance term read (d2.N + d2.N + 1);
        # the rank-sum variance uses n1 + n2 + 1
        self.z = (s - expected) / math.sqrt(
            self.d1.N * self.d2.N * (self.d1.N + self.d2.N + 1) / 12.0)
        self.prob = 2 * (1.0 - zprob(abs(self.z)))

    def SignedRanks(self, data1, data2):
        """Wilcoxon matched-pairs signed-ranks test.
        Results: self.wt, self.z, self.prob."""
        if self.d1.N != self.d2.N:
            self.prob = -1.0
        else:
            d = []
            for a, b in zip(data1, data2):
                if a - b != 0:       # zero differences are discarded
                    d.append(a - b)
            count = len(d)
            absranked = rankdata([abs(v) for v in d])
            r_plus = 0.0
            r_minus = 0.0
            for i in range(len(d)):
                if d[i] < 0:
                    r_minus = r_minus + absranked[i]
                else:
                    r_plus = r_plus + absranked[i]
            self.wt = min(r_plus, r_minus)
            mn = count * (count + 1) * 0.25
            se = math.sqrt(count * (count + 1) * (2.0 * count + 1.0) / 24.0)
            self.z = math.fabs(self.wt - mn) / se
            self.prob = 2 * (1.0 - zprob(abs(self.z)))

    def MannWhitneyU(self, data1, data2):
        """Mann-Whitney U test for unpaired samples.
        Results: self.bigu, self.smallu, self.z, self.prob.
        Returns (-1.0, -1.0) when every score is tied."""
        ranked = rankdata(data1 + data2)
        rankx = ranked[0:self.d1.N]
        u1 = (self.d1.N * self.d2.N +
              (self.d1.N * (self.d1.N + 1)) / 2.0 - sum(rankx))
        u2 = self.d1.N * self.d2.N - u1
        self.bigu = max(u1, u2)
        self.smallu = min(u1, u2)
        T = math.sqrt(tiecorrect(ranked))
        if T == 0:
            return -1.0, -1.0
        sd = math.sqrt(T * self.d1.N * self.d2.N *
                       (self.d1.N + self.d2.N + 1) / 12.0)
        self.z = abs((self.bigu - self.d1.N * self.d2.N / 2.0) / sd)
        self.prob = 1.0 - zprob(self.z)

    def LinearRegression(self, x, y):
        """Least-squares linear regression of y on x.
        Results: self.r, self.df, self.t, self.prob, self.slope,
        self.intercept, self.sterrest."""
        TINY = 1.0e-20
        if self.d1.N != self.d2.N:
            self.prob = -1.0
        else:
            summult = sum(a * b for a, b in zip(x, y))
            r_num = float(self.d1.N * summult - self.d1.sum * self.d2.sum)
            r_den = math.sqrt(
                (self.d1.N * self.d1.sumsquares - (self.d1.sum ** 2)) *
                (self.d2.N * self.d2.sumsquares - (self.d2.sum ** 2)))
            try:
                self.r = r_num / r_den
            except ZeroDivisionError:   # no variance in either variable
                self.r = 0.0
            self.df = self.d1.N - 2
            self.t = self.r * math.sqrt(self.df / ((1.0 - self.r + TINY) *
                                                   (1.0 + self.r + TINY)))
            self.prob = betai(0.5 * self.df, 0.5,
                              self.df / (self.df + self.t * self.t))
            self.slope = r_num / float(self.d1.N * self.d1.sumsquares -
                                       (self.d1.sum ** 2))
            self.intercept = self.d2.mean - self.slope * self.d1.mean
            self.sterrest = (math.sqrt(1 - self.r * self.r) *
                             math.sqrt(self.d2.variance))

    def PairedPermutation(self, x, y):
        """Exact paired permutation (randomisation) test over all 2**N
        sign flips of the paired differences (exponential in N).
        Results: self.crit (observed sum of differences), self.nperm,
        self.utail, self.prob.
        BUG FIXES vs. the original: the copy module was called as
        `copy(x)` (a module is not callable), the enumeration loop never
        terminated, and utail/nperm used integer division so prob was
        always 0.0 or 1.0."""
        diffs = [a - b for a, b in zip(x, y)]
        self.crit = sum(diffs)
        self.utail = 0
        self.nperm = 0
        for signs in itertools.product((1, -1), repeat=self.d1.N):
            total = sum(s * d for s, d in zip(signs, diffs))
            self.nperm = self.nperm + 1
            if total >= self.crit:
                self.utail = self.utail + 1
        self.prob = float(self.utail) / self.nperm
812 813 """ 814 def PointBiserialr(self, x, y): 815 TINY = 1e-30 816 if len(x) <> len(y): 817 return -1.0, -1.0 818 data = pstat.abut(x,y) 819 categories = pstat.unique(x) 820 if len(categories) <> 2: 821 return -1.0, -2.0 822 else: # there are 2 categories, continue 823 codemap = pstat.abut(categories,range(2)) 824 recoded = pstat.recode(data,codemap,0) 825 x = pstat.linexand(data,0,categories[0]) 826 y = pstat.linexand(data,0,categories[1]) 827 xmean = mean(pstat.colex(x,1)) 828 ymean = mean(pstat.colex(y,1)) 829 n = len(data) 830 adjust = math.sqrt((len(x)/float(n))*(len(y)/float(n))) 831 rpb = (ymean - xmean)/samplestdev(pstat.colex(data,1))*adjust 832 df = n-2 833 t = rpb*math.sqrt(df/((1.0-rpb+TINY)*(1.0+rpb+TINY))) 834 prob = betai(0.5*df,0.5,df/(df+t*t)) # t already a float 835 return rpb, prob 836 """ 837
class ThreeSampleTests:
    """Tests for three or more samples (one-way ANOVAs and their
    nonparametric equivalents).  Each method stores its components as
    attributes, with self.prob holding the final p value."""

    def __init__(self):
        # -1.0 flags "no test performed yet"
        self.prob = -1.0

    def anovaWithin(self, inlist, ns, sums, means):
        """Within-subjects (repeated measures) one-way ANOVA.

        inlist     -- k equal-length lists of raw scores, one per condition
        ns, sums, means -- per-condition N, sum and mean
        Results: SS*/df*/MS* components, self.F, self.prob."""
        GN = 0
        GS = 0.0
        k = len(inlist)
        for i in range(k):
            GN = GN + ns[i]
            GS = GS + sums[i]
        GM = GS / float(GN)          # grand mean
        self.SSwit = 0.0
        self.SSbet = 0.0
        self.SStot = 0.0
        for i in range(k):
            for j in range(ns[i]):
                diff = inlist[i][j] - means[i]
                self.SSwit = self.SSwit + (diff ** 2)
                diff = inlist[i][j] - GM
                self.SStot = self.SStot + (diff ** 2)
            diff = means[i] - GM
            self.SSbet = self.SSbet + (diff ** 2)
        # scale by subjects-per-condition.  Floor division preserves the
        # original Python 2 integer semantics; NOTE(review): float(GN)/k
        # may have been intended for unbalanced designs
        self.SSbet = self.SSbet * float(GN // k)
        self.SSint = 0.0
        for j in range(ns[0]):
            rowlist = [inlist[i][j] for i in range(k)]
            n, total, mean, SS = minimaldescriptives(rowlist)
            self.SSint = self.SSint + ((mean - GM) ** 2)
        self.SSint = self.SSint * k
        self.SSres = self.SSwit - self.SSint
        self.dfbet = k - 1
        self.dfwit = GN - k
        self.dfres = (ns[0] - 1) * (k - 1)
        self.dftot = self.dfbet + self.dfwit + self.dfres
        self.MSbet = self.SSbet / float(self.dfbet)
        self.MSwit = self.SSwit / float(self.dfwit)
        self.MSres = self.SSres / float(self.dfres)
        self.F = self.MSbet / self.MSres
        self.prob = fprob(self.dfbet, self.dfres, self.F)

    def anovaBetween(self, descs):
        """Between-subjects one-way ANOVA.

        descs -- list of FullDescriptives instances, one per group.
        Results: SS/df/MS components, self.F, self.prob.
        NOTE(review): SSbet is scaled by descs[0].N and the grand mean is
        the unweighted mean of group means - a balanced design is assumed."""
        GN = 0
        GM = 0.0
        self.SSwit = 0.0
        self.SSbet = 0.0
        k = len(descs)
        for d in descs:
            self.SSwit = self.SSwit + d.ssdevs
            GN = GN + d.N
            GM = GM + d.mean
        GM = GM / k
        for d in descs:
            self.SSbet = self.SSbet + ((d.mean - GM) ** 2)
        self.SSbet = self.SSbet * descs[0].N
        self.SStot = self.SSwit + self.SSbet
        self.dfbet = k - 1
        self.dferr = GN - k
        self.dftot = self.dfbet + self.dferr
        self.MSbet = self.SSbet / float(self.dfbet)
        self.MSerr = self.SSwit / float(self.dferr)
        try:
            self.F = self.MSbet / self.MSerr
        except ZeroDivisionError:   # zero within-group variance
            self.F = 1.0
        self.prob = fprob(self.dfbet, self.dferr, self.F)

    def KruskalWallisH(self, args):
        """Kruskal-Wallis H: nonparametric between-subjects ANOVA on ranks.

        args -- sequence of samples (lists of scores).
        Results: self.h, self.df, self.prob."""
        args = list(args)
        # Python 3 fix: map() returns an iterator but n is indexed below
        n = list(map(len, args))
        pooled = []
        for sample in args:
            pooled = pooled + sample
        ranked = rankdata(pooled)
        T = tiecorrect(ranked)
        # hand each sample its own slice of the pooled ranks
        for i in range(len(args)):
            args[i] = ranked[0:n[i]]
            del ranked[0:n[i]]
        rsums = []
        for i in range(len(args)):
            rsums.append(sum(args[i]) ** 2)
            rsums[i] = rsums[i] / float(n[i])
        ssbn = sum(rsums)
        totaln = sum(n)
        self.h = 12.0 / (totaln * (totaln + 1)) * ssbn - 3 * (totaln + 1)
        self.df = len(args) - 1
        if T == 0:
            # every score tied: H is undefined
            self.h = 0.0
            self.prob = 1.0
        else:
            self.h = self.h / float(T)
            self.prob = chisqprob(self.h, self.df)

    def FriedmanChiSquare(self, args):
        """Friedman's chi-square for repeated measures on ranks.

        args -- k equal-length samples, one per condition.
        Results: self.sumranks, self.chisq, self.df, self.prob."""
        k = len(args)
        n = len(args[0])
        # transpose to one row per subject, then rank within each subject
        rows = []
        for j in range(n):
            rows.append([args[i][j] for i in range(k)])
        for i in range(len(rows)):
            rows[i] = rankdata(rows[i])
        # transpose back to one list of ranks per condition
        data2 = []
        for j in range(k):
            data2.append([rows[i][j] for i in range(len(rows))])
        self.sumranks = []
        for i in range(k):
            x = FullDescriptives(data2[i])
            self.sumranks.append(x.sum)
        ssbn = 0
        sums = []
        for i in range(k):
            tmp = sum(data2[i])
            ssbn = ssbn + (tmp ** 2)
            sums.append(tmp / len(data2[i]))
        self.chisq = (12.0 / (k * n * (k + 1))) * ssbn - 3 * n * (k + 1)
        self.df = k - 1
        self.prob = chisqprob(self.chisq, self.df)

    def CochranesQ(self, inlist):
        """Cochran's Q for k related dichotomous (0/1) samples.

        inlist -- k equal-length lists of 0/1 scores.
        Results: self.q, self.df, self.prob."""
        k = len(inlist)
        n = len(inlist[0])
        self.df = k - 1
        # sum of squared per-condition totals
        gtot = 0
        for i in range(k):
            g = 0
            for j in range(n):
                g = g + inlist[i][j]
            gtot = gtot + (g ** 2)
        # per-subject row totals and their squares
        rowtot = 0
        rowsqtot = 0
        for i in range(n):
            rowsum = 0
            for j in range(k):
                rowsum = rowsum + inlist[j][i]
            rowtot = rowtot + rowsum
            rowsqtot = rowsqtot + (rowsum ** 2)
        # BUG FIX: forced float division - with integer 0/1 data the
        # original truncated Q to an integer under Python 2
        self.q = (float((k - 1) * ((k * gtot) - (rowtot ** 2))) /
                  ((k * rowtot) - rowsqtot))
        self.prob = chisqprob(self.q, self.df)
992
class FriedmanComp:
    """This class performs multiple comparisons on a Friedman's
    test.  Passed values are the medians, k (# conditions), n
    (# samples), and the alpha value.  Currently, all comparisons
    are performed regardless.  Assumes a balanced design."""

    def __init__(self, medians, k, n, p):
        crit = inversechi(p, k - 1)
        # critical difference between rank medians.  BUG FIX: the divisor
        # is forced to float - the original's all-integer division made
        # this term 0 for nearly every k and n under Python 2.
        value = crit * math.sqrt((k * (k + 1)) / (6.0 * n * k))
        self.outstr = '<p>Multiple Comparisons for Friedmans test:</p>'
        self.outstr = self.outstr + '<br>Critical Value (>= for sig) = ' + str(crit)
        # j starts at i+1, so every pair is reported exactly once
        for i in range(len(medians)):
            for j in range(i + 1, len(medians)):
                self.outstr = self.outstr + '<br>' + str(i + 1) + ' against ' + str(j + 1)
                diff = abs(medians[i] - medians[j])
                self.outstr = self.outstr + ' = ' + str(diff)
        # NOTE(review): `value` is computed but never compared with the
        # observed differences - the significance check looks unfinished
1009
class KWComp:
    """This class performs multiple comparisons on a Kruskal Wallis
    test.  Passed values are the medians, k (# conditions), n
    (# samples), and the alpha value.  Currently, all comparisons
    are performed regardless.  Assumes a balanced design.
    Further note - not completed by any means! DO NOT USE THIS YET!"""

    def __init__(self, medians, k, n, p):
        crit = inversechi(p, k - 1)
        # BUG FIX: float divisor - the original's all-integer division
        # made this term 0 for nearly every k and n under Python 2.
        value = crit * math.sqrt((k * (k + 1)) / (6.0 * n * k))
        # label fixed: this table is for the Kruskal Wallis test (the
        # original said "Friedmans", a copy-paste slip)
        self.outstr = '<p>Multiple Comparisons for Kruskal Wallis test:</p>'
        self.outstr = self.outstr + '<br>Critical Value (>= for sig) = ' + str(crit)
        # j starts at i+1, so every pair is reported exactly once
        for i in range(len(medians)):
            for j in range(i + 1, len(medians)):
                self.outstr = self.outstr + '<br>' + str(i + 1) + ' against ' + str(j + 1)
                diff = abs(medians[i] - medians[j])
                self.outstr = self.outstr + ' = ' + str(diff)
        # NOTE(review): `value` is never used - the comparison against the
        # critical difference looks unfinished
1027