path: root/structure/correlations/compute_tau.py
Diffstat (limited to 'structure/correlations/compute_tau.py')
-rw-r--r--  structure/correlations/compute_tau.py  133
1 files changed, 133 insertions, 0 deletions
diff --git a/structure/correlations/compute_tau.py b/structure/correlations/compute_tau.py
new file mode 100644
index 0000000..3d1f804
--- /dev/null
+++ b/structure/correlations/compute_tau.py
@@ -0,0 +1,133 @@
+####
+##
+## Take as input two files whose n-th line contains the ranking of
+## element n, and compute Kendall's \tau_b rank correlation
+## coefficient between the two rankings.
+##
+##
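+## For example, if file1 contains the four lines "1", "2", "3", "4"
+## (one rank per line) and file2 contains "2", "1", "3", "4", then
+##
+##     python compute_tau.py file1 file2
+##
+## prints the tau_b of the two rankings (2/3 in this case).
+##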
+
+import sys
+from numpy import lexsort, sqrt
+
+
+def kendalltau(x, y):
+    # if True, the initial sort uses lexsort: ~30% slower (but faster under
+    # a profiler!) with a better worst case (O(n log(n))) than (quick)sort (O(n^2))
+    initial_sort_with_lexsort = True
+    n = len(x)
+    temp = list(range(n))  # support structure used by mergesort
+    # this closure recursively sorts sections of perm[] by comparing
+    # elements of y[perm[]], using temp[] as support;
+    # it returns the number of swaps required by an equivalent bubble sort
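+    # (e.g. sorting y[perm] = [3, 1, 2] takes 2 swaps, one per pairwise
+    # inversion)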
+    def mergesort(offs, length):
+        exchcnt = 0
+        if length == 1:
+            return 0
+        if length == 2:
+            if y[perm[offs]] <= y[perm[offs+1]]:
+                return 0
+            t = perm[offs]
+            perm[offs] = perm[offs+1]
+            perm[offs+1] = t
+            return 1
+        length0 = length // 2
+        length1 = length - length0
+        middle = offs + length0
+        exchcnt += mergesort(offs, length0)
+        exchcnt += mergesort(middle, length1)
+        if y[perm[middle - 1]] < y[perm[middle]]:
+            return exchcnt
+        # merging
+        i = j = k = 0
+        while j < length0 or k < length1:
+            if k >= length1 or (j < length0 and y[perm[offs + j]] <= y[perm[middle + k]]):
+                temp[i] = perm[offs + j]
+                d = i - j
+                j += 1
+            else:
+                temp[i] = perm[middle + k]
+                d = (offs + i) - (middle + k)
+                k += 1
+            if d > 0:
+                exchcnt += d
+            i += 1
+        perm[offs:offs+length] = temp[0:length]
+        return exchcnt
+
+    # initial sort on values of x and, if tied, on values of y
+    if initial_sort_with_lexsort:
+        # sort implemented as mergesort, worst case: O(n log(n))
+        perm = lexsort((y, x))
+    else:
+        # sort implemented as quicksort, 30% faster but with worst case: O(n^2)
+        perm = list(range(n))
+        perm.sort(key=lambda a: (x[a], y[a]))
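+    # e.g. x = [2, 1, 1], y = [0, 5, 3]  ->  perm = [2, 1, 0]
+    # (indices ordered by x first, then by y within ties in x)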
+
+    # compute joint ties (pairs tied in both x and y)
+    first = 0
+    t = 0
+    for i in range(1, n):
+        if x[perm[first]] != x[perm[i]] or y[perm[first]] != y[perm[i]]:
+            t += ((i - first) * (i - first - 1)) // 2
+            first = i
+    t += ((n - first) * (n - first - 1)) // 2
+
+    # compute ties in x
+    first = 0
+    u = 0
+    for i in range(1, n):
+        if x[perm[first]] != x[perm[i]]:
+            u += ((i - first) * (i - first - 1)) // 2
+            first = i
+    u += ((n - first) * (n - first - 1)) // 2
+
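+    # with perm sorted on x (ties broken by y), the number of bubble-sort
+    # swaps needed to sort y[perm] equals the number of discordant pairs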
+    # count exchanges
+    exchanges = mergesort(0, n)
+    # compute ties in y (perm is now additionally sorted on y by the
+    # counting mergesort)
+    first = 0
+    v = 0
+    for i in range(1, n):
+        if y[perm[first]] != y[perm[i]]:
+            v += ((i - first) * (i - first - 1)) // 2
+            first = i
+    v += ((n - first) * (n - first - 1)) // 2
+
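+    # tau_b = (n_c - n_d) / sqrt((tot - u) * (tot - v)), where tot = n(n-1)/2
+    # is the total number of pairs; pairs tied in neither ranking number
+    # tot - u - v + t, so n_c - n_d = (tot - (v + u - t)) - 2*exchanges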
+    tot = (n * (n - 1)) // 2
+    if tot == u and tot == v:
+        return 1  # special case: every value tied in both rankings
+
+    tau = ((tot - (v + u - t)) - 2.0 * exchanges) / sqrt(float(tot - u) * float(tot - v))
+
+    # what follows reproduces the ending of Gary Strangman's original
+    # stats.kendalltau() in SciPy
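+    # (z is the normal-approximation test statistic for tau under the
+    # null hypothesis of independence, with var(tau) ~= 2(2n+5)/(9n(n-1));
+    # the commented-out lines would also return the two-sided p-value)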
+    svar = (4.0 * n + 10.0) / (9.0 * n * (n - 1))
+    z = tau / sqrt(svar)
+    ##prob = erfc(abs(z)/1.4142136)
+    ##return tau, prob
+    return tau
+
+def main():
+
+    if len(sys.argv) < 3:
+        print("Usage: %s <file1> <file2>" % sys.argv[0])
+        sys.exit(1)
+
+    x1 = []
+    x2 = []
+
+    lines = open(sys.argv[1]).readlines()
+
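+    # the n-th line of each input file holds the rank of element n; take
+    # the first field of the line, parsed as a float if it contains '.'
+    # or 'e', otherwise as an int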
+    for l in lines:
+        elem = [float(x) if "e" in x or "." in x else int(x) for x in l.strip(" \n").split()][0]
+        x1.append(elem)
+
+    lines = open(sys.argv[2]).readlines()
+
+    for l in lines:
+        elem = [float(x) if "e" in x or "." in x else int(x) for x in l.strip(" \n").split()][0]
+        x2.append(elem)
+
+
+    tau = kendalltau(x1, x2)
+    print(tau)
+
+
+if __name__ == "__main__":
+    main()