summaryrefslogtreecommitdiff
path: root/structure/correlations/compute_rho.py
blob: 8fa417496f59906dfc3f6e40ec7c6d90caa1b41a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
####
##
## Get two rankings and a pairing, and compute the corresponding
## Spearman's rho rank correlation coefficient
##
##
##
##

import sys
import random
import math
#import scipy.stats

##
## Compute the constant C of the Spearman's rho correlation coefficient in the draft
## 
def compute_C(rank1, rank2):
    N = len(rank1)
    [mu1, mu2] = [1.0 * sum(x.values()) / len(x.values()) for x in [rank1, rank2]]
    [sum1, sum2] = [1.0 * sum(x.values()) for x in [rank1, rank2]]
    C = N * mu1 * mu2 - mu2 * sum1 - mu1 * sum2

    #print mu1, mu2, sum1, sum2, C

    return C


##
## Compute the constant D of the Spearman's rho correlation coefficient in the draft
## 
def compute_D(rank1, rank2):
    [mu1, mu2] = [1.0 * sum(x.values()) / len(x.values()) for x in [rank1, rank2]]
    s1 = sum([math.pow((x-mu1), 2) for x in rank1.values()])
    s2 = sum([math.pow((x-mu2), 2) for x in rank2.values()])
    
    D = math.sqrt(s1 * s2)
    return D

def compute_rho(rank1, rank2, pairing, C, D):

    rho = 0
    for s,t in pairing:
        rho += rank1[s] * rank2[t]
    rho = (rho + C) / D
    return rho

if len(sys.argv) < 3:
    print "Usage: %s <rank1> <rank2> [<pairing>]" % sys.argv[0]
    sys.exit(1)

rank1 = {}
rank2 = {}

lines = open(sys.argv[1], "r").readlines()

i = 0
for l in lines:
    if l[0] == "#" or l.strip(" \n").split(" ") == []: ## Strip comments and empty lines
        continue
    r = [float(x) if "." in x or "e" in x else int(x) for x in l.strip(" \n").split(" ")][0]
    rank1[i] = r
    i += 1


lines = open(sys.argv[2], "r").readlines()

i = 0
for l in lines:
    if l[0] == "#" or l.strip(" \n").split(" ") == []: ## Strip comments and empty lines
        continue
    r = [float(x) if "." in x or "e" in x else int(x) for x in l.strip(" \n").split(" ")][0]
    rank2[i] = r
    i += 1


N1 = len(rank1)
N2 = len(rank2)

if (N1 != N2):
    print "The two files must have the same number of nodes!!!!!"
    sys.exit(2)



C = compute_C(rank1, rank2)
D = compute_D(rank1, rank2)


## We start from a random pairing, and compute the corresponding value of rho

pairing = []

if len(sys.argv) > 3:
    lines = open(sys.argv[3], "r").readlines()
    
    

    for l in lines:
        if l[0] == "#" or l.strip(" \n").split(" ") == []:
            continue
        p1, p2 = [int(x) for x in l.strip(" \n").split(" ")][:2]
        pairing.append((p1, p2))

else:
    for i in range (N1):
        pairing.append((i,i))


if len(pairing) != N1:
    print "ERROR !!! The pairing should be of the same length of the ranking files!!!"
    sys.exit(1)

rho = compute_rho(rank1, rank2, pairing, C, D)

print rho