Diffstat (limited to 'structure/correlations')
-rw-r--r--  structure/correlations/Makefile               15
-rw-r--r--  structure/correlations/compute_pearson.py     33
-rw-r--r--  structure/correlations/compute_rho.py        118
-rw-r--r--  structure/correlations/compute_tau.py        133
-rw-r--r--  structure/correlations/dump_k_q.c             51
-rw-r--r--  structure/correlations/fit_knn.c              60
-rw-r--r--  structure/correlations/fit_utils.c           383
-rw-r--r--  structure/correlations/fit_utils.h            25
-rw-r--r--  structure/correlations/knn_q_from_degrees.py  49
-rw-r--r--  structure/correlations/knn_q_from_layers.py   98
-rw-r--r--  structure/correlations/rank_nodes.py          74
-rw-r--r--  structure/correlations/rank_nodes_thresh.py   87
-rw-r--r--  structure/correlations/rank_occurrence.py     45
-rw-r--r--  structure/correlations/rank_utils.c          217
-rw-r--r--  structure/correlations/rank_utils.h           44
15 files changed, 1432 insertions(+), 0 deletions(-)
diff --git a/structure/correlations/Makefile b/structure/correlations/Makefile
new file mode 100644
index 0000000..58301cb
--- /dev/null
+++ b/structure/correlations/Makefile
@@ -0,0 +1,15 @@
+CFLAGS=-O3
+CC=gcc
+GSL_FLAGS=-lgsl -lgslcblas -lm
+MFLAG=-lm
+
+all: dump_k_q fit_knn
+
+fit_knn: fit_knn.c fit_utils.c
+ $(CC) $(CFLAGS) -o fit_knn fit_knn.c fit_utils.c $(GSL_FLAGS)
+
+dump_k_q: dump_k_q.c rank_utils.c
+ $(CC) $(CFLAGS) -o dump_k_q dump_k_q.c rank_utils.c $(MFLAG)
+
+clean:
+	rm -f dump_k_q fit_knn
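
Note: fit_knn links against the GNU Scientific Library for the linear fit,
so the GSL headers and libraries must be installed; dump_k_q only needs
libm. If GSL is installed outside the default search path, one option (a
suggestion, not something this Makefile does by itself) is to replace the
GSL_FLAGS line with the output of "pkg-config --libs gsl".
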
diff --git a/structure/correlations/compute_pearson.py b/structure/correlations/compute_pearson.py
new file mode 100644
index 0000000..e2c9055
--- /dev/null
+++ b/structure/correlations/compute_pearson.py
@@ -0,0 +1,33 @@
+####
+##
+## Compute the Pearson correlation coefficient between the values of
+## node properties included in the two files provided as input.
+##
+
+import sys
+import numpy
+import scipy.stats
+import math
+
+if len(sys.argv) < 3:
+    print "Usage: %s <file1> <file2>" % sys.argv[0]
+ sys.exit(1)
+
+
+x1 = []
+
+with open(sys.argv[1], "r") as lines:
+ for l in lines:
+ elem = [float(x) if "." in x or "e" in x else int(x) for x in l.strip(" \n").split()][0]
+ x1.append(elem)
+
+x2 = []
+
+with open(sys.argv[2], "r") as lines:
+ for l in lines:
+ elem = [float(x) if "." in x or "e" in x else int(x) for x in l.strip(" \n").split()][0]
+ x2.append(elem)
+
+
+r = numpy.corrcoef(x1, x2)
+print r[0][1]
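
For reference, the printed value is the off-diagonal entry of the 2x2
matrix returned by numpy.corrcoef, i.e. the sample Pearson coefficient

    r = \sum_i (x_i - \mu_x)(y_i - \mu_y) /
        \sqrt{ \sum_i (x_i - \mu_x)^2 \sum_i (y_i - \mu_y)^2 }

where \mu_x and \mu_y are the means of the two input columns.
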
diff --git a/structure/correlations/compute_rho.py b/structure/correlations/compute_rho.py
new file mode 100644
index 0000000..8fa4174
--- /dev/null
+++ b/structure/correlations/compute_rho.py
@@ -0,0 +1,118 @@
+####
+##
+## Get two rankings and a pairing, and compute the corresponding
+## Spearman's rho rank correlation coefficient
+##
+##
+##
+##
+
+import sys
+import random
+import math
+#import scipy.stats
+
+##
+## Compute the additive constant C in the expanded formula for Spearman's rho
+##
+def compute_C(rank1, rank2):
+ N = len(rank1)
+ [mu1, mu2] = [1.0 * sum(x.values()) / len(x.values()) for x in [rank1, rank2]]
+ [sum1, sum2] = [1.0 * sum(x.values()) for x in [rank1, rank2]]
+ C = N * mu1 * mu2 - mu2 * sum1 - mu1 * sum2
+
+ #print mu1, mu2, sum1, sum2, C
+
+ return C
+
+
+##
+## Compute the normalization constant D in the expanded formula for Spearman's rho
+##
+def compute_D(rank1, rank2):
+ [mu1, mu2] = [1.0 * sum(x.values()) / len(x.values()) for x in [rank1, rank2]]
+ s1 = sum([math.pow((x-mu1), 2) for x in rank1.values()])
+ s2 = sum([math.pow((x-mu2), 2) for x in rank2.values()])
+
+ D = math.sqrt(s1 * s2)
+ return D
+
+def compute_rho(rank1, rank2, pairing, C, D):
+
+ rho = 0
+ for s,t in pairing:
+ rho += rank1[s] * rank2[t]
+ rho = (rho + C) / D
+ return rho
+
+if len(sys.argv) < 3:
+ print "Usage: %s <rank1> <rank2> [<pairing>]" % sys.argv[0]
+ sys.exit(1)
+
+rank1 = {}
+rank2 = {}
+
+lines = open(sys.argv[1], "r").readlines()
+
+i = 0
+for l in lines:
+    if l[0] == "#" or l.strip(" \n") == "": ## Skip comments and empty lines
+ continue
+ r = [float(x) if "." in x or "e" in x else int(x) for x in l.strip(" \n").split(" ")][0]
+ rank1[i] = r
+ i += 1
+
+
+lines = open(sys.argv[2], "r").readlines()
+
+i = 0
+for l in lines:
+    if l[0] == "#" or l.strip(" \n") == "": ## Skip comments and empty lines
+ continue
+ r = [float(x) if "." in x or "e" in x else int(x) for x in l.strip(" \n").split(" ")][0]
+ rank2[i] = r
+ i += 1
+
+
+N1 = len(rank1)
+N2 = len(rank2)
+
+if (N1 != N2):
+    print "Error: the two files must have the same number of nodes!"
+ sys.exit(2)
+
+
+
+C = compute_C(rank1, rank2)
+D = compute_D(rank1, rank2)
+
+
+## We start from a random pairing, and compute the corresponding value of rho
+
+pairing = []
+
+if len(sys.argv) > 3:
+ lines = open(sys.argv[3], "r").readlines()
+
+
+
+ for l in lines:
+        if l[0] == "#" or l.strip(" \n") == "":
+ continue
+ p1, p2 = [int(x) for x in l.strip(" \n").split(" ")][:2]
+ pairing.append((p1, p2))
+
+else:
+ for i in range (N1):
+ pairing.append((i,i))
+
+
+if len(pairing) != N1:
+    print "ERROR: the pairing must have the same length as the ranking files!"
+ sys.exit(1)
+
+rho = compute_rho(rank1, rank2, pairing, C, D)
+
+print rho
+
+
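
The constants computed above come from expanding the rank-product form of
Spearman's rho. With \mu_1, \mu_2 the mean ranks, S_1, S_2 the rank sums,
and t(s) the pairing, the script evaluates

    \rho = \sum_s (R_1(s) - \mu_1)(R_2(t(s)) - \mu_2) / D
         = ( \sum_s R_1(s) R_2(t(s)) + C ) / D

    C = N \mu_1 \mu_2 - \mu_2 S_1 - \mu_1 S_2
    D = \sqrt{ \sum_s (R_1(s) - \mu_1)^2 \sum_s (R_2(s) - \mu_2)^2 }

Since C and D do not depend on the pairing, they are computed once and can
be reused for any pairing of the same two rankings.
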
diff --git a/structure/correlations/compute_tau.py b/structure/correlations/compute_tau.py
new file mode 100644
index 0000000..3d1f804
--- /dev/null
+++ b/structure/correlations/compute_tau.py
@@ -0,0 +1,133 @@
+####
+##
+## Take as input two files, whose n^th line contains the ranking of
+## element n, and compute Kendall's \tau_b rank correlation
+## coefficient
+##
+##
+
+import sys
+from numpy import *
+
+
+def kendalltau(x,y):
+    initial_sort_with_lexsort = True  # if True, ~30% slower (though faster under a profiler!), but with a better worst case, O(n log n), than quicksort's O(n^2)
+ n = len(x)
+ temp = range(n) # support structure used by mergesort
+ # this closure recursively sorts sections of perm[] by comparing
+ # elements of y[perm[]] using temp[] as support
+ # returns the number of swaps required by an equivalent bubble sort
+ def mergesort(offs, length):
+ exchcnt = 0
+ if length == 1:
+ return 0
+ if length == 2:
+ if y[perm[offs]] <= y[perm[offs+1]]:
+ return 0
+ t = perm[offs]
+ perm[offs] = perm[offs+1]
+ perm[offs+1] = t
+ return 1
+ length0 = length / 2
+ length1 = length - length0
+ middle = offs + length0
+ exchcnt += mergesort(offs, length0)
+ exchcnt += mergesort(middle, length1)
+ if y[perm[middle - 1]] < y[perm[middle]]:
+ return exchcnt
+ # merging
+ i = j = k = 0
+ while j < length0 or k < length1:
+ if k >= length1 or (j < length0 and y[perm[offs + j]] <= y[perm[middle + k]]):
+ temp[i] = perm[offs + j]
+ d = i - j
+ j += 1
+ else:
+ temp[i] = perm[middle + k]
+ d = (offs + i) - (middle + k)
+ k += 1
+ if d > 0:
+                exchcnt += d
+ i += 1
+ perm[offs:offs+length] = temp[0:length]
+ return exchcnt
+
+ # initial sort on values of x and, if tied, on values of y
+ if initial_sort_with_lexsort:
+ # sort implemented as mergesort, worst case: O(n log(n))
+ perm = lexsort((y, x))
+ else:
+ # sort implemented as quicksort, 30% faster but with worst case: O(n^2)
+ perm = range(n)
+ perm.sort(lambda a,b: cmp(x[a],x[b]) or cmp(y[a],y[b]))
+
+ # compute joint ties
+ first = 0
+ t = 0
+ for i in xrange(1,n):
+ if x[perm[first]] != x[perm[i]] or y[perm[first]] != y[perm[i]]:
+ t += ((i - first) * (i - first - 1)) / 2
+ first = i
+ t += ((n - first) * (n - first - 1)) / 2
+
+ # compute ties in x
+ first = 0
+ u = 0
+ for i in xrange(1,n):
+ if x[perm[first]] != x[perm[i]]:
+ u += ((i - first) * (i - first - 1)) / 2
+ first = i
+ u += ((n - first) * (n - first - 1)) / 2
+
+ # count exchanges
+ exchanges = mergesort(0, n)
+ # compute ties in y after mergesort with counting
+ first = 0
+ v = 0
+ for i in xrange(1,n):
+ if y[perm[first]] != y[perm[i]]:
+ v += ((i - first) * (i - first - 1)) / 2
+ first = i
+ v += ((n - first) * (n - first - 1)) / 2
+
+ tot = (n * (n - 1)) / 2
+ if tot == u and tot == v:
+ return 1 # Special case for all ties in both ranks
+
+    tau = ((tot - (v + u - t)) - 2.0 * exchanges) / sqrt(float(tot - u) * float(tot - v))
+
+ # what follows reproduces ending of Gary Strangman's original stats.kendalltau() in SciPy
+ svar = (4.0*n+10.0) / (9.0*n*(n-1))
+ z = tau / sqrt(svar)
+ ##prob = erfc(abs(z)/1.4142136)
+ ##return tau, prob
+ return tau
+
+def main():
+
+ if len(sys.argv) < 3:
+ print "Usage: %s <file1> <file2>" % sys.argv[0]
+ sys.exit(1)
+
+ x1 = []
+    x2 = []
+
+ lines = open(sys.argv[1]).readlines()
+
+ for l in lines:
+ elem = [float(x) if "e" in x or "." in x else int(x) for x in l.strip(" \n").split()][0]
+ x1.append(elem)
+
+ lines = open(sys.argv[2]).readlines()
+
+ for l in lines:
+ elem = [float(x) if "e" in x or "." in x else int(x) for x in l.strip(" \n").split()][0]
+ x2.append(elem)
+
+
+ tau = kendalltau(x1,x2)
+ print tau
+
+
+if __name__ == "__main__":
+ main()
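
The mergesort above implements Knight's O(n log n) method: the number of
exchanges a bubble sort would need to order y after sorting by x counts the
discordant pairs. In the code's variables, with tot = n(n-1)/2 the total
number of pairs, u the ties in x, v the ties in y and t the joint ties,

    \tau_b = ( tot - (u + v - t) - 2*exchanges ) /
             \sqrt{ (tot - u)(tot - v) }

which is the usual tie-corrected \tau_b estimator.
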
diff --git a/structure/correlations/dump_k_q.c b/structure/correlations/dump_k_q.c
new file mode 100644
index 0000000..2b6ba79
--- /dev/null
+++ b/structure/correlations/dump_k_q.c
@@ -0,0 +1,51 @@
+/**
+ *
+ * Get the degree sequences of two layers and a pairing, and dump
+ * the k-q pairs on standard output
+ *
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "rank_utils.h"
+
+void dump_k_q(double *R1, double *R2, int N, int *pairing){
+
+ int i;
+
+ for (i=0; i<N; i++){
+ printf("%g %g\n", R1[i], R2[pairing[i]]);
+ }
+}
+
+
+int main(int argc, char *argv[]){
+
+ int N1, N2;
+ double *R1 = NULL;
+ double *R2 = NULL;
+
+ int *pairing = NULL;
+
+
+ if (argc < 4){
+ printf("Usage: %s <degs1> <degs2> <pairing>\n", argv[0]);
+ exit(1);
+ }
+
+ load_ranking(argv[1], &N1, &R1);
+ load_ranking(argv[2], &N2, &R2);
+
+ if (N1 != N2){
+    printf("Error: the two files must have the same number of lines! Exiting...\n");
+ exit(1);
+ }
+
+ pairing = malloc(N1 * sizeof(int));
+
+ load_pairing(&pairing, N1, argv[3]);
+
+ dump_k_q(R1, R2, N1, pairing);
+
+}
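
Both degree files are plain one-value-per-line sequences, read by
load_ranking(), and the pairing file uses the two-column "rank1 rank2"
format (with optional # comment lines) read by load_pairing(); both
functions are defined in rank_utils.c below.
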
diff --git a/structure/correlations/fit_knn.c b/structure/correlations/fit_knn.c
new file mode 100644
index 0000000..64fc4c3
--- /dev/null
+++ b/structure/correlations/fit_knn.c
@@ -0,0 +1,60 @@
+/**
+ *
+ * Take a file which contains pairs in the form:
+ *
+ * k q
+ *
+ * compute qnn(k) and fit this function with a power-law
+ *
+ * a*x^{b}
+ *
+ * The fit is actually performed as a linear fit on the
+ * exponential-binned log-log distribution
+ *
+ *
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include <gsl/gsl_fit.h>
+#include "fit_utils.h"
+
+
+
+int main(int argc, char *argv[]){
+
+ double *k, *q, *x, *y;
+ int N, num;
+ double alpha;
+
+ double c0, c1, cov00, cov01, cov11, sqsum;
+
+ k=q=x=y=NULL;
+
+ if (argc < 3){
+    printf("Usage: %s <filein> <alpha>\n", argv[0]);
+ exit(1);
+ }
+
+ alpha=atof(argv[2]);
+
+ fprintf(stderr,"Alpha: %g\n", alpha);
+
+ load_sequence_col(argv[1], &k, &N, 0);
+ load_sequence_col(argv[1], &q, &N, 1);
+ //exit(1);
+ exp_bin_avg(k, q, N, alpha, &x, &y, &num);
+ //normalize_distr(x, y, num);
+ //dump_distr(x, y, num);
+ compact_distr(x, y, &num);
+ // dump_distr(x,y,num);
+ map_vec(x, num, log);
+ map_vec(y, num, log);
+ //dump_distr(x,y,num);
+
+ gsl_fit_linear(x, 1, y, 1, num, &c0, &c1, &cov00, &cov01, &cov11, &sqsum);
+ fprintf(stderr, "a: %g b: %g\n", exp(c0), c1);
+}
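
The fit relies on the identity log(a*k^b) = log(a) + b*log(k): after the
exponential binning of qnn(k) and the log-mapping of both axes,
gsl_fit_linear returns the intercept c0 = log(a) and the slope c1 = b,
which is why the program prints exp(c0) as "a" and c1 as "b".
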
diff --git a/structure/correlations/fit_utils.c b/structure/correlations/fit_utils.c
new file mode 100644
index 0000000..e9e3954
--- /dev/null
+++ b/structure/correlations/fit_utils.c
@@ -0,0 +1,383 @@
+/**
+ *
+ * Fit a given sequence with a power-law function
+ *
+ * a*x^{b}
+ *
+ * The fit is actually performed as a linear fit on the
+ * exponential-binned log-log distribution
+ *
+ *
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include <gsl/gsl_fit.h>
+
+/**
+ *
+ * Load a sequence from a file, which contains one element on each line
+ *
+ */
+
+void load_sequence(char *fname, double **v, int *N){
+
+ int size;
+ char buff[256];
+
+ FILE *f;
+
+ f = fopen(fname, "r");
+ if (!f){
+ fprintf(stderr, "Error opening file %s!!!! Exiting...\n", fname);
+ exit(1);
+ }
+
+ *N =0;
+ size = 10;
+ if (*v == NULL)
+ *v = malloc(size * sizeof(double));
+ else{
+ *v = realloc(*v, size * sizeof(double));
+ }
+
+ while (fgets(buff, 255, f)){
+ (*v)[*N] = atof(buff);
+ *N += 1;
+ if (*N == size){
+ size += 10;
+ *v = realloc(*v, size * sizeof(double));
+ }
+ }
+ *v = realloc(*v, (*N) * sizeof(double));
+ fclose(f);
+}
+
+
+/**
+ *
+ * Load a sequence, getting the "col"-th column of the input file
+ *
+ */
+
+void load_sequence_col(char *fname, double **v, int *N, int col){
+
+ int size, n;
+ char buff[256];
+ char *ptr;
+
+ FILE *f;
+
+ f = fopen(fname, "r");
+ if (!f){
+ fprintf(stderr, "Error opening file %s!!!! Exiting...\n", fname);
+ exit(1);
+ }
+
+ *N =0;
+ size = 10;
+ if (*v == NULL)
+ *v = malloc(size * sizeof(double));
+ else{
+ *v = realloc(*v, size * sizeof(double));
+ }
+
+ while (fgets(buff, 255, f)){
+ ptr = strtok(buff, " ");
+ if (col > 0){
+ n = 0;
+ while (n<col){
+ ptr = strtok(NULL, " ");
+ n += 1;
+ }
+ }
+ (*v)[*N] = atof(ptr);
+ *N += 1;
+
+ if (*N == size){
+ size += 10;
+ *v = realloc(*v, size * sizeof(double));
+ }
+ }
+ *v = realloc(*v, (*N) * sizeof(double));
+ fclose(f);
+}
+
+
+/**
+ *
+ * Bin a distribution exponentially, with a given growth
+ * factor alpha. The value of y[i] is the number of elements of v
+ * whose value lies between x[i-1] and x[i]...
+ *
+ */
+
+void exp_bin_cnt(double *v, int N, double alpha, double **x, double **y, int *num){
+
+ double min_v, max_v, val, last_val;
+ int i, j, size, num_x;
+ double last_size, new_size;
+
+ min_v = max_v = v[0];
+
+ for (i=1; i<N; i ++){
+ if (v[i] > max_v)
+ max_v = v[i];
+ else if (v[i] > 0 && v[i] < min_v)
+ min_v = v[i];
+ }
+
+ size = 10;
+ if (*x == NULL){
+ *x = malloc(size * sizeof(double));
+ }
+ else{
+ *x = realloc(*x, size * sizeof(double));
+ }
+
+ val = min_v;
+ last_size = min_v;
+ (*x)[0] = min_v;
+ num_x = 1;
+
+ while(val < max_v){
+ new_size = last_size * alpha;
+ val = last_size + new_size;
+ last_size = new_size;
+ last_val = val;
+ (*x)[num_x] = val;
+ num_x += 1;
+ if (num_x == size){
+ size += 10;
+ *x = realloc(*x, size * sizeof(double));
+ }
+ }
+
+ if (*y == NULL){
+ *y = malloc(num_x * sizeof(double));
+ }
+ else{
+ *y = realloc(*y, num_x * sizeof(double));
+ }
+ for (i=0; i < num_x; i++){
+ (*y)[i] = 0;
+ }
+
+
+
+ for(i=0; i <N; i ++){
+ j = 0;
+ while(v[i] > (*x)[j]){
+ j ++;
+ }
+ (*y)[j] += 1;
+ }
+ *num = num_x;
+}
+
+/**
+ *
+ * Bin a distribution exponentially, with a given growth
+ * factor alpha. The value of y[i] is the average of the values in
+ * the vector "w" whose corresponding v lies between x[i-1] and x[i]...
+ *
+ */
+
+
+void exp_bin_avg(double *v, double *w, int N, double alpha, double **x, double **y, int *num){
+
+ double min_v, max_v, val, last_val;
+ int i, j, size, num_x;
+ double last_size, new_size;
+ int *cnt;
+
+
+ min_v = max_v = v[0];
+
+ for (i=1; i<N; i ++){
+ if (v[i] > max_v)
+ max_v = v[i];
+ else if (v[i] > 0 && v[i] < min_v)
+ min_v = v[i];
+ }
+
+ size = 10;
+ if (*x == NULL){
+ *x = malloc(size * sizeof(double));
+ }
+ else{
+ *x = realloc(*x, size * sizeof(double));
+ }
+
+ val = min_v;
+ last_size = min_v;
+ (*x)[0] = min_v;
+ num_x = 1;
+
+ while(val < max_v){
+ new_size = last_size * alpha;
+ val = last_size + new_size;
+ last_size = new_size;
+ last_val = val;
+ (*x)[num_x] = val;
+ num_x += 1;
+ if (num_x == size){
+ size += 10;
+ *x = realloc(*x, size * sizeof(double));
+ }
+ }
+
+
+ cnt = malloc(num_x * sizeof(int));
+
+ if (*y == NULL){
+ *y = malloc(num_x * sizeof(double));
+ }
+ else{
+ *y = realloc(*y, num_x * sizeof(double));
+ }
+ for (i=0; i < num_x; i++){
+ (*y)[i] = 0;
+ cnt[i] = 0;
+ }
+
+ for(i=0; i <N; i ++){
+ j = 0;
+ while(j < num_x && v[i] > (*x)[j]){
+ j ++;
+ }
+ if(j == num_x){
+      printf("Error: trying to assign a non-existing bin! -- fit_utils.c:exp_bin_avg\n");
+ exit(37);
+ }
+ (*y)[j] += w[i];
+ cnt[j] += 1;
+ }
+ *num = num_x;
+
+ for(i=0; i< num_x; i++){
+ if (cnt[i] > 0){
+ (*y)[i] = (*y)[i] / cnt[i];
+ }
+ }
+ free(cnt);
+}
+
+
+/**
+ *
+ * Print a distribution on stdout
+ *
+ */
+
+void dump_distr(double *x, double *y, int N){
+ int i;
+
+ for(i=0; i<N; i ++){
+ printf("%g %g\n", x[i], y[i]);
+ }
+
+}
+
+
+/**
+ * Compact a distribution, leaving only the pairs (x_i, y_i) for which
+ * y_i > 0
+ *
+ */
+
+void compact_distr(double *x, double *y, int *num){
+
+ int i, j;
+
+ i = j = 0;
+ while(j < *num){
+ while(j < *num && y[j] == 0){
+ j ++;
+ }
+ if (j==*num){
+ break;
+ }
+ x[i] = x[j];
+ y[i] = y[j];
+ i ++;
+ j ++;
+ }
+ *num = i;
+}
+
+
+/**
+ *
+ * Apply the function "f" to all the elements of a vector, in-place
+ *
+ */
+
+void map_vec(double *v, int N, double (*f)(double)){
+ int i;
+
+ for (i=0; i<N; i ++){
+ v[i] = f(v[i]);
+ }
+}
+
+
+/**
+ *
+ * Normalize a distribution, dividing each y[i] by the width of the
+ * corresponding bin (i.e., x[i] - x[i-1])
+ *
+ */
+void normalize_distr(double *x, double *y, int num){
+
+ int i;
+
+ for(i=1; i<num; i ++){
+ y[i] /= (x[i] - x[i-1]);
+ }
+}
+
+/**
+ *
+ * Take two vectors (k and q) and a pairing, and compute the best
+ * power-law fit "a*k^{mu}" of qnn(k)
+ *
+ */
+
+void fit_current_trend(double *k, double *q, int N, int *pairing, double *mu, double *a,
+ double *corr){
+
+ static int *num;
+ static double *q_pair, *x, *y;
+
+ int i;
+ int fit_num;
+ double c0, c1, cov00, cov01, cov11, sqsum;
+
+ if (q_pair == NULL){
+ q_pair = malloc(N * sizeof(double));
+ }
+
+ for(i=0; i<N; i++){
+ q_pair[i] = q[pairing[i]];
+ }
+
+ exp_bin_avg(k, q_pair, N, 1.3, &x, &y, &fit_num);
+ //normalize_distr(x,y, fit_num);
+ compact_distr(x, y, &fit_num);
+
+ map_vec(x, fit_num, log);
+ map_vec(y, fit_num, log);
+ gsl_fit_linear(x, 1, y, 1, fit_num, &c0, &c1, &cov00, &cov01, &cov11, &sqsum);
+
+ //fprintf(stderr,"cov00: %g cov01: %g cov11: %g\n", cov00, cov01, cov11);
+ //fprintf(stderr, "corr: %g ", cov01 / sqrt(cov00 * cov11));
+ *a = c0;
+ *mu = c1;
+ *corr = cov01/sqrt(cov00 * cov11);
+}
+
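
For illustration, a minimal Python sketch (not part of this commit) of the
scheme implemented by exp_bin_avg() and compact_distr(): bin edges start at
the smallest positive value and grow geometrically by the factor alpha, and
each bin keeps the average of the w-values whose v falls inside it. It
assumes alpha > 1 and positive entries in v.

    def exp_bin_avg(v, w, alpha):
        lo = min(x for x in v if x > 0)
        hi = max(v)
        edges = [lo]
        last = lo                     # width of the previous bin
        while edges[-1] < hi:
            new = last * alpha        # each bin is alpha times wider
            edges.append(last + new)  # edge i (i >= 1) is lo*alpha**(i-1)*(1+alpha)
            last = new
        sums = [0.0] * len(edges)
        cnts = [0] * len(edges)
        for vi, wi in zip(v, w):
            j = 0                     # first bin edge with vi <= edges[j]
            while vi > edges[j]:
                j += 1
            sums[j] += wi
            cnts[j] += 1
        # analogue of compact_distr(): keep only the non-empty bins
        return [(e, s / c) for e, s, c in zip(edges, sums, cnts) if c > 0]
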
diff --git a/structure/correlations/fit_utils.h b/structure/correlations/fit_utils.h
new file mode 100644
index 0000000..183f47c
--- /dev/null
+++ b/structure/correlations/fit_utils.h
@@ -0,0 +1,25 @@
+#ifndef __FIT_UTILS_H__
+#define __FIT_UTILS_H__
+
+
+void load_sequence(char *fname, double **v, int *N);
+
+void load_sequence_col(char *fname, double **v, int *N, int col);
+
+void exp_bin_cnt(double *v, int N, double alpha, double **x, double **y, int *num);
+
+void exp_bin_avg(double *v, double *w, int N, double alpha, double **x, double **y, int *num);
+
+void dump_distr(double *x, double *y, int N);
+
+void compact_distr(double *x, double *y, int *num);
+
+void map_vec(double *v, int N, double (*f)(double));
+
+void normalize_distr(double *x, double *y, int num);
+
+void fit_current_trend(double *k, double *q, int N, int *pairing,
+ double *mu, double *a, double *corr);
+
+
+#endif /* __FIT_UTILS_H__ */
diff --git a/structure/correlations/knn_q_from_degrees.py b/structure/correlations/knn_q_from_degrees.py
new file mode 100644
index 0000000..5e10bfa
--- /dev/null
+++ b/structure/correlations/knn_q_from_degrees.py
@@ -0,0 +1,49 @@
+####
+##
+##
+## Take a file with the degree sequence of nodes which are active on
+## both layers, and compute \overline{k}(q) and \overline{q}(k),
+## i.e. the two inter-layer correlation functions, where we call "k"
+## the degree on the first column, and "q" the degree on the second
+## column
+##
+##
+
+import sys
+
+if len(sys.argv) < 2:
+ print "Usage: %s <filein>" % sys.argv[0]
+ sys.exit(1)
+
+qnn_k = {}
+knn_q = {}
+
+with open(sys.argv[1]) as f:
+ for l in f:
+ k, q = [int(x) for x in l.strip(" \n").split(" ")]
+ if qnn_k.has_key(k):
+ qnn_k[k].append(q)
+ else:
+ qnn_k[k] = [q]
+ if knn_q.has_key(q):
+ knn_q[q].append(k)
+ else:
+ knn_q[q] = [k]
+
+
+
+
+keys= qnn_k.keys()
+keys.sort()
+for k in keys:
+ print k, 1.0 * sum(qnn_k[k])/len(qnn_k[k])
+
+
+
+
+keys= knn_q.keys()
+keys.sort()
+for q in keys:
+ sys.stderr.write("%d %f\n" % (q, 1.0 * sum(knn_q[q])/len(knn_q[q])))
+
+
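
In formulas, calling k_i and q_i the two degrees of node i, the script
prints the two inter-layer correlation functions

    \overline{q}(k) = (1 / N_k) \sum_{i : k_i = k} q_i    (on stdout)
    \overline{k}(q) = (1 / N_q) \sum_{i : q_i = q} k_i    (on stderr)

where N_k (N_q) is the number of nodes with degree k (q) in the first
(second) column.
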
diff --git a/structure/correlations/knn_q_from_layers.py b/structure/correlations/knn_q_from_layers.py
new file mode 100644
index 0000000..c108c18
--- /dev/null
+++ b/structure/correlations/knn_q_from_layers.py
@@ -0,0 +1,98 @@
+####
+##
+##
+## Compute the degree-degree correlations of a multiplex graph, namely:
+##
+## <k_1>(k_2) and <k_2>(k_1)
+##
+## Takes as input the two lists of edges corresponding to each layer
+##
+
+import sys
+import numpy as np
+import networkx as net
+
+
+def knn(G, n):
+ neigh = G.neighbors(n)
+ l = G.degree(neigh).values()
+ return 1.0 * sum(l) / len(l)
+
+
+if len(sys.argv) < 3:
+ print "Usage: %s <layer1> <layer2>" % sys.argv[0]
+ sys.exit(1)
+
+
+G1 = net.read_edgelist(sys.argv[1])
+G2 = net.read_edgelist(sys.argv[2])
+
+k1_k1 = {} ## Intralayer knn (k1)
+k2_k2 = {} ## Intralayer knn (k2)
+k1_k2 = {} ## Interlayer average degree at layer 1 of a node having degree k_2 in layer 2
+k2_k1 = {} ## Interlayer average degree at layer 2 of a node having degree k_1 in layer 1
+
+
+for n in G1.nodes():
+ k1 = G1.degree(n)
+
+
+ ##print k1,k2
+
+ knn1 = knn(G1, n)
+ if n in G2.nodes():
+ k2 = G2.degree(n)
+ knn2 = knn(G2, n)
+ else:
+ k2 = 0
+ knn2 = 0
+
+ if k1_k1.has_key(k1):
+ k1_k1[k1].append(knn1)
+ else:
+ k1_k1[k1] = [knn1]
+
+ if k2_k2.has_key(k2):
+ k2_k2[k2].append(knn2)
+ else:
+ k2_k2[k2] = [knn2]
+
+
+ if k1_k2.has_key(k2):
+ k1_k2[k2].append(k1)
+ else:
+ k1_k2[k2] = [k1]
+
+ if k2_k1.has_key(k1):
+ k2_k1[k1].append(k2)
+ else:
+ k2_k1[k1] = [k2]
+
+
+k1_keys = k1_k1.keys()
+k1_keys.sort()
+k2_keys = k2_k2.keys()
+k2_keys.sort()
+
+
+f = open("%s_%s_k1" % (sys.argv[1], sys.argv[2]), "w+")
+
+for n in k1_keys:
+ avg_knn = np.mean(k1_k1[n])
+ std_knn = np.std(k1_k1[n])
+ avg_k2 = np.mean(k2_k1[n])
+ std_k2 = np.std(k2_k1[n])
+ f.write("%d %f %f %f %f\n" % (n, avg_knn, std_knn, avg_k2, std_k2))
+
+f.close()
+
+f = open("%s_%s_k2" % (sys.argv[1], sys.argv[2]), "w+")
+
+for n in k2_keys:
+ avg_knn = np.mean(k2_k2[n])
+ std_knn = np.std(k2_k2[n])
+ avg_k1 = np.mean(k1_k2[n])
+ std_k1 = np.std(k1_k2[n])
+ f.write("%d %f %f %f %f\n" % (n, avg_knn, std_knn, avg_k1, std_k1))
+f.close()
+
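
The helper knn(G, n) computes the average degree of the neighbours of n,

    k_nn(i) = (1 / k_i) \sum_{j \in N(i)} k_j

so each output file reports, per degree class, the mean and standard
deviation of this intra-layer quantity, together with the mean and standard
deviation of the same nodes' degree in the other layer, i.e. the
inter-layer functions <k_2>(k_1) and <k_1>(k_2).
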
diff --git a/structure/correlations/rank_nodes.py b/structure/correlations/rank_nodes.py
new file mode 100644
index 0000000..6985e88
--- /dev/null
+++ b/structure/correlations/rank_nodes.py
@@ -0,0 +1,74 @@
+####
+##
+##
+## Get a file as input, whose n-th line corresponds to the value of a
+## certain property of node n, and rank nodes according to their
+## properties, taking into account ranking ties properly.
+##
+## The output is a file whose n-th line is the "ranking" of the n-th
+## node according to the given property. (Notice that ranks can be
+## fractional, due to the tie-handling algorithm.)
+##
+##
+
+
+import sys
+import math
+
+
+if len(sys.argv) < 2:
+ print "Usage: %s <filein>" % sys.argv[0]
+ sys.exit(1)
+
+
+
+lines = open(sys.argv[1], "r").readlines()
+
+ranking = []
+
+n=0
+for l in lines:
+    if l[0] == "#" or l.strip(" \n") == "":
+ continue
+ v = [float(x) if "." in x or "e" in x else int(x) for x in l.strip(" \n").split(" ")][0]
+ ranking.append((v,n))
+ n +=1
+
+ranking.sort(reverse=True)
+#print ranking
+new_ranking = {}
+
+v0, n0 = ranking[0]
+
+old_value = v0
+tot_rankings = 1
+
+stack = [n0]
+l=1.0
+for v,n in ranking[1:]:
+ l += 1
+ ##print stack, tot_rankings
+ if v != old_value: ### There is a new rank
+ # We first compute the rank for all the nodes in the stack and then we set it
+ new_rank_value = 1.0 * tot_rankings / len(stack)
+ ##print new_rank_value
+ for j in stack:
+ new_ranking[j] = new_rank_value
+ old_value = v
+ tot_rankings = l
+ stack = [n]
+ else: # One more value with the same rank, keep it for the future
+ stack.append(n)
+ tot_rankings += l
+
+new_rank_value = 1.0 * tot_rankings / len(stack)
+#print new_rank_value
+for j in stack:
+ new_ranking[j] = new_rank_value
+
+#print new_ranking
+
+keys = new_ranking.keys()
+keys.sort()
+for k in keys:
+ print new_ranking[k]
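
The tie handling above assigns to each group of equal values the average of
the positions they occupy ("fractional" or mid-rank ranking). As a
hypothetical cross-check, not used by this script, the same ranks can be
obtained from scipy.stats.rankdata on the negated values (assuming a scipy
recent enough to support the method argument):

    from scipy.stats import rankdata

    vals = [9.0, 7.0, 7.0, 3.0]
    print(rankdata([-v for v in vals], method="average"))
    # -> [ 1.   2.5  2.5  4. ], the same fractional ranks produced above
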
diff --git a/structure/correlations/rank_nodes_thresh.py b/structure/correlations/rank_nodes_thresh.py
new file mode 100644
index 0000000..44b565a
--- /dev/null
+++ b/structure/correlations/rank_nodes_thresh.py
@@ -0,0 +1,87 @@
+####
+##
+##
+## Get a file as input, whose n-th line corresponds to the value of a
+## certain property of node n, and rank nodes according to their
+## properties, taking into account ranking ties properly.
+##
+## The output is a file whose n-th line is the "ranking" of the n-th
+## node according to the given property. (Notice that ranks can be
+## fractional, due to the tie-handling algorithm.)
+##
+## The rank of a node is set to "0" (ZERO) if the corresponding
+## property is smaller than a value given as second parameter
+##
+
+
+import sys
+import math
+
+
+if len(sys.argv) < 3:
+ print "Usage: %s <filein> <thresh>" % sys.argv[0]
+ sys.exit(1)
+
+
+thresh = float(sys.argv[2])
+
+lines = open(sys.argv[1], "r").readlines()
+
+ranking = []
+
+n=0
+for l in lines:
+    if l[0] == "#" or l.strip(" \n") == "":
+ continue
+ v = [float(x) if "." in x or "e" in x else int(x) for x in l.strip(" \n").split(" ")][0]
+ if v >= thresh:
+ ranking.append((v,n))
+ else:
+ ranking.append((0,n))
+ n +=1
+
+ranking.sort(reverse=True)
+#print ranking
+new_ranking = {}
+
+v0, n0 = ranking[0]
+
+
+old_value = v0
+tot_rankings = 1
+
+stack = [n0]
+l=1.0
+for v,n in ranking[1:]:
+ l += 1
+ ##print stack, tot_rankings
+ if v != old_value: ### There is a new rank
+ # We first compute the rank for all the nodes in the stack and then we set it
+ if old_value == 0:
+ new_rank_value = 0
+ else:
+ new_rank_value = 1.0 * tot_rankings / len(stack)
+ ##print new_rank_value
+ for j in stack:
+ new_ranking[j] = new_rank_value
+ old_value = v
+ tot_rankings = l
+ stack = [n]
+ else: # One more value with the same rank, keep it for the future
+ stack.append(n)
+ tot_rankings += l
+
+if old_value == 0:
+ new_rank_value = 0
+else:
+ new_rank_value = 1.0 * tot_rankings / len(stack)
+#print new_rank_value
+for j in stack:
+ new_ranking[j] = new_rank_value
+
+#print new_ranking
+
+keys = new_ranking.keys()
+keys.sort()
+for k in keys:
+ print new_ranking[k]
diff --git a/structure/correlations/rank_occurrence.py b/structure/correlations/rank_occurrence.py
new file mode 100644
index 0000000..6339d5d
--- /dev/null
+++ b/structure/correlations/rank_occurrence.py
@@ -0,0 +1,45 @@
+####
+##
+## Get two rankings and compute the size of the k-intersection,
+## i.e. the number of elements which are present in the first k
+## positions of both rankings, as a function of k
+##
+##
+
+
+import sys
+
+
+if len(sys.argv) < 4:
+ print "Usage: %s <file1> <file2> <increment>" % sys.argv[0]
+ sys.exit(1)
+
+incr = int(sys.argv[3])
+
+rank1 = []
+rank2 = []
+
+lines = open(sys.argv[1], "r").readlines()
+
+for l in lines:
+    n = l.strip(" \n").split(" ")[0]
+ rank1.append(n)
+
+lines = open(sys.argv[2], "r").readlines()
+
+for l in lines:
+    n = l.strip(" \n").split(" ")[0]
+ rank2.append(n)
+
+
+N = len(rank1)
+
+i = incr
+
+while i < N+incr:
+ l = len(set(rank1[:i]) & set(rank2[:i]))
+ print i, l
+ i += incr
+
+
+
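
In formulas, if R_1[1..k] and R_2[1..k] are the first k entries of the two
rankings, the script prints the overlap

    O(k) = | R_1[1..k] \cap R_2[1..k] |

for k = incr, 2*incr, ..., up to the length of the rankings.
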
diff --git a/structure/correlations/rank_utils.c b/structure/correlations/rank_utils.c
new file mode 100644
index 0000000..4b8ef41
--- /dev/null
+++ b/structure/correlations/rank_utils.c
@@ -0,0 +1,217 @@
+/**
+ *
+ * Some utility functions to be used with tune_rho, compute_rho and
+ * the like
+ *
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <time.h>
+
+#include "rank_utils.h"
+
+void load_ranking(char *fname, int *N, double **R){
+
+ char buff[256];
+ int size = 10;
+ FILE *f;
+
+ if (*R == NULL){
+ *R = malloc(size * sizeof (double));
+ }
+ f = fopen(fname, "r");
+ if (!f){
+    printf("Unable to open file: %s! Exiting...\n", fname);
+ exit(1);
+ }
+
+ *N = 0;
+
+ while (fgets(buff, 255, f)){
+ if (* N == size){
+ size += 10;
+ *R = realloc(*R, size * sizeof(double));
+ }
+ (*R)[*N] = atof(buff);
+ *N += 1;
+ }
+}
+
+double avg_array(double *v, int N){
+
+ double sum = 0.0;
+ int i;
+
+ for (i = 0; i < N; i ++){
+ sum += v[i];
+ }
+ return sum/N;
+}
+
+double compute_C(double *R1, double *R2, int N){
+ double mu1, mu2, sum1, sum2;
+
+ mu1 = avg_array(R1, N);
+ mu2 = avg_array(R2, N);
+
+ sum1 = mu1 * N;
+ sum2 = mu2 * N;
+
+ return N * mu1 * mu2 - mu2 * sum1 - mu1 * sum2;
+}
+
+
+double compute_D(double *R1, double *R2, int N){
+
+ double mu1, mu2, s1, s2;
+ int i;
+
+ mu1 = avg_array(R1, N);
+  mu2 = avg_array(R2, N);
+
+ s1 = s2 = 0.0;
+
+ for (i=0 ; i < N; i ++){
+ s1 += pow((R1[i] - mu1), 2);
+ s2 += pow((R2[i] - mu2), 2);
+ }
+
+ return sqrt(s1 * s2);
+}
+
+
+double compute_rho(double *R1, double *R2, int N, int *pairing){
+
+ double rho = 0;
+ int i;
+
+ for (i=0; i < N; i ++){
+ rho += R1[i] * R2[ pairing[i] ];
+ }
+
+ rho = (rho + compute_C(R1, R2, N))/ compute_D(R1, R2, N);
+ return rho;
+}
+
+void dump_ranking(double *R, int N){
+ int i;
+
+ for (i=0; i < N; i ++){
+ printf("%d: %f\n", i, R[i] );
+ }
+}
+
+
+void init_pairing_natural(int *pairing, int N){
+ int i;
+
+ for (i = 0; i< N; i ++){
+ pairing[i] = i;
+ }
+}
+
+void init_pairing_inverse(int *pairing, int N){
+ int i;
+
+ for (i = 0; i< N; i ++){
+ pairing[i] = N-i-1;
+ }
+}
+
+void select_pairing(int *pairing, int N, int argc, char *argv[], int pos){
+
+ if (argc < pos + 1 || !strncasecmp("rnd", argv[pos], 3)){
+ init_pairing_random(pairing, N);
+ }
+ else if (!strncasecmp("nat", argv[pos], 3)){
+ init_pairing_natural(pairing, N);
+ }
+ else if (!strncasecmp("inv", argv[pos], 3)){
+ init_pairing_inverse(pairing, N);
+ }
+ else{
+    printf("Pairing strategy \"%s\" unknown! Exiting...\n", argv[pos]);
+ exit(1);
+ }
+
+}
+
+
+void shuffle_sequence(int *s, int N){
+
+ int i, j, tmp;
+
+ for (i=N-1; i>=0; i--){
+ j = rand() % (i+1);
+ tmp = s[j];
+ s[j] = s[i];
+ s[i] = tmp;
+ }
+}
+
+
+
+void init_pairing_random(int *pairing, int N){
+
+ init_pairing_natural(pairing, N);
+ shuffle_sequence(pairing, N);
+
+}
+
+/* Loads a pairing from a file, in the format:
+ *
+ * rank1 rank2
+ * ...........
+ */
+void load_pairing(int **pairing, int N, char *fname){
+
+ FILE *f;
+ int i, j, num;
+ char buff[256];
+ char *ptr;
+
+ f = fopen(fname, "r");
+ if (!f){
+    printf("Error opening file \"%s\"! Exiting...\n", fname);
+ exit(2);
+ }
+
+ if (*pairing == NULL){
+ *pairing = malloc(N * sizeof(int));
+ init_pairing_natural(*pairing, N);
+ }
+
+ num = 0;
+ while(num < N){
+    if (!fgets(buff, 255, f)) break;
+ if (buff[0] == '#')
+ continue;
+ ptr = strtok(buff, " "); /* read the first node */
+ i = atoi(ptr);
+ ptr = strtok(NULL, " "); /* read the second node */
+ j = atoi(ptr);
+ (*pairing)[i] = j;
+ num += 1;
+ }
+}
+
+
+void dump_pairing(int *pairing, int N){
+ int i;
+
+ for(i=0; i< N; i ++){
+ printf("%d %d\n", i, pairing[i]);
+ }
+}
+
+void copy_pairing(int *p1, int *p2, int N){
+ int i;
+
+ for (i=0 ; i < N; i ++){
+ p2[i] = p1[i];
+ }
+}
diff --git a/structure/correlations/rank_utils.h b/structure/correlations/rank_utils.h
new file mode 100644
index 0000000..521582a
--- /dev/null
+++ b/structure/correlations/rank_utils.h
@@ -0,0 +1,44 @@
+#ifndef __RANK_UTILS_H__
+#define __RANK_UTILS_H__
+
+/* Load a ranking */
+void load_ranking(char *fname, int *N, double **R);
+
+/* Compute the average of an array */
+double avg_array(double *v, int N);
+
+/* Compute the term "C" in the rho correlation coefficient */
+double compute_C(double *R1, double *R2, int N);
+
+/* Compute the term "D" in the rho correlation coefficient */
+double compute_D(double *R1, double *R2, int N);
+
+/* Compute the Spearman's rank correlation coefficient \rho */
+double compute_rho(double *R1, double *R2, int N, int *pairing);
+
+void dump_ranking(double *R, int N);
+
+void init_pairing_natural(int *pairing, int N);
+
+void init_pairing_inverse(int *pairing, int N);
+
+void init_pairing_random(int *pairing, int N);
+
+void select_pairing(int *pairing, int N, int argc, char *argv[], int pos);
+
+void shuffle_sequence(int *s, int N);
+
+void dump_pairing(int *pairing, int N);
+
+void copy_pairing(int *pairing1, int *pairing2, int N);
+
+
+#endif /* __RANK_UTILS_H__ */
+
+
+
+
+
+
+
+