1 files changed, 479 insertions, 0 deletions
diff --git a/src/fitmle/fitmle.c b/src/fitmle/fitmle.c
new file mode 100644
index 0000000..53cf448
--- /dev/null
+++ b/src/fitmle/fitmle.c
@@ -0,0 +1,479 @@
+/**
+ *  This program is free software: you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation, either version 3 of the
+ *  License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see
+ *  <http://www.gnu.org/licenses/>.
+ *
+ *  (c) Vincenzo Nicosia 2009-2017 -- <v.nicosia@qmul.ac.uk>
+ * 
+ *  This file is part of NetBunch, a package for complex network
+ *  analysis and modelling. For more information please visit:
+ *
+ *             http://www.complex-networks.net/
+ *
+ *  If you use this software, please add a reference to 
+ *
+ *               V. Latora, V. Nicosia, G. Russo             
+ *   "Complex Networks: Principles, Methods and Applications"
+ *              Cambridge University Press (2017) 
+ *                ISBN: 9781107103184
+ *
+ ***********************************************************************
+ *
+ *  This program fits a set of values provided as input with a
+ *  power-law function using the Maximum-Likelihood Estimator (MLE).
+ *
+ *  References: 
+ * 
+ * [1] A. Clauset, C. R. Shalizi, and M. E. J. Newman. "Power-law
+ *     distributions in empirical data". SIAM Rev. 51, (2007),
+ *     661-703.  
+ *
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <time.h>
+
+#include "utils.h"
+
+void usage(char *argv[]){
+
+  printf("********************************************************************\n"
+         "**                                                                **\n"
+         "**                        -*-   fitmle   -*-                      **\n"
+         "**                                                                **\n"
+         "**    Fit a set of values with a power-law function using the     **\n"
+         "**    Maximum-Likelihood Estimator (MLE). The program implements  **\n"
+         "**    the MLE for both continuous and discrete values, and        **\n"
+         "**    selects the appropriate one automatically.                  **\n"
+         "**                                                                **\n"
+         "**    The input file 'data_in' contains one value on each row.    **\n"
+         "**    If 'data_in' is '-' (dash), read the values from the        **\n"
+         "**    standard input (STDIN).                                     **\n"
+         "**                                                                **\n"
+         "**    The program prints on output a single row, in the format:   **\n"
+         "**                                                                **\n"
+         "**       gamma x_min ks                                           **\n"
+         "**                                                                **\n"
+         "**    where 'gamma' is the exponent of the best fit, 'x_min' is   **\n"
+         "**    the smallest of the input values after which the power-law  **\n"
+         "**    hypothesis hold, and 'ks' is the value of the KS statistics **\n"
+         "**    for the best fit.                                           **\n"
+         "**                                                                **\n"
+         "**    The second (optional) parameter 'tol' sets the tolerance    **\n"
+         "**    on the acceptable statistical error of the fit (set to      **\n"
+         "**    0.1 if no value is provided).                               **\n"
+         "**                                                                **\n"
+         "**    If the third parameter is 'TEST', the program uses the      **\n"
+         "**    the Kolmogorov-Smirnov test on a series of bootstrapped     **\n"
+         "**    power-law sequences to estimate the p-value of the fit, and **\n"
+         "**    the output is in the format:                                **\n"
+         "**                                                                **\n"
+         "**       gamma x_min ks p_value                                   **\n"
+         "**                                                                **\n"
+         "**    Higher values of 'p_value' correspond to better fits.       **\n"
+         "**                                                                **\n"
+         "**    If 'num_test' is provided, use that number of bootstrapped  **\n"
+         "**    sequences to compute the p-value. Otherwise, use 100        **\n"
+         "**    sequences.                                                  **\n"
+         "**                                                                **\n"
+         "********************************************************************\n"
+         " This is Free Software - You can use and distribute it under \n"
+         " the terms of the GNU General Public License, version 3 or later\n\n"
+         " (c) Vincenzo Nicosia 2010-2017 (v.nicosia@qmul.ac.uk)\n\n"
+         "********************************************************************\n\n"
+         );
+  printf("Usage: %s <data_in> [<tol> [ TEST [<num_test>]]]\n", argv[0]);
+}
+
+
+void load_data(char *fname, double **data, unsigned int *N, char sort){
+  
+  FILE *fin;
+  char error_str[256];
+  char buff[256];
+  char *ptr;
+  double x;
+  unsigned int size;
+  
+  size = 10;
+
+  *data = malloc(size * sizeof(double));
+
+  *N=0;
+  if (!strcmp(fname, "-")){
+    fin = stdin;
+  }
+  else{
+    fin = fopen(fname, "r");
+  }
+  if (!fin){
+    sprintf(error_str, "Error opening file %s", fname);
+    perror(error_str);
+    exit(3);
+  }
+  
+  while(fgets(buff, 256, fin)){
+    ptr = strtok(buff, " ");
+    if (ptr[0] == '#')
+      continue;
+    x = atof(ptr);
+    if (*N == size){
+      size += 10;
+      *data = realloc(*data, size*sizeof(double));
+    }
+    (*data)[*N] = x;
+    *N += 1;
+  }
+  *data = realloc(*data, (*N)*sizeof(double));
+  if (sort){
+    qsort(*data, *N, sizeof(double), compare_double);
+  }
+  fclose(fin);
+  return;
+}
+
+/** 
+ *  find the first position at which the element x appears in the
+ *  "data" array using binary search (the array is sorted in
+ *  increasing order of x)
+ */
+int find_pos_x(double x,double *data, unsigned int N){
+  int H, L, M;
+
+  L = 0;
+  H = N-1;
+  
+  while(L<=H){
+    M = (H + L)/2;
+    if (x == data[M]){
+      while(M >=0 && x == data[M]) M--;
+      if (M==-1){
+        return 0;
+      }
+      return M+1; /* we return the index of the first appearance of element x*/
+    }
+    else if (x < data[M]){
+      H = M-1;
+    }
+    else if (x > data[M]){
+      L = M+1;
+    }
+  }
+  return -1; /* the value is not present */
+}
+
+double fit_alpha(double *data, unsigned int N, double xmin, unsigned int idx){
+  
+  double alpha = 0;
+  int i;
+  
+  for(i = idx; i < N; i++){
+    alpha += log(data[i]*1.0/(xmin-0.5));
+  }
+  alpha = 1 + (N - idx) * 1.0 / alpha;
+  return alpha;
+}
+
+
+double fit_alpha_continuous(double *data, unsigned int N, double xmin, unsigned int idx){
+  
+  double alpha = 0;
+  int i;
+  
+  for(i = idx; i < N; i++){
+    alpha += log(data[i]*1.0/(xmin));
+  }
+  alpha = 1 + (N - idx) * 1.0 / alpha;
+  return alpha;
+}
+
+
+/*
+ *
+ * Kolmogorov-Smirnov test
+ *
+ */
+
+double KS (double *data, unsigned int N, double alpha, int idx){
+  
+  double c_data, c_theo, val;
+  double xmin, x;
+  double dist, max_dist = -1;
+  int i;
+
+  c_data = c_theo = 0.0;
+  xmin = data[idx];
+
+  for (i=idx; i<N;){
+    x = data[i];
+    while(i<N && data[i] == x){
+      val = xmin * 1.0 / data[i];
+      c_theo = 1.0 -  pow(val, alpha-1.0);
+      dist = fabs(c_theo - c_data);
+
+      if (dist > max_dist){
+        max_dist = dist;
+      }
+      i++;
+    }
+    c_data = 1.0 * (i- idx)/(N-idx);
+  }
+  return max_dist;
+}
+
+void dump_data_double(double *v, unsigned int N){
+  int i;
+
+  if (N < 1){
+    return;
+  }
+  printf("%g",v[0]);
+  for(i=1; i<N; i++){
+    printf(" %g",v[i]);
+  }
+  printf("\n");
+}
+
+
+/* This is the same as best_fit, but for continuous-valued time-series */
+void best_fit_continuous(double *data, unsigned int N, double *alpha, double *xmin, double tol){
+  
+  double min_x, max_x, x;
+  double cur_alpha, best_alpha, best_x, ks_test, best_ks;
+  int idx,i;
+  
+
+ 
+
+  qsort(data, N, sizeof(double), compare_double);
+
+  min_x = data[0];
+  best_x = data[0];
+  max_x = data[N-1];
+  best_ks = 10000000;
+  cur_alpha = 0.0;
+  best_alpha = 0.0;
+  idx = 0;
+  for(x=min_x, i=0; x<=max_x && (cur_alpha)/sqrt(1.0 * (N-idx))< tol; ){
+    idx = find_pos_x(x, data, N);
+    if (idx <0){ 
+      i ++;
+      continue;
+    }
+    cur_alpha = fit_alpha_continuous(data, N, x, idx);
+    ks_test = KS(data, N, cur_alpha, idx);
+    if(ks_test < best_ks){
+      best_ks = ks_test;
+      best_alpha=cur_alpha;
+      best_x = data[idx];
+    }
+    while(data[i] == x && i <N){
+      i ++;
+    }
+    x = data[i];
+  }
+
+  *alpha = best_alpha;
+  *xmin = best_x;
+  return;
+}
+
+/* Fit a discrete power-law */
+
+void best_fit(double *data, unsigned int N, double *alpha, double *xmin, double tol){
+  
+  double min_x, max_x, x;
+  double cur_alpha, best_alpha, best_x, ks_test, best_ks;
+  int idx, i;
+  
+
+  
+  qsort(data, N, sizeof(double), compare_double);
+  min_x = data[0];
+  best_x = data[0];
+  max_x = data[N-1];
+  best_ks = 10000000;
+  cur_alpha = 0.0;
+  best_alpha = 0.0;
+  idx = 0;
+  for(x=min_x, i = 0; x<=max_x && (cur_alpha)/sqrt(1.0 * (N-idx))< tol;){
+    idx = find_pos_x(x, data, N);
+
+    if (idx<0) {
+      i ++;
+      continue;
+    }
+    cur_alpha = fit_alpha(data, N, x, idx);
+
+    ks_test = KS(data, N, cur_alpha, idx);
+    if(ks_test < best_ks){
+      best_ks = ks_test;
+      best_alpha=cur_alpha;
+      best_x = data[idx];
+    }
+    while(data[i] == x && i<N){
+      i ++;
+    }
+    x = data[i];
+  }
+
+  *alpha = best_alpha;
+  *xmin = best_x;
+  return;
+}
+
+void sample_powerlaw(double *data, unsigned int N, double alpha, double xmin,double *v){
+  int i, last_idx, r;
+  double val;
+  
+  i=0;
+  last_idx = 0;
+  while(data[i] < xmin){
+    last_idx ++;
+    i++;
+  }
+  
+  i=0;
+  while(i <last_idx){
+    r = rand() % (last_idx + 1);
+    v[i] = data[r];
+    i++;
+  }
+  
+  for(; i<N; i++){
+    val = 1.0 * rand()/RAND_MAX;
+    v[i] = floor(xmin * pow(1.0-val, -1.0/(alpha-1)));
+
+  }
+  
+}
+
+
+double test_powerlaw(double *data, unsigned int N, double alpha, double xmin, 
+                     double ks, int num_test, 
+                     void (*f)(double *, unsigned int , double *, double *, double)){
+  
+  int num = 0, i, idx;
+  double *v, cur_alpha, cur_xmin, cur_ks;
+
+
+  v = malloc(N * sizeof(double));
+
+  for(i=0; i<num_test; i++){
+    sample_powerlaw(data, N, alpha, xmin, v);
+    qsort(v, N, sizeof(double), compare_double);
+    f(v, N, &cur_alpha, &cur_xmin, 0.1); 
+    idx = find_pos_x(cur_xmin, v, N);
+    cur_ks = KS(v, N, cur_alpha, idx);
+    if (cur_ks > ks){
+      num += 1;
+    }
+  }
+  free(v);
+  return 1.0 * num / num_test;
+}
+
+/* We assume that the data set has already been sorted in ascending
+   order */
+int is_continuous(double *data, int N){
+  int i;
+
+  for (i=0; i<N; i++){
+    if (data[i] - (double)(int)data[i] != 0.0)
+      return 1;
+  }
+  /* return 1 only if we need a real-valued fit....*/
+  return 0;
+}
+
+/* we assume that the data set has already been sorted in ascending
+   order */
+double renormalise(double *data, unsigned int N){
+  
+  int i;
+  double scaling = 1.0;
+  
+  if (data[0] < 1.0 ){
+    scaling = 1.0/data[0];
+    for (i=0; i<N; i++){
+      data[i] *= scaling;
+    }
+  }
+  return scaling;
+}
+
+
+int main(int argc, char *argv[]){
+  
+  double *data=NULL;
+  unsigned int N, i;
+  double alpha, xmin, ks, tol, p_value, scaling_fact;
+  char test = 0;
+  int num_test;
+  
+  void  (*fit_func)(double *, unsigned int , double *, double *, double);
+  
+
+  if (argc < 2){
+    usage(argv);
+    exit(1);
+  }
+  
+  if (argc > 2)
+    tol = atof(argv[2]);
+  else
+    tol = 0.1;
+
+  if (argc > 3 && !my_strcasecmp(argv[3], "TEST")){
+    test = 1;
+  }
+  
+  if(argc > 4){
+    num_test = atoi(argv[4]);
+  }
+  else
+    num_test = 100;
+  
+  srand(time(NULL));
+  
+
+  
+  load_data(argv[1], &data, &N, 1);
+
+  if(is_continuous(data, N)){
+    fprintf(stderr, "Using continuous fit\n");
+    fit_func = best_fit_continuous;
+  }
+  else{
+    fprintf(stderr, "Using discrete fit\n");
+    fit_func = best_fit;
+  }
+
+  scaling_fact = 1.0;
+
+  fit_func(data, N, &alpha, &xmin, tol);
+  i = find_pos_x(xmin, data, N);
+  ks = KS(data, N, alpha, i);
+  if (test){
+    p_value = test_powerlaw(data, N, alpha, xmin, ks, num_test, fit_func);
+    printf("%g %g %g %g\n", alpha,  xmin/scaling_fact, ks, p_value);
+  }
+  else{
+    printf("%g %g %g\n", alpha, xmin / scaling_fact, ks);
+  }
+  free(data);
+}