summaryrefslogtreecommitdiff
path: root/src/fitmle/fitmle.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/fitmle/fitmle.c')
-rw-r--r--src/fitmle/fitmle.c479
1 files changed, 479 insertions, 0 deletions
diff --git a/src/fitmle/fitmle.c b/src/fitmle/fitmle.c
new file mode 100644
index 0000000..53cf448
--- /dev/null
+++ b/src/fitmle/fitmle.c
@@ -0,0 +1,479 @@
+/**
+ * This program is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * (c) Vincenzo Nicosia 2009-2017 -- <v.nicosia@qmul.ac.uk>
+ *
+ * This file is part of NetBunch, a package for complex network
+ * analysis and modelling. For more information please visit:
+ *
+ * http://www.complex-networks.net/
+ *
+ * If you use this software, please add a reference to
+ *
+ * V. Latora, V. Nicosia, G. Russo
+ * "Complex Networks: Principles, Methods and Applications"
+ * Cambridge University Press (2017)
+ * ISBN: 9781107103184
+ *
+ ***********************************************************************
+ *
+ * This program fits a set of values provided as input with a
+ * power-law function using the Maximum-Likelihood Estimator (MLE).
+ *
+ * References:
+ *
+ * [1] A. Clauset, C. R. Shalizi, and M. E. J. Newman. "Power-law
+ * distributions in empirical data". SIAM Rev. 51, (2007),
+ * 661-703.
+ *
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <time.h>
+
+#include "utils.h"
+
+void usage(char *argv[]){
+
+ printf("********************************************************************\n"
+ "** **\n"
+ "** -*- fitmle -*- **\n"
+ "** **\n"
+ "** Fit a set of values with a power-law function using the **\n"
+ "** Maximum-Likelihood Estimator (MLE). The program implements **\n"
+ "** the MLE for both continuous and discrete values, and **\n"
+ "** selects the appropriate one automatically. **\n"
+ "** **\n"
+ "** The input file 'data_in' contains one value on each row. **\n"
+ "** If 'data_in' is '-' (dash), read the values from the **\n"
+ "** standard input (STDIN). **\n"
+ "** **\n"
+ "** The program prints on output a single row, in the format: **\n"
+ "** **\n"
+ "** gamma x_min ks **\n"
+ "** **\n"
+ "** where 'gamma' is the exponent of the best fit, 'x_min' is **\n"
+ "** the smallest of the input values after which the power-law **\n"
+ "** hypothesis hold, and 'ks' is the value of the KS statistics **\n"
+ "** for the best fit. **\n"
+ "** **\n"
+ "** The second (optional) parameter 'tol' sets the tolerance **\n"
+ "** on the acceptable statistical error of the fit (set to **\n"
+ "** 0.1 if no value is provided). **\n"
+ "** **\n"
+ "** If the third parameter is 'TEST', the program uses the **\n"
+ "** the Kolmogorov-Smirnov test on a series of bootstrapped **\n"
+ "** power-law sequences to estimate the p-value of the fit, and **\n"
+ "** the output is in the format: **\n"
+ "** **\n"
+ "** gamma x_min ks p_value **\n"
+ "** **\n"
+ "** Higher values of 'p_value' correspond to better fits. **\n"
+ "** **\n"
+ "** If 'num_test' is provided, use that number of bootstrapped **\n"
+ "** sequences to compute the p-value. Otherwise, use 100 **\n"
+ "** sequences. **\n"
+ "** **\n"
+ "********************************************************************\n"
+ " This is Free Software - You can use and distribute it under \n"
+ " the terms of the GNU General Public License, version 3 or later\n\n"
+ " (c) Vincenzo Nicosia 2010-2017 (v.nicosia@qmul.ac.uk)\n\n"
+ "********************************************************************\n\n"
+ );
+ printf("Usage: %s <data_in> [<tol> [ TEST [<num_test>]]]\n", argv[0]);
+}
+
+
+void load_data(char *fname, double **data, unsigned int *N, char sort){
+
+ FILE *fin;
+ char error_str[256];
+ char buff[256];
+ char *ptr;
+ double x;
+ unsigned int size;
+
+ size = 10;
+
+ *data = malloc(size * sizeof(double));
+
+ *N=0;
+ if (!strcmp(fname, "-")){
+ fin = stdin;
+ }
+ else{
+ fin = fopen(fname, "r");
+ }
+ if (!fin){
+ sprintf(error_str, "Error opening file %s", fname);
+ perror(error_str);
+ exit(3);
+ }
+
+ while(fgets(buff, 256, fin)){
+ ptr = strtok(buff, " ");
+ if (ptr[0] == '#')
+ continue;
+ x = atof(ptr);
+ if (*N == size){
+ size += 10;
+ *data = realloc(*data, size*sizeof(double));
+ }
+ (*data)[*N] = x;
+ *N += 1;
+ }
+ *data = realloc(*data, (*N)*sizeof(double));
+ if (sort){
+ qsort(*data, *N, sizeof(double), compare_double);
+ }
+ fclose(fin);
+ return;
+}
+
+/**
+ * find the first position at which the element x appears in the
+ * "data" array using binary search (the array is sorted in
+ * increasing order of x)
+ */
+int find_pos_x(double x,double *data, unsigned int N){
+ int H, L, M;
+
+ L = 0;
+ H = N-1;
+
+ while(L<=H){
+ M = (H + L)/2;
+ if (x == data[M]){
+ while(M >=0 && x == data[M]) M--;
+ if (M==-1){
+ return 0;
+ }
+ return M+1; /* we return the index of the first appearance of element x*/
+ }
+ else if (x < data[M]){
+ H = M-1;
+ }
+ else if (x > data[M]){
+ L = M+1;
+ }
+ }
+ return -1; /* the value is not present */
+}
+
+double fit_alpha(double *data, unsigned int N, double xmin, unsigned int idx){
+
+ double alpha = 0;
+ int i;
+
+ for(i = idx; i < N; i++){
+ alpha += log(data[i]*1.0/(xmin-0.5));
+ }
+ alpha = 1 + (N - idx) * 1.0 / alpha;
+ return alpha;
+}
+
+
+double fit_alpha_continuous(double *data, unsigned int N, double xmin, unsigned int idx){
+
+ double alpha = 0;
+ int i;
+
+ for(i = idx; i < N; i++){
+ alpha += log(data[i]*1.0/(xmin));
+ }
+ alpha = 1 + (N - idx) * 1.0 / alpha;
+ return alpha;
+}
+
+
+/*
+ *
+ * Kolmogorov-Smirnov test
+ *
+ */
+
+double KS (double *data, unsigned int N, double alpha, int idx){
+
+ double c_data, c_theo, val;
+ double xmin, x;
+ double dist, max_dist = -1;
+ int i;
+
+ c_data = c_theo = 0.0;
+ xmin = data[idx];
+
+ for (i=idx; i<N;){
+ x = data[i];
+ while(i<N && data[i] == x){
+ val = xmin * 1.0 / data[i];
+ c_theo = 1.0 - pow(val, alpha-1.0);
+ dist = fabs(c_theo - c_data);
+
+ if (dist > max_dist){
+ max_dist = dist;
+ }
+ i++;
+ }
+ c_data = 1.0 * (i- idx)/(N-idx);
+ }
+ return max_dist;
+}
+
+void dump_data_double(double *v, unsigned int N){
+ int i;
+
+ if (N < 1){
+ return;
+ }
+ printf("%g",v[0]);
+ for(i=1; i<N; i++){
+ printf(" %g",v[i]);
+ }
+ printf("\n");
+}
+
+
+/* This is the same as best_fit, but for continuous-valued time-series */
+void best_fit_continuous(double *data, unsigned int N, double *alpha, double *xmin, double tol){
+
+ double min_x, max_x, x;
+ double cur_alpha, best_alpha, best_x, ks_test, best_ks;
+ int idx,i;
+
+
+
+
+ qsort(data, N, sizeof(double), compare_double);
+
+ min_x = data[0];
+ best_x = data[0];
+ max_x = data[N-1];
+ best_ks = 10000000;
+ cur_alpha = 0.0;
+ best_alpha = 0.0;
+ idx = 0;
+ for(x=min_x, i=0; x<=max_x && (cur_alpha)/sqrt(1.0 * (N-idx))< tol; ){
+ idx = find_pos_x(x, data, N);
+ if (idx <0){
+ i ++;
+ continue;
+ }
+ cur_alpha = fit_alpha_continuous(data, N, x, idx);
+ ks_test = KS(data, N, cur_alpha, idx);
+ if(ks_test < best_ks){
+ best_ks = ks_test;
+ best_alpha=cur_alpha;
+ best_x = data[idx];
+ }
+ while(data[i] == x && i <N){
+ i ++;
+ }
+ x = data[i];
+ }
+
+ *alpha = best_alpha;
+ *xmin = best_x;
+ return;
+}
+
+/* Fit a discrete power-law */
+
+void best_fit(double *data, unsigned int N, double *alpha, double *xmin, double tol){
+
+ double min_x, max_x, x;
+ double cur_alpha, best_alpha, best_x, ks_test, best_ks;
+ int idx, i;
+
+
+
+ qsort(data, N, sizeof(double), compare_double);
+ min_x = data[0];
+ best_x = data[0];
+ max_x = data[N-1];
+ best_ks = 10000000;
+ cur_alpha = 0.0;
+ best_alpha = 0.0;
+ idx = 0;
+ for(x=min_x, i = 0; x<=max_x && (cur_alpha)/sqrt(1.0 * (N-idx))< tol;){
+ idx = find_pos_x(x, data, N);
+
+ if (idx<0) {
+ i ++;
+ continue;
+ }
+ cur_alpha = fit_alpha(data, N, x, idx);
+
+ ks_test = KS(data, N, cur_alpha, idx);
+ if(ks_test < best_ks){
+ best_ks = ks_test;
+ best_alpha=cur_alpha;
+ best_x = data[idx];
+ }
+ while(data[i] == x && i<N){
+ i ++;
+ }
+ x = data[i];
+ }
+
+ *alpha = best_alpha;
+ *xmin = best_x;
+ return;
+}
+
+void sample_powerlaw(double *data, unsigned int N, double alpha, double xmin,double *v){
+ int i, last_idx, r;
+ double val;
+
+ i=0;
+ last_idx = 0;
+ while(data[i] < xmin){
+ last_idx ++;
+ i++;
+ }
+
+ i=0;
+ while(i <last_idx){
+ r = rand() % (last_idx + 1);
+ v[i] = data[r];
+ i++;
+ }
+
+ for(; i<N; i++){
+ val = 1.0 * rand()/RAND_MAX;
+ v[i] = floor(xmin * pow(1.0-val, -1.0/(alpha-1)));
+
+ }
+
+}
+
+
+double test_powerlaw(double *data, unsigned int N, double alpha, double xmin,
+ double ks, int num_test,
+ void (*f)(double *, unsigned int , double *, double *, double)){
+
+ int num = 0, i, idx;
+ double *v, cur_alpha, cur_xmin, cur_ks;
+
+
+ v = malloc(N * sizeof(double));
+
+ for(i=0; i<num_test; i++){
+ sample_powerlaw(data, N, alpha, xmin, v);
+ qsort(v, N, sizeof(double), compare_double);
+ f(v, N, &cur_alpha, &cur_xmin, 0.1);
+ idx = find_pos_x(cur_xmin, v, N);
+ cur_ks = KS(v, N, cur_alpha, idx);
+ if (cur_ks > ks){
+ num += 1;
+ }
+ }
+ free(v);
+ return 1.0 * num / num_test;
+}
+
+/* We assume that the data set has already been sorted in ascending
+ order */
+int is_continuous(double *data, int N){
+ int i;
+
+ for (i=0; i<N; i++){
+ if (data[i] - (double)(int)data[i] != 0.0)
+ return 1;
+ }
+ /* return 1 only if we need a real-valued fit....*/
+ return 0;
+}
+
+/* we assume that the data set has already been sorted in ascending
+ order */
+double renormalise(double *data, unsigned int N){
+
+ int i;
+ double scaling = 1.0;
+
+ if (data[0] < 1.0 ){
+ scaling = 1.0/data[0];
+ for (i=0; i<N; i++){
+ data[i] *= scaling;
+ }
+ }
+ return scaling;
+}
+
+
+int main(int argc, char *argv[]){
+
+ double *data=NULL;
+ unsigned int N, i;
+ double alpha, xmin, ks, tol, p_value, scaling_fact;
+ char test = 0;
+ int num_test;
+
+ void (*fit_func)(double *, unsigned int , double *, double *, double);
+
+
+ if (argc < 2){
+ usage(argv);
+ exit(1);
+ }
+
+ if (argc > 2)
+ tol = atof(argv[2]);
+ else
+ tol = 0.1;
+
+ if (argc > 3 && !my_strcasecmp(argv[3], "TEST")){
+ test = 1;
+ }
+
+ if(argc > 4){
+ num_test = atoi(argv[4]);
+ }
+ else
+ num_test = 100;
+
+ srand(time(NULL));
+
+
+
+ load_data(argv[1], &data, &N, 1);
+
+ if(is_continuous(data, N)){
+ fprintf(stderr, "Using continuous fit\n");
+ fit_func = best_fit_continuous;
+ }
+ else{
+ fprintf(stderr, "Using discrete fit\n");
+ fit_func = best_fit;
+ }
+
+ scaling_fact = 1.0;
+
+ fit_func(data, N, &alpha, &xmin, tol);
+ i = find_pos_x(xmin, data, N);
+ ks = KS(data, N, alpha, i);
+ if (test){
+ p_value = test_powerlaw(data, N, alpha, xmin, ks, num_test, fit_func);
+ printf("%g %g %g %g\n", alpha, xmin/scaling_fact, ks, p_value);
+ }
+ else{
+ printf("%g %g %g\n", alpha, xmin / scaling_fact, ks);
+ }
+ free(data);
+}