/** * This program is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see * . * * (c) Vincenzo Nicosia 2009-2017 -- * * This file is part of NetBunch, a package for complex network * analysis and modelling. For more information please visit: * * http://www.complex-networks.net/ * * If you use this software, please add a reference to * * V. Latora, V. Nicosia, G. Russo * "Complex Networks: Principles, Methods and Applications" * Cambridge University Press (2017) * ISBN: 9781107103184 * *********************************************************************** * * This program fits a set of values provided as input with a * power-law function using the Maximum-Likelihood Estimator (MLE). * * References: * * [1] A. Clauset, C. R. Shalizi, and M. E. J. Newman. "Power-law * distributions in empirical data". SIAM Rev. 51, (2007), * 661-703. * */ #include #include #include #include #include #include "utils.h" void usage(char *argv[]){ printf("********************************************************************\n" "** **\n" "** -*- fitmle -*- **\n" "** **\n" "** Fit a set of values with a power-law function using the **\n" "** Maximum-Likelihood Estimator (MLE). The program implements **\n" "** the MLE for both continuous and discrete values, and **\n" "** selects the appropriate one automatically. **\n" "** **\n" "** The input file 'data_in' contains one value on each row. **\n" "** If 'data_in' is '-' (dash), read the values from the **\n" "** standard input (STDIN). **\n" "** **\n" "** The program prints on output a single row, in the format: **\n" "** **\n" "** gamma x_min ks **\n" "** **\n" "** where 'gamma' is the exponent of the best fit, 'x_min' is **\n" "** the smallest of the input values after which the power-law **\n" "** hypothesis hold, and 'ks' is the value of the KS statistics **\n" "** for the best fit. **\n" "** **\n" "** The second (optional) parameter 'tol' sets the tolerance **\n" "** on the acceptable statistical error of the fit (set to **\n" "** 0.1 if no value is provided). **\n" "** **\n" "** If the third parameter is 'TEST', the program uses the **\n" "** the Kolmogorov-Smirnov test on a series of bootstrapped **\n" "** power-law sequences to estimate the p-value of the fit, and **\n" "** the output is in the format: **\n" "** **\n" "** gamma x_min ks p_value **\n" "** **\n" "** Higher values of 'p_value' correspond to better fits. **\n" "** **\n" "** If 'num_test' is provided, use that number of bootstrapped **\n" "** sequences to compute the p-value. Otherwise, use 100 **\n" "** sequences. **\n" "** **\n" "********************************************************************\n" " This is Free Software - You can use and distribute it under \n" " the terms of the GNU General Public License, version 3 or later\n\n" " (c) Vincenzo Nicosia 2010-2017 (v.nicosia@qmul.ac.uk)\n\n" "********************************************************************\n\n" ); printf("Usage: %s [ [ TEST []]]\n", argv[0]); } void load_data(char *fname, double **data, unsigned int *N, char sort){ FILE *fin; char error_str[256]; char buff[256]; char *ptr; double x; unsigned int size; size = 10; *data = malloc(size * sizeof(double)); *N=0; if (!strcmp(fname, "-")){ fin = stdin; } else{ fin = fopen(fname, "r"); } if (!fin){ sprintf(error_str, "Error opening file %s", fname); perror(error_str); exit(3); } while(fgets(buff, 256, fin)){ ptr = strtok(buff, " "); if (ptr[0] == '#') continue; x = atof(ptr); if (*N == size){ size += 10; *data = realloc(*data, size*sizeof(double)); } (*data)[*N] = x; *N += 1; } *data = realloc(*data, (*N)*sizeof(double)); if (sort){ qsort(*data, *N, sizeof(double), compare_double); } fclose(fin); return; } /** * find the first position at which the element x appears in the * "data" array using binary search (the array is sorted in * increasing order of x) */ int find_pos_x(double x,double *data, unsigned int N){ int H, L, M; L = 0; H = N-1; while(L<=H){ M = (H + L)/2; if (x == data[M]){ while(M >=0 && x == data[M]) M--; if (M==-1){ return 0; } return M+1; /* we return the index of the first appearance of element x*/ } else if (x < data[M]){ H = M-1; } else if (x > data[M]){ L = M+1; } } return -1; /* the value is not present */ } double fit_alpha(double *data, unsigned int N, double xmin, unsigned int idx){ double alpha = 0; int i; for(i = idx; i < N; i++){ alpha += log(data[i]*1.0/(xmin-0.5)); } alpha = 1 + (N - idx) * 1.0 / alpha; return alpha; } double fit_alpha_continuous(double *data, unsigned int N, double xmin, unsigned int idx){ double alpha = 0; int i; for(i = idx; i < N; i++){ alpha += log(data[i]*1.0/(xmin)); } alpha = 1 + (N - idx) * 1.0 / alpha; return alpha; } /* * * Kolmogorov-Smirnov test * */ double KS (double *data, unsigned int N, double alpha, int idx){ double c_data, c_theo, val; double xmin, x; double dist, max_dist = -1; int i; c_data = c_theo = 0.0; xmin = data[idx]; for (i=idx; i max_dist){ max_dist = dist; } i++; } c_data = 1.0 * (i- idx)/(N-idx); } return max_dist; } void dump_data_double(double *v, unsigned int N){ int i; if (N < 1){ return; } printf("%g",v[0]); for(i=1; i ks){ num += 1; } } free(v); return 1.0 * num / num_test; } /* We assume that the data set has already been sorted in ascending order */ int is_continuous(double *data, int N){ int i; for (i=0; i 2) tol = atof(argv[2]); else tol = 0.1; if (argc > 3 && !my_strcasecmp(argv[3], "TEST")){ test = 1; } if(argc > 4){ num_test = atoi(argv[4]); } else num_test = 100; srand(time(NULL)); load_data(argv[1], &data, &N, 1); if(is_continuous(data, N)){ fprintf(stderr, "Using continuous fit\n"); fit_func = best_fit_continuous; } else{ fprintf(stderr, "Using discrete fit\n"); fit_func = best_fit; } scaling_fact = 1.0; fit_func(data, N, &alpha, &xmin, tol); i = find_pos_x(xmin, data, N); ks = KS(data, N, alpha, i); if (test){ p_value = test_powerlaw(data, N, alpha, xmin, ks, num_test, fit_func); printf("%g %g %g %g\n", alpha, xmin/scaling_fact, ks, p_value); } else{ printf("%g %g %g\n", alpha, xmin / scaling_fact, ks); } free(data); }