/**
* This program is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see
* .
*
* (c) Vincenzo Nicosia 2009-2017 --
*
* This file is part of NetBunch, a package for complex network
* analysis and modelling. For more information please visit:
*
* http://www.complex-networks.net/
*
* If you use this software, please add a reference to
*
* V. Latora, V. Nicosia, G. Russo
* "Complex Networks: Principles, Methods and Applications"
* Cambridge University Press (2017)
* ISBN: 9781107103184
*
***********************************************************************
*
* This program fits a set of values provided as input with a
* power-law function using the Maximum-Likelihood Estimator (MLE).
*
* References:
*
* [1] A. Clauset, C. R. Shalizi, and M. E. J. Newman. "Power-law
* distributions in empirical data". SIAM Rev. 51, (2007),
* 661-703.
*
*/
#include
#include
#include
#include
#include
#include "utils.h"
void usage(char *argv[]){
printf("********************************************************************\n"
"** **\n"
"** -*- fitmle -*- **\n"
"** **\n"
"** Fit a set of values with a power-law function using the **\n"
"** Maximum-Likelihood Estimator (MLE). The program implements **\n"
"** the MLE for both continuous and discrete values, and **\n"
"** selects the appropriate one automatically. **\n"
"** **\n"
"** The input file 'data_in' contains one value on each row. **\n"
"** If 'data_in' is '-' (dash), read the values from the **\n"
"** standard input (STDIN). **\n"
"** **\n"
"** The program prints on output a single row, in the format: **\n"
"** **\n"
"** gamma x_min ks **\n"
"** **\n"
"** where 'gamma' is the exponent of the best fit, 'x_min' is **\n"
"** the smallest of the input values after which the power-law **\n"
"** hypothesis hold, and 'ks' is the value of the KS statistics **\n"
"** for the best fit. **\n"
"** **\n"
"** The second (optional) parameter 'tol' sets the tolerance **\n"
"** on the acceptable statistical error of the fit (set to **\n"
"** 0.1 if no value is provided). **\n"
"** **\n"
"** If the third parameter is 'TEST', the program uses the **\n"
"** the Kolmogorov-Smirnov test on a series of bootstrapped **\n"
"** power-law sequences to estimate the p-value of the fit, and **\n"
"** the output is in the format: **\n"
"** **\n"
"** gamma x_min ks p_value **\n"
"** **\n"
"** Higher values of 'p_value' correspond to better fits. **\n"
"** **\n"
"** If 'num_test' is provided, use that number of bootstrapped **\n"
"** sequences to compute the p-value. Otherwise, use 100 **\n"
"** sequences. **\n"
"** **\n"
"********************************************************************\n"
" This is Free Software - You can use and distribute it under \n"
" the terms of the GNU General Public License, version 3 or later\n\n"
" (c) Vincenzo Nicosia 2010-2017 (v.nicosia@qmul.ac.uk)\n\n"
"********************************************************************\n\n"
);
printf("Usage: %s [ [ TEST []]]\n", argv[0]);
}
void load_data(char *fname, double **data, unsigned int *N, char sort){
FILE *fin;
char error_str[256];
char buff[256];
char *ptr;
double x;
unsigned int size;
size = 10;
*data = malloc(size * sizeof(double));
*N=0;
if (!strcmp(fname, "-")){
fin = stdin;
}
else{
fin = fopen(fname, "r");
}
if (!fin){
sprintf(error_str, "Error opening file %s", fname);
perror(error_str);
exit(3);
}
while(fgets(buff, 256, fin)){
ptr = strtok(buff, " ");
if (ptr[0] == '#')
continue;
x = atof(ptr);
if (*N == size){
size += 10;
*data = realloc(*data, size*sizeof(double));
}
(*data)[*N] = x;
*N += 1;
}
*data = realloc(*data, (*N)*sizeof(double));
if (sort){
qsort(*data, *N, sizeof(double), compare_double);
}
fclose(fin);
return;
}
/**
* find the first position at which the element x appears in the
* "data" array using binary search (the array is sorted in
* increasing order of x)
*/
int find_pos_x(double x,double *data, unsigned int N){
int H, L, M;
L = 0;
H = N-1;
while(L<=H){
M = (H + L)/2;
if (x == data[M]){
while(M >=0 && x == data[M]) M--;
if (M==-1){
return 0;
}
return M+1; /* we return the index of the first appearance of element x*/
}
else if (x < data[M]){
H = M-1;
}
else if (x > data[M]){
L = M+1;
}
}
return -1; /* the value is not present */
}
double fit_alpha(double *data, unsigned int N, double xmin, unsigned int idx){
double alpha = 0;
int i;
for(i = idx; i < N; i++){
alpha += log(data[i]*1.0/(xmin-0.5));
}
alpha = 1 + (N - idx) * 1.0 / alpha;
return alpha;
}
double fit_alpha_continuous(double *data, unsigned int N, double xmin, unsigned int idx){
double alpha = 0;
int i;
for(i = idx; i < N; i++){
alpha += log(data[i]*1.0/(xmin));
}
alpha = 1 + (N - idx) * 1.0 / alpha;
return alpha;
}
/*
*
* Kolmogorov-Smirnov test
*
*/
double KS (double *data, unsigned int N, double alpha, int idx){
double c_data, c_theo, val;
double xmin, x;
double dist, max_dist = -1;
int i;
c_data = c_theo = 0.0;
xmin = data[idx];
for (i=idx; i max_dist){
max_dist = dist;
}
i++;
}
c_data = 1.0 * (i- idx)/(N-idx);
}
return max_dist;
}
void dump_data_double(double *v, unsigned int N){
int i;
if (N < 1){
return;
}
printf("%g",v[0]);
for(i=1; i ks){
num += 1;
}
}
free(v);
return 1.0 * num / num_test;
}
/* We assume that the data set has already been sorted in ascending
order */
int is_continuous(double *data, int N){
int i;
for (i=0; i 2)
tol = atof(argv[2]);
else
tol = 0.1;
if (argc > 3 && !my_strcasecmp(argv[3], "TEST")){
test = 1;
}
if(argc > 4){
num_test = atoi(argv[4]);
}
else
num_test = 100;
srand(time(NULL));
load_data(argv[1], &data, &N, 1);
if(is_continuous(data, N)){
fprintf(stderr, "Using continuous fit\n");
fit_func = best_fit_continuous;
}
else{
fprintf(stderr, "Using discrete fit\n");
fit_func = best_fit;
}
scaling_fact = 1.0;
fit_func(data, N, &alpha, &xmin, tol);
i = find_pos_x(xmin, data, N);
ks = KS(data, N, alpha, i);
if (test){
p_value = test_powerlaw(data, N, alpha, xmin, ks, num_test, fit_func);
printf("%g %g %g %g\n", alpha, xmin/scaling_fact, ks, p_value);
}
else{
printf("%g %g %g\n", alpha, xmin / scaling_fact, ks);
}
free(data);
}