Source code for powerlaw.regression

import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt
from scipy.special import zeta

from distribution import frequency_distribution, powerlaw_series, random_series
from utils import unique

from math import pow, e, log, sqrt
import sys
import random


[docs]def least_square_regression(x, y, xlabel = "x", ylabel = "y", prefix="", suffix=""):
    """

    Perform least square regression to find the best fit line and returns the slope of the line.

    **Parameters**
        x : List of values along x axis.
        y : List of values along y axis.   

    """

    X = np.asarray(x).reshape((len(x), 1))
    Y = np.asarray(y).reshape((len(y), 1))
    regr = linear_model.LinearRegression()
    regr.fit(X, Y)

    label_string = "Best fit line, y = "+str(regr.coef_[0][0])+" * x + "+str(regr.intercept_[0])
    print label_string
    print "Residual sum of squares: %.2f" % np.mean((regr.predict(X) - Y) ** 2)
    print "Variance score: %.2f" % regr.score(X, Y)

    # Plot outputs
    original_data, = plt.plot(X, Y,'go', label="original data")
    # best_fit_line, = plt.plot(X, map(lambda x: pow(e, -x), X), 'bo', label=label_string)
    best_fit_line, = plt.plot(X, regr.predict(X), color='blue', linewidth=3, label=label_string)
    plt.title("Least Square Regression"+suffix)
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)

    curves = [original_data, best_fit_line]
    labels = [curve.get_label() for curve in curves]

    plt.legend(curves, labels)
    plt.savefig(prefix+"least_square_regression_fit"+suffix+".png")
    plt.show()
    return regr.coef_[0][0]

[docs]def estimate_scaling_parameter(series, xmin = 1, discrete = False):
    
    """

    Perform Method of Maximum Liklihood (MLE) to find the best fit value of Alpha.

    **Parameters**
    
        series : series of data to be fit.
        xmin : Float/Integer, xmin for the distribution - assumed to be known before-hand. Default value is 1.0
        discrete : Boolean, whether to treat series as discrete or continous. Default value is False.

    **Returns**

        Estimated Alpha value.

    """
    
    normalizing_constant = 0.0
    if(discrete):
        normalizing_constant = 0.5
    partial_sum = 0.0
    count = 0.0
    # print series
    for x in series:
        partial_sum += log (x/(xmin - normalizing_constant))
        count+=1
    Alpha = 1.0 + count*(1/partial_sum) 
    return Alpha

[docs]def estimate_parameters(series, min_size_series = 50, discrete = False):
    """
    
    Apply Clauset et al.'s method to find the best fit value of xmin and Alpha.

    **Parameters**

        series : series of data to be fit.
        
        min_size_series : Minimum possible size of the distribution to which power-law fit will be attempted. Fitting power-law to a very small series would give biased results where power-law may appear to be a good fit even when data is not drawn from power-law distribution. The default value is taken to be 50 as suggested in the paper.

        discrete : Boolean, whether to treat series as discrete or continous. Default value is False

    **Returns**

        Tuple of (Estimated xmin, Estimated Alpha value, minimum KS statistics score).

    """

    sorted_series = sorted(series)
    xmin_candidates = []
    x_prev = sorted_series[0]
    xmin_candidates.append(x_prev)
    for x in sorted_series:
        if(x>x_prev):
            x_prev = x
            xmin_candidates.append(x_prev)

    ks_statistics_min = sys.maxint;
    xmin_result = 0
    Alpha_result = 2
    for xmin in xmin_candidates[:-1*(min_size_series-1)]: 
        data =  filter(lambda x: x>=xmin, sorted_series)
        estimated_Alpha = estimate_scaling_parameter(data, xmin)
        if(discrete):
            Px = [zeta(estimated_Alpha, x)/zeta(estimated_Alpha, xmin) for x in unique(data)]
        else:
            Px = [pow(float(x)/xmin, 1 - estimated_Alpha ) for x in unique(data)]
        n = len(Px)
        Sx = [i[1]/n for i in frequency_distribution(data, pdf=False)]
        ks_statistics = max( map (lambda counter: abs(Sx[counter] - Px[counter]) , range(0, n) ) )
        if(ks_statistics<ks_statistics_min):
            ks_statistics_min = ks_statistics
            xmin_result = xmin
            Alpha_result = estimated_Alpha

    return (xmin_result, Alpha_result, ks_statistics_min)

[docs]def generate_dataset(series, xmin, alpha, epsilon = 0.01):

    """
    
    Generator to generate datasets for goodness_of_fit test.

    **Parameters**

        series : series of data on which the power-law model was fitted.

        xmin : xmin for the fitted power-law model.

        alpha : alpha for the fitted power-law model.

        epsilon : desired accuracy in p-value. Default is set to 0.01

    **Returns**

        A generator to generate list of numbers (datasets).

    """
    number_of_datasets = int(round(0.25/(epsilon**2)) +1)
    print number_of_datasets
    n = len(series)
    non_powerlaw_series = filter(lambda x: x<xmin, series)
    ntail = n - len(non_powerlaw_series)
    p = float(ntail)/n
    # print p
    # print ntail
    # print n

    for i in range(0, number_of_datasets):
        dataset = []
        count_powerlaw_series = 0
        # how many numbers are to be picked from powerlaw distribution
        for random_number in random_series(n):
            if(random_number<=p):
                count_powerlaw_series+=1
                # generate number from power-law distribution
            else:
                # pick number from non_powerlaw_series
              dataset.append(random.choice(non_powerlaw_series))
        
        dataset = dataset + [i for i in powerlaw_series(Alpha = alpha, xmin = xmin, n = count_powerlaw_series)]

        yield dataset

[docs]def goodness_of_fit(series, xmin, alpha, ks_statistics, epsilon = 0.01, min_size_series = 50):

    """
    
    Function to calculate the p-value as a measure of goodness_of_fit for the fitted model.

    **Parameters**

        series : series of data on which the power-law model was fitted.

        xmin : xmin for the fitted power-law model.

        alpha : alpha for the fitted power-law model.

        ks_statistics : KS statistics for the fitted power-law model.

        epsilon : desired accuracy in p-value. Default is set to 0.01.

        min_size_series : Minimum possible size of the distribution to which power-law fit will be attempted. This value is used when fitting power-law to the generated datasets. The default value is taken to be 50. For further details, see `estimate_parameters()`.

    **Returns**

        p-value for the fitted model.

    """

    count_dataset = 0.0
    # number of synthetic datasets tested
    n1 = 0.0
    # number of synthetic datasets where ks value is greater than ks value for given data 
    for dataset in generate_dataset(series=series, xmin=xmin, alpha=alpha, epsilon=epsilon):
        count_dataset+=1.0
        (xmin_dataset, alpha_dataset, ks_statistics_dataset) = estimate_parameters(series=dataset, min_size_series = min_size_series)
        if(ks_statistics_dataset>ks_statistics):
            n1+=1.0
    return n1/count_dataset


if __name__ == "__main__":
    n = 10
    data = [i for i in powerlaw_series(n=n, xmin = 20, Alpha = 2.6)]
    # print data
    (xmin, alpha, ks_statistics) = estimate_parameters(series=data, min_size_series = 5)
    print "xmin = "+str(xmin)
    print "alpha = "+str(alpha)

    print goodness_of_fit(series=data, xmin=xmin, alpha=alpha, ks_statistics=ks_statistics, epsilon = 0.01, min_size_series = 50)