Source code for hierarc.Likelihood.KDELikelihood.kde_likelihood

__author__ = "martin-millon"

import os

import numpy as np
import pandas as pd
from sklearn.neighbors import KernelDensity

import hierarc

LIKELIHOOD_TYPES = ["kde_hist_nd", "kde_full"]
_PATH_2_PLANCKDATA = os.path.join(os.path.dirname(hierarc.__file__), "Data", "Planck")


[docs] class KDELikelihood(object): """ KDE likelihood class. Provide a Chain object that will be used as your likelihood. __warning:: This class is not fully tested for more than 5 free parameters. Use at your own risk. __note:: Parameters need to be rescaled between 0 and 1 for this to work optimally. Think about rescaling your Chain with the "rescale = True" option. """ def __init__( self, chain, likelihood_type="kde_hist_nd", weight_type="default", kde_kernel="gaussian", bandwidth=0.01, nbins_hist=30, ): """ :param chain: (Likelihood.chain.Chain). Chain object to be evaluated with a kernel density estimator :param likelihood_type: (str). "kde_hist_nd" or "kde_full". Use "kde_hist_nd" in most cases as it is much faster and do not decrease much the precision :param weight_type: (str). Name of the weight to use. You can provide several type of weights for your samples. This is usefull when you importance sampling :param kde_kernel: (str). Kernel type to be passed to scikit-learn. Default : 'gaussian'. :param bandwidth: (float). Bandwidth of the kernel. Default : 0.01. Works well if parameters are rescaled between 0 and 1. :param nbins_hist: (float). Number of bins to use before fitting KDE. Used only if likelihood_type = 'kde_hist_nd'. """ self.chain = chain self.loglikelihood_type = likelihood_type self.weight_type = weight_type self.kde_kernel = kde_kernel self.bandwidth = bandwidth self.nbins_hist = nbins_hist self.init_loglikelihood()
[docs] def init_loglikelihood(self): """Initialisation of the KDE, depending on loglikelihood_type.""" if self.loglikelihood_type == "kde_full": self.kde = self.init_kernel_full( kde_kernel=self.kde_kernel, bandwidth=self.bandwidth ) self.loglikelihood = self.kdelikelihood() elif self.loglikelihood_type == "kde_hist_nd": self.kde = self.init_kernel_kdelikelihood_hist_nd( kde_kernel=self.kde_kernel, bandwidth=self.bandwidth, nbins_hist=self.nbins_hist, ) self.loglikelihood = self.kdelikelihood() else: raise NameError( "likelihood_type %s not supported! Supported are %s." % (self.loglikelihood_type, LIKELIHOOD_TYPES) )
[docs] def kdelikelihood(self): """Evaluates the likelihood. Return a function. __ warning:: you should adjust bandwidth to the spacing of your samples chain! """ return self.kde.score
[docs] def kdelikelihood_samples(self, samples): """Evaluates the likelihood on an array. Return an array. __ warning:: you should adjust bandwidth to the spacing of your samples chain! """ return self.kde.score_samples(samples)
[docs] def init_kernel_full(self, kde_kernel, bandwidth): """ :param kde_kernel: kde_kernel: (str). Kernel type to be passed to scikit-learn. Default : 'gaussian'. :param bandwidth: (float). Bandwidth of the kernel. Default : 0.01. Works well if parameters are rescaled between 0 and 1. :return: scikit-learn KernelDensity """ data = pd.DataFrame.from_dict(self.chain.params) kde = KernelDensity(kernel=kde_kernel, bandwidth=bandwidth).fit( data.values, sample_weight=self.chain.weights[self.weight_type] ) return kde
[docs] def init_kernel_kdelikelihood_hist_nd(self, kde_kernel, bandwidth, nbins_hist): """Evaluates the likelihood from a Kernel Density Estimator. The KDE is constructed using a binned version of the full samples. Greatly improves speed at the cost of a (tiny) loss in precision. __warning:: you should adjust bandwidth and nbins_hist to the spacing and size of your samples chain! __note:: nbins_hist refer to the number of bins per dimension. Hence, the final number of bins will be nbins_hist**n :param kde_kernel: kde_kernel: (str). Kernel type to be passed to scikit-learn. Default : 'gaussian'. :param bandwidth: (float). Bandwidth of the kernel. Default : 0.01. Works well if parameters are rescaled between 0 and 1. :param nbins_hist: (float). Number of bins to use before fitting KDE. Used only if likelihood_type = 'kde_hist_nd'. :return: scikit-learn KernelDensity """ samples = np.asarray( [self.chain.params[keys] for keys in self.chain.params.keys()] ) hist, edges = np.histogramdd(samples.T, bins=nbins_hist) edges = np.asarray(edges) dic = {} ndim = len(self.chain.params.keys()) keys = list(self.chain.params.keys()) for i, key in enumerate(keys): dic[key] = [ (p + edges[i, j + 1]) / 2.0 for j, p in enumerate(edges[i, :-1]) ] # for some reasons that are not obvious, ravel do not reverse meshgrid, indexing argument is really needed here mesh_tuple = np.meshgrid(*[dic[key] for key in keys], indexing="ij") meshdic = [] for i, key in enumerate(keys): meshdic.append(mesh_tuple[i].flatten()) meshdic = np.asarray(meshdic) kde_bins = pd.DataFrame(meshdic.T, columns=keys) kde_weights = np.ravel(hist) # remove the zero weights values kde_bins = kde_bins[kde_weights > 0] kde_weights = kde_weights[kde_weights > 0] # fit the KDE kde = KernelDensity(kernel=kde_kernel, bandwidth=bandwidth).fit( kde_bins.values, sample_weight=kde_weights ) return kde