Source code for hierarc.Likelihood.KDELikelihood.kde_likelihood

__author__ = "martin-millon"

import os

import numpy as np
import pandas as pd
from sklearn.neighbors import KernelDensity

import hierarc

LIKELIHOOD_TYPES = ["kde_hist_nd", "kde_full"]
_PATH_2_PLANCKDATA = os.path.join(os.path.dirname(hierarc.__file__), "Data", "Planck")



[docs]
class KDELikelihood(object):
    """
    KDE likelihood class. Provide a Chain object that will be used as your likelihood.
    __warning:: This class is not fully tested for more than 5 free parameters. Use at your own risk.

    __note:: Parameters need to be rescaled between 0 and 1 for this to work optimally. Think about rescaling your Chain
    with the "rescale = True" option.

    """

    def __init__(
        self,
        chain,
        likelihood_type="kde_hist_nd",
        weight_type="default",
        kde_kernel="gaussian",
        bandwidth=0.01,
        nbins_hist=30,
    ):
        """

        :param chain: (Likelihood.chain.Chain). Chain object to be evaluated with a kernel density estimator
        :param likelihood_type: (str). "kde_hist_nd" or "kde_full". Use "kde_hist_nd" in most cases as it is much
         faster and do not decrease much the precision
        :param weight_type: (str). Name of the weight to use. You can provide several type of weights for your samples.
         This is usefull when you importance sampling
        :param kde_kernel: (str). Kernel type to be passed to scikit-learn. Default : 'gaussian'.
        :param bandwidth: (float). Bandwidth of the kernel. Default : 0.01. Works well if parameters are rescaled
         between 0 and 1.
        :param nbins_hist: (float). Number of bins to use before fitting KDE. Used only if
         likelihood_type = 'kde_hist_nd'.
        """

        self.chain = chain
        self.loglikelihood_type = likelihood_type
        self.weight_type = weight_type
        self.kde_kernel = kde_kernel
        self.bandwidth = bandwidth
        self.nbins_hist = nbins_hist
        self.init_loglikelihood()


[docs]
    def init_loglikelihood(self):
        """Initialisation of the KDE, depending on loglikelihood_type."""
        if self.loglikelihood_type == "kde_full":
            self.kde = self.init_kernel_full(
                kde_kernel=self.kde_kernel, bandwidth=self.bandwidth
            )
            self.loglikelihood = self.kdelikelihood()
        elif self.loglikelihood_type == "kde_hist_nd":
            self.kde = self.init_kernel_kdelikelihood_hist_nd(
                kde_kernel=self.kde_kernel,
                bandwidth=self.bandwidth,
                nbins_hist=self.nbins_hist,
            )
            self.loglikelihood = self.kdelikelihood()
        else:
            raise NameError(
                "likelihood_type %s not supported! Supported are %s."
                % (self.loglikelihood_type, LIKELIHOOD_TYPES)
            )



[docs]
    def kdelikelihood(self):
        """Evaluates the likelihood. Return a function.

        __ warning:: you should adjust bandwidth to the spacing of your samples chain!
        """
        return self.kde.score



[docs]
    def kdelikelihood_samples(self, samples):
        """Evaluates the likelihood on an array. Return an array.

        __ warning:: you should adjust bandwidth to the spacing of your samples chain!
        """
        return self.kde.score_samples(samples)



[docs]
    def init_kernel_full(self, kde_kernel, bandwidth):
        """

        :param kde_kernel: kde_kernel: (str). Kernel type to be passed to scikit-learn. Default : 'gaussian'.
        :param bandwidth: (float). Bandwidth of the kernel. Default : 0.01. Works well if parameters are rescaled
         between 0 and 1.
        :return: scikit-learn KernelDensity
        """
        data = pd.DataFrame.from_dict(self.chain.params)
        kde = KernelDensity(kernel=kde_kernel, bandwidth=bandwidth).fit(
            data.values, sample_weight=self.chain.weights[self.weight_type]
        )
        return kde



[docs]
    def init_kernel_kdelikelihood_hist_nd(self, kde_kernel, bandwidth, nbins_hist):
        """Evaluates the likelihood from a Kernel Density Estimator. The KDE is
        constructed using a binned version of the full samples. Greatly improves speed
        at the cost of a (tiny) loss in precision.

        __warning:: you should adjust bandwidth and nbins_hist to the spacing and size of your samples chain!

        __note:: nbins_hist refer to the number of bins per dimension. Hence, the final number of bins will be nbins_hist**n

        :param kde_kernel: kde_kernel: (str). Kernel type to be passed to scikit-learn. Default : 'gaussian'.
        :param bandwidth: (float). Bandwidth of the kernel. Default : 0.01. Works well if parameters are rescaled between 0 and 1.
        :param nbins_hist: (float). Number of bins to use before fitting KDE. Used only if likelihood_type = 'kde_hist_nd'.
        :return: scikit-learn KernelDensity
        """
        samples = np.asarray(
            [self.chain.params[keys] for keys in self.chain.params.keys()]
        )
        hist, edges = np.histogramdd(samples.T, bins=nbins_hist)
        edges = np.asarray(edges)

        dic = {}
        ndim = len(self.chain.params.keys())
        keys = list(self.chain.params.keys())
        for i, key in enumerate(keys):
            dic[key] = [
                (p + edges[i, j + 1]) / 2.0 for j, p in enumerate(edges[i, :-1])
            ]

        # for some reasons that are not obvious, ravel do not reverse meshgrid, indexing argument is really needed here
        mesh_tuple = np.meshgrid(*[dic[key] for key in keys], indexing="ij")
        meshdic = []
        for i, key in enumerate(keys):
            meshdic.append(mesh_tuple[i].flatten())
        meshdic = np.asarray(meshdic)
        kde_bins = pd.DataFrame(meshdic.T, columns=keys)
        kde_weights = np.ravel(hist)

        # remove the zero weights values
        kde_bins = kde_bins[kde_weights > 0]
        kde_weights = kde_weights[kde_weights > 0]

        # fit the KDE
        kde = KernelDensity(kernel=kde_kernel, bandwidth=bandwidth).fit(
            kde_bins.values, sample_weight=kde_weights
        )
        return kde