Source code for seak.lrt

"""Contains classes and functions for Likelihood ratio tests.

The class :class:`LRTnoK` implements the set-based likelihood ratio test for continuous phenotypes. A background kernel is not supported.

Null model:

.. math:: y = {\\alpha} X + {\\epsilon}

Alternative model:

.. math:: y = {\\alpha} X + {\\gamma} G_1 + {\\epsilon}

With: :math:`X`: covariates, dimension :math:`nxc` (:math:`n:=` number of individuals, :math:`c:=` number of covariates),
:math:`G_1`: set of variants to test, dimensions :math:`nxk` (:math:`n:=` number of individuals, :math:`k:=` number of variants in set to test association for).

"""

import numpy as np
from numpy.lib.scimath import logn
import scipy as sp
from scipy.stats import chi2
from scipy.linalg import eigh
from numpy.linalg import svd, LinAlgError

import logging

import fastlmm.util.mingrid as mingrid
from fastlmm.util.stats import linreg
from fastlmm.inference import lmm

from seak.optimizer import optimize_lambda


[docs]def make_lambdagrid(lambda0, gridlength, log_grid_lo, log_grid_hi): ''' Port of the R-function to create the lambda grid (used inside RLRTsim). Creates a grid of lambda values to optimize over. ''' if lambda0 == 0: grid = np.exp(np.linspace(log_grid_lo, log_grid_hi, num=gridlength - 1)) return np.concatenate([np.zeros(1), grid]) else: # Note: I'm not sure this was ever used recently, as there seems to be a bug in the R-code (?) leftratio = np.min([np.max([np.log(lambda0) / (log_grid_hi - log_grid_lo), 2.]), 0.8]) leftlength = np.max(np.round(leftratio * gridlength) - 1, 2) leftdistance = lambda0 - np.exp(log_grid_lo) # make sure leftlength doesn't split the left side into too small parts: if leftdistance < (leftlength * 10 * np.finfo(float).eps): leftlength = np.max([np.round(leftdistance / (10 * np.finfo(float).eps)), 2]) # leftdistance approx. 1 ==> make a regular grid, since # (1 +- epsilon)^((1:n)/n) makes a too concentrated grid if (np.abs(leftdistance - 1) < 0.3): leftgrid = np.linspace(np.exp(log_grid_lo), lambda0, num=leftlength + 1)[:-1] else: leftdiffs = np.where([leftdistance > 1] * leftlength - 1, leftdistance ** (np.arange(2, leftlength + 1) / leftlength) - leftdistance ** (1. / leftlength), leftdistance ** (np.arange(leftlength - 1, 1) - leftdistance ** leftlength)) leftgrid = lambda0 - leftdiffs[::-1] rightlength = gridlength - leftlength rightdistance = np.exp(log_grid_hi) - lambda0 rightdiffs = rightdistance ** (np.arange(2, rightlength + 1) / rightlength - rightdistance ** (1 / rightlength)) rightgrid = lambda0 + rightdiffs return np.concatenate([np.zeros(1), leftgrid, lambda0, rightgrid])
[docs]def rlrsim_loop(p, k, n, nsim, g, q, mu, lambd, lambda0, xi=None, REML=True): ''' Python port of the RLRsim.cpp function using scipy & numpy This is a "naive" port with the same loops used in the C++ code It's recommended to use rlrsim() instead, as it's much faster. This function is mainly here for documentation purposes. See :func:`rlrsim` ''' dfChiK = n - p - k if (n - p - k) > 0 else 0 if REML: n0 = n - p xi = mu else: assert xi is not None, "xi can't be None if REML == False !" n0 = n # pre-compute stuff that stays constant over simulations # broadcast to shape (g, k) lambdamu = lambd[:, np.newaxis] * mu lambdamuP1 = lambdamu + 1. # broadcast to shape (g, k) fN = ((lambd - lambda0)[:, np.newaxis] * mu) / lambdamuP1 fD = (1 + lambda0 * mu) / lambdamuP1 # shape (g,) sumlog1plambdaxi = np.sum(np.log1p(lambd[:,np.newaxis] * xi), axis=1) # simulations res = np.zeros(nsim) lambdaind = np.zeros(nsim, dtype=int) # replace this loop with map? for i_s in range(nsim): LR = 0. ChiSum = 0 ChiK = chi2(dfChiK).rvs(1) # Xnpk Chi1 = chi2(1).rvs(k) # ws^2 if not REML: ChiSum = Chi1.sum() # loop over lambda values: for i_g in range(g): N = (fN[i_g, :] * Chi1).sum() D = (fD[i_g, :] * Chi1).sum() D += ChiK LR = n0 * np.log1p(N / D) - sumlog1plambdaxi[i_g] if LR >= res[i_s]: res[i_s] = LR lambdaind[i_s] = i_g else: break if not REML: res[i_s] = res[i_s] + n * np.log1p(chi2(q).rvs(1) / (ChiSum + ChiK)) result = {'res' : res, 'lambdaind': lambdaind, 'lambdamu': lambdamu, 'fN': fN, 'fD': fD, 'sumlog1plambdaxi': sumlog1plambdaxi, 'n': n, 'p': p, 'dfChik': dfChiK, 'REML': REML } return result
[docs]def rlrsim_np(p, k, n, nsim, g, q, mu, lambd, lambda0, xi=None, REML=True): ''' Python port of the RLRsim.cpp function using scipy & numpy This version uses broadcasting to speed up operations. It uses a lot of memory for large nsim, but is way faster than rlrsim_loop() :param int p: number of covariates in X :param int k: number of variables to test :param int n: number of individuals :param int nsim: number of sumulations to perform :param int g: lambda grid length (not used anymore...) :param int q: number of free parameters in the null-model (?) :param numpy.ndarray mu: array of eigenvalues :param numpy.ndarray lambd: array of lambda values :param float lambda0: lambda0 value :xi: ? :param bool REML: perform REML (default=True) ''' dfChiK = n - p - k if (n - p - k) > 0 else 0 if REML: n0 = n - p xi = mu else: assert xi is not None, "xi can't be None if REML == False !" n0 = n # pre-compute stuff that stays constant over simulations # broadcast to shape (g, k) lambdamu = lambd[:, np.newaxis] * mu # (g, k) lambdamuP1 = lambdamu + 1. # broadcast to shape (g, k) fN = ((lambd - lambda0)[:, np.newaxis] * mu) / lambdamuP1 # (g, k) fD = (1 + lambda0 * mu) / lambdamuP1 # shape (g,) sumlog1plambdaxi = np.sum(np.log1p(lambd[:, np.newaxis] * xi), axis=1) # simulations # need nsim values of ChiK # shape (nsim, ) ChiK = chi2(dfChiK).rvs(nsim) # need (nsim, k) values of Chi1 Chi1 = np.reshape(chi2(1).rvs(nsim * k), (nsim, 1, k)) if not REML: # need (nsim, ) values of ChiSum ChiSum = Chi1.sum(axis=1) # fN (g, k) * Chi1 (nsim, 1, k) -> N (nsim, g, k) -sum-> N (nsim, g) N = np.sum(fN * Chi1, axis=2) # fD (g, k) * Chi1 (nsim, 1, k) -> D (nsim, g, k) -sum-> D (nsim, g) D = np.sum(fD * Chi1, axis=2) # D (nsim, g) + Chik(nsim,) -> D (nsim, g) D += ChiK[:,np.newaxis] # n0 (1,) * log1p(N/D) (nsim, g) -> LR (nsim, g) LR = n0 * np.log1p(N / D) # LR (nsim, g) - ...(g,) -> LR (nsim, g) LR -= sumlog1plambdaxi # (nsim, g) -> lambdaind (nsim,) lambdaind = np.argmax(LR, axis=1) # (nsim, g) -> res (nsim,) res = LR[np.arange(LR.shape[0]), lambdaind] if not REML: res += n * np.log1p(chi2(q).rvs(nsim) / (ChiSum + ChiK)) result = { 'res': res, 'lambdaind': lambdaind, 'lambdamu': lambdamu, 'fN': fN, 'fD': fD, 'sumlog1plambdaxi': sumlog1plambdaxi, 'n': n, 'p': p, 'dfChik': dfChiK, 'REML': REML } return result
[docs]def rlrsim(p, k, n, nsim, g, q, mu, lambd, lambda0, xi=None, REML=True): ''' Python port of the RLRsim.cpp function using scipy & numpy This version uses broadcasting + cython to speed up operations. It is about 3-10 times faster than the numpy-version (rlrsim_np). :param int p: number of covariates in X :param int k: number of variables to test :param int n: number of individuals :param int nsim: number of sumulations to perform :param int g: lambda grid length :param int q: number of free parameters in the null-model (?) :param numpy.ndarray mu: array of eigenvalues :param numpy.ndarray lambd: array of lambda values :param float lambda0: lambda0 value :xi: ? :param bool REML: perform REML (default=True) ''' dfChiK = n - p - k if (n - p - k) > 0 else 0 if REML: n0 = n - p xi = mu else: assert xi is not None, "xi can't be None if REML == False !" n0 = n # pre-compute stuff that stays constant over simulations # g is the axis along lambda values that we optimize over # broadcast to shape (g, k) lambdamu = lambd[:, np.newaxis] * mu # (g, k) lambdamuP1 = lambdamu + 1. # broadcast to shape (g, k) fN = ((lambd - lambda0)[:, np.newaxis] * mu) / lambdamuP1 fD = (1 + lambda0 * mu) / lambdamuP1 # shape (g,) sumlog1plambdaxi = np.sum(np.log1p(lambd[:, np.newaxis] * xi), axis=1) # simulations # need nsim values of ChiK (nsim, ) ChiK = chi2(dfChiK).rvs(nsim) # need (nsim, k) values of Chi1 Chi1 = np.reshape(chi2(1).rvs(nsim * k), (nsim, k)) if not REML: # need (nsim, ) values of ChiSum ChiSum = Chi1.sum(axis=1) else: ChiSum = np.zeros((nsim,), dtype=float) # this is the part that we want to optimize: res = np.zeros(nsim, dtype=float) lambdaind = np.zeros(nsim, dtype=int) optimize_lambda(res, lambdaind, nsim, g, n0, ChiSum, ChiK, Chi1, sumlog1plambdaxi, fN, fD) if not REML: res += n * np.log1p(chi2(q).rvs(nsim) / (ChiSum + ChiK)) result = { 'res': res, 'lambdaind': lambdaind, 'lambdamu': lambdamu, 'fN': fN, 'fD': fD, 'sumlog1plambdaxi': sumlog1plambdaxi, 'n': n, 'p': p, 'dfChik': dfChiK, 'REML': REML } return result
[docs]def RLRTSim(X, Z, Xdagger, sqrtSigma=None, lambda0=np.nan, seed=2020, nsim=10000, use_approx=0, log_grid_hi=8, log_grid_lo=-10, gridlength=200): ''' Python port of the LRTSim function using scipy & numpy see https://cran.r-project.org/web/packages/RLRsim/RLRsim.pdf for details, also Crainiceanu, C. and Ruppert, D. (2004) Likelihood ratio tests in linear mixed models with one variance component :param numpy.ndarray X: The fixed effects design matrix of the model under the alternative :param numpy.ndarray Z: The random effects design matrix of the model under the alternative (i.e. G_1) :param numpy.ndarray Xdagger: Moore-Penrose pseudo-inverse of X :param numpy.ndarray sqrtSigma: The upper triangular Cholesky factor of the correlation matrix of the random effect :param numpy.ndarray lambda0: :param seed: numpy random seed to use :param int nsim: Number of values to simulate :param int use_approx: Not implemented (yet) :param log_grid_hi: Higher value of the grid on the log scale. :param log_grid_lo: Lower value of the grid on the log scale. :param int gridlength: Length of the grid for the grid search over lambda :return: A dictionary containing the simulations and other parameters :rtype: dict Output dictionary: "res": np.array of simulated test statistics ...: parameters and intermediate variables used to generate simulated test statistics ''' # this is a port of the R-function ... # sqrtSigma is the upper triangular factor of the cholesky decomposition if np.isnan(lambda0): lambda0 = 0 if lambda0 > np.exp(log_grid_hi): log_grid_hi = np.log(10 * lambda0) # print warning if (lambda0 != 0) & (lambda0 < np.exp(log_grid_lo)): log_grid_lo = np.log(-10 * lambda0) # print warning if seed is not None: # is this safe? np.random.seed(seed) n, p = np.shape(X) K = min(n, Z.shape[1]) sqrtSigma = np.eye(Z.shape[1]) if sqrtSigma is None else sqrtSigma # project out X rZ = Z - X.dot(Xdagger.dot(Z)) np.matmul(rZ, sqrtSigma.T, out=rZ) try: mu = svd(rZ, full_matrices=False, compute_uv=False) if np.any(mu < -0.1): logging.warning("kernel contains a negative Eigenvalue") mu *= mu except LinAlgError: # revert to Eigenvalue decomposition logging.warning( "Got SVD exception, trying eigenvalue decomposition of square of Z. Note that this is a little bit less accurate") mu_ = eigh(rZ.T.dot(rZ), eigvals_only=True) if np.any(mu_ < -0.1): logging.warning("kernel contains a negative Eigenvalue") mu = mu_ * mu_ #is it ok if some are 0? # normalize mu /= np.max(mu) if use_approx != 0: raise NotImplementedError # else: # the original has different approximate procedures that could be re-implemented here lambda_grid = make_lambdagrid(lambda0, gridlength, log_grid_lo, log_grid_hi) res = rlrsim(p=p, k=K, n=n, nsim=nsim, g=gridlength, q=0, mu=mu, lambd=lambda_grid, lambda0=lambda0, xi=mu, REML=True) res['mu'] = mu res['lambda'] = lambda_grid[res['lambdaind']] if 'lambdaind' in res else np.zeros((1,)) return res
[docs]def estimate_pointmass0_null(mu, n, p, nsim=100000, xi=None, seed=None, REML=True): ''' The point mass of the null distribution of the (R)LRT at 0 can be approximated by the probability of the (R)LRT test statistic having a local maximum at 0. See Crainiceanu, Ruppert: Likelihood ratio tests in linear mixed models with one variance component (2003), Equations (8) & (9) and Zhou: Boosting Gene Mapping Power and Efficiency with Efficient Exact Variance Component Tests of Single Nucleotide Polymorphism Sets (2016) :param numpy.ndarray mu: Eigenvalues of the Genotype matrix with covariates projected out :param float n: Number of individuals :param float p: Number of fixed effect parameters (covariates) :param float nsim: Number of simulations to perform (default: 100,000) :param numpy.ndarray xi: Eigenvalues of the Genotype matrix, if REML == False :param int: numpy seed :param REML: use REML? (default: True) :return: The probability of having point-mass at null :rtype: float ''' if REML: n0 = n - p xi = mu else: assert xi is not None, "xi can't be None if REML == False !" n0 = n k = len(mu) if seed is not None: np.random.seed(seed) lhs = np.reshape(chi2(1).rvs(nsim * k), (nsim, k)).dot(mu) / chi2(n0).rvs(nsim) if REML: rhs = 1 / (n - p) * mu.sum() else: rhs = 1 / n * xi.sum() return (lhs <= rhs).sum() / nsim
[docs]def pv_chi2mixture(stat, scale, dof, mixture, alteqnull=None): ''' Returns p-values(s) using chi2 with custom parameters. see :func:`fit_chi2mixture` wraps chi2.sf(stat/scale, dof) * mixture :param stat: The LRT test statistic :type stat: Union[float, np.array] :param float scale: scaling factor applied to the test statistic :param float dof: degrees of freedom :param float mixture: mixture weight :return: p-value :rtype: float ''' if alteqnull is None: alteqnull = stat == 0. pv = np.array(chi2.sf(stat/scale, dof) * mixture) pv[alteqnull] = 1. return pv
[docs]class chi2mixture(object): ''' mixture here denotes the weight on the non-zero dof compnent ''' __slots__ = ['scale', 'dof', 'mixture', 'imax', 'lrt', 'scalemin', 'scalemax', 'dofmin', 'dofmax', 'qmax', 'tol', 'isortlrt', 'qnulllrtsort', 'lrtsort', 'alteqnull', 'abserr', 'fitdof'] def __init__(self, lrt, mixture=None, tol=0.0, scalemin=0.1, scalemax=5.0, dofmin=0.1, dofmax=5.0, qmax=None, alteqnull=None, abserr=None, fitdof=None, dof=None): ''' Direct adaptation of FaST-LMMs chi2mixture class. The only difference is that mixture can be passed as an argument. Input: lrt [Ntests] vector of test statistics a2 (optional) [Ntests] vector of model variance parameters top (0.0) tolerance for matching zero variance parameters or lrts scalemin (0.1) minimum value used for fitting the scale parameter scalemax (5.0) maximum value used for fitting scale parameter dofmin (0.1) minimum value used for fitting the dof parameter dofmax (5.0) maximum value used for fitting dof parameter qmax (None) only the top qmax quantile is used for the fit ''' self.lrt = lrt self.alteqnull = alteqnull self.scale = None self.dof = dof self.mixture = mixture self.scalemin = scalemin self.scalemax = scalemax self.dofmin = dofmin self.dofmax = dofmax self.qmax = qmax self.tol = tol self.__fit_mixture() self.isortlrt = None self.abserr = abserr self.fitdof = fitdof def __fit_mixture(self): ''' fit the mixture component ''' if self.tol < 0.0: logging.info('tol has to be larger or equal than zero.') if self.alteqnull is None: self.alteqnull = self.lrt == 0 logging.info("WARNING: alteqnull not provided, so using alteqnull=(lrt==0)") if self.mixture is None: self.mixture = 1.0 - (np.array(self.alteqnull).sum() * 1.0) / (np.array(self.alteqnull).shape[0] * 1.0) return self.alteqnull, self.mixture
[docs] def fit_params_Qreg(self): ''' Fit the scale and dof parameters of the model by minimizing the squared error between the model log quantiles and the log P-values obtained on the lrt values. Only the top qmax quantile is being used for the fit (self.qmax is used in fit_scale_logP). ''' if self.isortlrt is None: self.isortlrt = self.lrt.argsort()[::-1] self.qnulllrtsort = (0.5 + np.arange(self.mixture * self.isortlrt.shape[0])) / (self.mixture * self.isortlrt.shape[0]) self.lrtsort = self.lrt[self.isortlrt] resmin = [None] # CL says it had to be a list or wouldn't work, even though doesn't make sense if self.fitdof: # fit both scale and dof def f(x): res = self.fit_scale_logP(dof=x) if (resmin[0] is None) or (res['mse'] < resmin[0]['mse']): resmin[0] = res return res['mse'] else: def f(x): # fit only scale scale = x mse, imax = self.scale_dof_obj(scale, self.dof) if (resmin[0] is None) or (resmin[0]['mse'] > mse): resmin[0] = { # bookeeping for CL's mingrid.minimize1D 'mse': mse, 'dof': self.dof, 'scale': scale, 'imax': imax, } return mse min = mingrid.minimize1D(f=f, nGrid=10, minval=self.dofmin, maxval=self.dofmax) self.dof = resmin[0]['dof'] self.scale = resmin[0]['scale'] self.imax = resmin[0]['imax'] return resmin[0]
[docs] def fit_scale_logP(self, dof=None): ''' Extracts the top qmax lrt values to do the fit. ''' if dof is None: dof = self.dof resmin = [None] def f(x): scale = x err, imax = self.scale_dof_obj(scale, dof) if (resmin[0] is None) or (resmin[0]['mse'] > err): resmin[0] = { # bookeeping for CL's mingrid.minimize1D 'mse': err, 'dof': dof, 'scale': scale, 'imax': imax, } return err min = mingrid.minimize1D(f=f, nGrid=10, minval=self.scalemin, maxval=self.scalemax) return resmin[0]
[docs] def scale_dof_obj(self, scale, dof): base = np.exp(1) # fitted params are invariant to this logarithm base (i.e.10, or e) nfalse = (len(self.alteqnull) - np.sum(self.alteqnull)) imax = int(np.ceil(self.qmax * nfalse)) # of only non zer dof component p = sp.stats.chi2.sf(self.lrtsort[0:imax] / scale, dof) logp = logn(base, p) r = logn(base, self.qnulllrtsort[0:imax]) - logp if self.abserr: err = np.absolute(r).sum() else: # mean square error err = (r * r).mean() return err, imax
[docs]def fit_chi2_moment_matching(sims): """ Fits a chi2 distribution using moment matching (as in Zhou 2016) This has not been tested and is not recommended. :param sims: array of LRT test statistics (continuous part of the distribution) :return: Dictionary of distribution parameters estimated with Moment Matching (MM) """ scale = np.var(sims) / (2 * np.mean(sims)) dof = 2 * (np.mean(sims) ** 2) / np.var(sims) return {'scale': scale, 'dof': dof}
[docs]def fit_chi2_maximum_likelihood(sims): """ Fits a chi2 distribution using Maximum Likelihood. (wraps the sci) This has not been tested and is not recommended. :param sims: array of LRT test statistics (continuous part of the distribution) :return: Dictionary of distribution parameters estimated with Maximum Likelihood (ML) """ dof, _, scale = chi2.fit(sims, floc=0.) return {'scale': scale, 'dof': dof}
[docs]def fit_chi2mixture(sims, mixture=None, qmax=0.1): ''' Takes simulated test statistics as input. Fits a chi2 mixture to them using "quantile regression" (Listgarten 2013). returns a dictionary.When mixture parameter is given it only fits scale and dof. This is the preferred method. :param numpy.ndarray sims: array of tests statistics :param float mixture: mixture weight :param float qmax: fraction of largest test statistics to fit the distribution with :return: Dictionary with chi2 mixture parameters :rtype: dict Output dictionary: "mse" : mean squared error "dof" : degrees of freedom "scale" : scale "imax" : number of values used to fit the ditribution ''' mix = chi2mixture(lrt=sims, qmax=qmax, mixture=mixture, fitdof=True) # method described in LRT paper res = mix.fit_params_Qreg() res['mixture'] = mix.mixture return res
[docs]class LRTnoK(): '''Single kernel Likelihood Ratio Test (LRT) for continuous phenotypes. Sets up null model for given phenotypes and covariates. Use :func:`altmodel` to fit an alternative model, and calculate the LRT test statistic. Use :func:`pv_sim` to simulate test statistics and generate empirical p-values. :param numpy.ndarray X: :math:`nxc` matrix containing the covariates (:math:`n:=` number of individuals, :math:`c:=` number of covariates) :param numpy.ndarray Y: :math:`nx1` matrix containing the phenotype (:math:`n:=` number of individuals) :param bool REML: use restricted maximum likelihood? (only "True" has been tested) :ivar X: X :ivar Y: Y :ivar Xdagger: Moore-Penrose pseudo-inverse of X :ivar model0: dictionary with nLL of the null model :ivar model1: dictionary with nLL and other parameters of the alternative model ''' def __init__(self, X, Y, REML=True): self.X = X if np.ndim(Y) == 1: self.Y = Y[:, np.newaxis] elif np.ndim(Y) == 2: self.Y = Y else: raise ValueError("Can't handle multi-dimensional Y!") self.Xdagger = np.linalg.pinv(X) # needed for rlrsim self._nullmodel(REML=REML) self.model1 = None self.likalt={} self.REML = REML def _nullmodel(self, REML=True): # initialize the null model self.model0 = {} model = linreg(self.X, self.Y[:,0]) self.model0['h2'] = np.nan self.model0['nLL'] = model['nLL']
[docs] def altmodel(self, G1): """Fit alternative model. Returns a dictionary with model parameters and test statistic. :param numpy.ndarray G1: :math:`G_1`: set of variants (i.e. variables) to test, dimensions :math:`nxk` (:math:`n:=` number of individuals, :math:`k:=` number of variants in set to test association for) :return: Dictionary of fitted model parameters :rtype: dict Output dictionary: 'nLL' : negative log-likelihood 'sigma2' : the model variance sigma^2 'stat' : LRT test statistic 'alteqnull' : h2 is 0 or negative (i.e. p-value is 1.) 'beta' : [D*1] array of fixed effects weights beta 'h2' : mixture weight between Covariance and noise 'REML' : True: REML was computed, False: ML was computed 'a2' : mixture weight between K0 and K1 'scale' : Scale parameter that multiplies the Covariance matrix (default 1.0) -------------------------------------------------------------------------- """ # use FaST-LMM's implementation lmm1 = lmm.LMM() lmm1.setG(G1) lmm1.setX(self.X) lmm1.sety(self.Y[:,0]) lik1 = lmm1.findH2() #The alternative model has one kernel and needs to find only h2 lik1['alteqnull'] = lik1['h2'] <= 0.0 lik1['stat'] = 2.0*(self.model0['nLL'] - lik1['nLL']) self.model1 = lmm1 self.model1_lik = lik1 return lik1
[docs] def pv_5050(self, lik1=None): """ Return p-value(s) calculated assuming a 50/50 mixture of chi2 df0 and chi2 df1. Considered a conservative estimate. If lik1 is None, returns the p-value for the current alternative model. :param lik1: likelihood ratio test statistic :type lik1: Union[float, numpy.ndarray] :rtype: Union[float, numpy.ndarray] """ if lik1 is None: lik1 = self.model1_lik if lik1['alteqnull']: pv = 1. else: pv = chi2(1).sf(lik1['stat']) * 0.5 return pv
[docs] def pv_sim(self, nsim=100000, seed=420): ''' Runs "nsim" simulations of the test statistic IF the likelihood of the alternative model is greater than that of the null model, i.e. self.model_lik['alteqnull'] is False. returns a dictionary with the simulations and empirical p-value. The empirical p-value is 1. if self.model_lik['alteqnull'] is True, and the array of simulated test statistics will be empty. :param int nsim: Number of simulated test statistics to return :param int seed: numpy random seed to use for simulations :return: Dictionary with containing the simulated test statistics, and empirical p-value :rtype: dict Output dictionary: "res": array of simulated test statistics "pv": empirical p-value of current alternative model ''' lik1 = self.model1_lik if lik1['alteqnull']: pv = 1. result = { 'res': np.array([]), 'pv': pv } return result else: sims = RLRTSim(self.X, self.model1.G, self.Xdagger, nsim=nsim, seed=seed) pv = np.mean(lik1['stat'] < sims['res']) sims['pv'] = pv return sims
[docs] def pv_sim_chi2(self, nsim=300, nsim0=100000, simzero=True, method='QR', seed=420, qmax=0.1): """ Calculate a gene-specific p-value by simulating nsim test statistics and estimating the mixture distribution as in Zhou 2016. :param int nsim: Number of simulations for the (R)LRT test statistic :param int nsim0: Number of simulations to estimate point-mass at 0 :param str method: Method used to fit continuous part of the the chi2 mixure distribution. 'MM' -> moment matching, 'ML' -> maximum likelihood or 'QR' -> Quantile regression :param float qmax: Use only the values in the top qmax quantile to fit the chi2 distribution. This only affects the QR-method :param bool simzero: Whether to separately run simulations for the estimation of the mixture parameter (default: True). This makes sense if nsim0 >> nsim :return: dictionary similar to that returned by pv_sim(), but with additional slots for estimated dof, scale and mixture parameters for the chi2 distribution """ lik1 = self.model1_lik if lik1['alteqnull']: pv = 1. result = { 'res': np.array([]), 'pv': pv, 'mixture': 0. } return result else: sims = RLRTSim(self.X, self.model1.G, self.Xdagger, nsim=nsim, seed=seed) if simzero: mass0 = estimate_pointmass0_null(sims['mu'], sims['n'], sims['p'], nsim=nsim0, seed=seed*2, REML=self.REML) else: mass0 = (sims['res'] == 0.).sum() / nsim if method == 'MM': logging.warning('Method "MM" (Moment Matching) is not tested and may not be accurate.') nonzero = sims['res'] != 0. chi2param = fit_chi2_moment_matching(sims['res'][nonzero]) imax = nonzero.sum() elif method == 'ML': logging.warning('Method "ML" (Maximum Likelihood) is not tested and may not be accurate.') nonzero = sims['res'] != 0. chi2param = fit_chi2_maximum_likelihood(sims['res'][nonzero]) imax = nonzero.sum() elif method == 'QR': chi2param = fit_chi2mixture(sims['res'], qmax=qmax, mixture=1.-mass0) imax = chi2param['imax'] else: raise NotImplementedError('Method "{}" not implemented, use either "MM", "ML" or "QR"'.format(method)) pv = pv_chi2mixture(lik1['stat'], scale=chi2param['scale'], dof=chi2param['dof'], mixture=1.-mass0) sims['pv'] = pv sims['scale'] = chi2param['scale'] sims['dof'] = chi2param['dof'] sims['mixture'] = 1-mass0 sims['imax'] = imax return sims