Source code for seak.kernels

"""

Functions that can incorporate prior knowledge into set-association tests.

Any function that takes a genotype matrix G, variant effects V, and optional additional arguments and returns a set of
transformed genotypes/variables to be tested.

"""

# imports
import numpy as np
from sklearn.cluster import AgglomerativeClustering


# Kernel functions
[docs]def diffscore_max(G, V, scale_sqrt=True): """Uses the largest absolute variant effect predictions (veps) per SNV. Linear weighted kernel. :param numpy.ndarray G: SNVs to be tested (genotypes), :math:`nxm` (:math:`n:=` number of individuals, :math:`m:=` number of SNVs) :param numpy.ndarray V: veps :math:`mxk` (:math:`m:=` number of SNVs, :math:`k:=` number of veps); dimension :math:`m` needs to be equal in G and V :return: linear weighted genotype matrix (:math:`GxV`) :rtype: numpy.ndarray """ V = np.abs(V) if scale_sqrt: V = np.sqrt(V) # Maximum out of any diffscore prediction V = np.amax(V, axis=1) V = np.diag(V, k=0) return np.matmul(G, V)
[docs]def single_column_kernel(i, scale_sqrt=True): """Returns a kernel function with weights that correspond to a single column of the veps numpy.ndarray. :param int i: column of which weights should be selected :return: kernel function that linearly weights the genotypes with a single column of veps (e.g. corresponding to a specific transcription factor), needs to be called with genotypes and veps array :rtype: function """ def skk(G, V): """Scales the SNVs by the variant effect prediction (vep) corresponding to a single column of the vep file scaled by a factor of 100 as linear weights. Linear weighted kernel. :param numpy.ndarray G: SNVs to be tested (genotypes), :math:`nxm` (:math:`n:=` number of individuals, :math:`m:=` number of SNVs) :param numpy.ndarray V: veps :math:`mxk` (:math:`m:=` number of SNVs, :math:`k:=` number of veps); dimension :math:`m` needs to be equal in G and V :return: linear weighted genotype matrix (:math:`GxV`) :rtype: numpy.ndarray """ V = V[:, i] V = np.abs(V) if scale_sqrt: V = np.sqrt(V) V = np.diag(V, k=0) return np.matmul(G, V) return skk
[docs]def linear(G, V): """ Linear kernel, does not scale the genotypes.""" return G
[docs]def linear_weighted(G, V): """Linear weighted kernel. Scales the SNVs by the variant effect predictions (veps) corresponding of a 1-dimensional array which gets transformed into a diagonal weight matrix as linear weights. :param numpy.ndarray G: SNVs to be tested (genotypes), :math:`nxm` (:math:`n:=` number of individuals, :math:`m:=` number of SNVs) :param numpy.ndarray V: veps :math:`mx1` (:math:`m:=` number of SNVs) with dimension 1 and shape: :math:`m` :return: linear weighted genotype matrix (:math:`GxV`) :rtype: numpy.ndarray """ V = np.diag(V, k=0) return np.matmul(G, V)
[docs]class LocalCollapsing: """Class for local collapsing of GV, based on a set of positions :param float distance_threshold: maximum distance allowed between SNPs within the same cluster """ def __init__(self, distance_threshold=100.): self.clust = AgglomerativeClustering(n_clusters=None, affinity='manhattan', linkage='complete', distance_threshold=distance_threshold)
[docs] def collapse(self, G, pos, weights=1.): """ Collapses G locally, based on pos :param np.ndarray G: Matrix containing the values to be collapsed, typically shape = (n_individuals, n_SNP) :param np.ndarray pos: array-like, passed to sklearn.cluster.AgglomerativeClustering typically shape (n_samples, 1) for base pair positional clustering, but supports (n_samples, n_features) or (n_samples, n_samples). :param np.array weights: single variant weights, shape = (n_SNP, ), default: equal weights for all variants :return: Collapsed matrix with shape (n_individuals, n_clusters), and the cluster assignments (n_individuals, ) :rtype: tuple (np.ndarray, np.ndarray) """ n_snp = G.shape[1] if pos.ndim == 1: pos = pos.reshape((-1, 1)) clusters = self.clust.fit_predict(pos) C = np.zeros((n_snp, self.clust.n_clusters_)) C[np.arange(n_snp),clusters] = weights return np.matmul(G,C), clusters