Source code for xb.Spage_main

""" SpaGE [1]
@author: Tamim Abdelaal
This function integrates two single-cell datasets, spatial and scRNA-seq, and 
enhance the spatial data by predicting the expression of the spatially 
unmeasured genes from the scRNA-seq data.
The integration is performed using the domain adaption method PRECISE [2]
	
References
----------
    [1] Abdelaal T., Mourragui S., Mahfouz A., Reiders M.J.T. (2020)
    SpaGE: Spatial Gene Enhancement using scRNA-seq
    [2] Mourragui S., Loog M., Reinders M.J.T., Wessels L.F.A. (2019)
    PRECISE: A domain adaptation approach to transfer predictors of drug response
    from pre-clinical models to tumors
"""

import numpy as np
import pandas as pd
import scipy.stats as st
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

[docs]def SpaGE(Spatial_data,RNA_data,n_pv,genes_to_predict=None): """ @author: Tamim Abdelaal This function integrates two single-cell datasets, spatial and scRNA-seq, and enhance the spatial data by predicting the expression of the spatially unmeasured genes from the scRNA-seq data. Args: Spatial_data : Dataframe Normalized Spatial data matrix (cells X genes). RNA_data : Dataframe Normalized scRNA-seq data matrix (cells X genes). n_pv : int Number of principal vectors to find from the independently computed principal components, and used to align both datasets. This should be <= number of shared genes between the two datasets. genes_to_predict : str array list of gene names missing from the spatial data, to be predicted from the scRNA-seq data. Default is the set of different genes (columns) between scRNA-seq and spatial data. results: Imp_Genes: Dataframe Matrix containing the predicted gene expressions for the spatial cells. Rows are equal to the number of spatial data rows (cells), and columns are equal to genes_to_predict, . """ if genes_to_predict is SpaGE.__defaults__[0]: genes_to_predict = np.setdiff1d(RNA_data.columns,Spatial_data.columns) RNA_data_scaled = pd.DataFrame(data=st.zscore(RNA_data,axis=0), index = RNA_data.index,columns=RNA_data.columns) Spatial_data_scaled = pd.DataFrame(data=st.zscore(Spatial_data,axis=0), index = Spatial_data.index,columns=Spatial_data.columns) Common_data = RNA_data_scaled[np.intersect1d(Spatial_data_scaled.columns,RNA_data_scaled.columns)] Y_train = RNA_data[genes_to_predict] Imp_Genes = pd.DataFrame(np.zeros((Spatial_data.shape[0],len(genes_to_predict))), columns=genes_to_predict) pv_Spatial_RNA = PVComputation( n_factors = n_pv, n_pv = n_pv, dim_reduction = 'pca', dim_reduction_target = 'pca' ) pv_Spatial_RNA.fit(Common_data,Spatial_data_scaled[Common_data.columns]) S = pv_Spatial_RNA.source_components_.T Effective_n_pv = sum(np.diag(pv_Spatial_RNA.cosine_similarity_matrix_) > 0.3) S = S[:,0:Effective_n_pv] Common_data_projected = Common_data.dot(S) Spatial_data_projected = Spatial_data_scaled[Common_data.columns].dot(S) nbrs = NearestNeighbors(n_neighbors=50, algorithm='auto', metric = 'cosine').fit(Common_data_projected) distances, indices = nbrs.kneighbors(Spatial_data_projected) for j in range(0,Spatial_data.shape[0]): weights = 1-(distances[j,:][distances[j,:]<1])/(np.sum(distances[j,:][distances[j,:]<1])) weights = weights/(len(weights)-1) Imp_Genes.iloc[j,:] = np.dot(weights,Y_train.iloc[indices[j,:][distances[j,:] < 1]]) return Imp_Genes
""" Dimensionality Reduction @author: Soufiane Mourragui This module extracts the domain-specific factors from the high-dimensional omics dataset. Several methods are here implemented and they can be directly called from string name in main method method. All the methods use scikit-learn implementation. Notes ------- - References ---------- [1] Pedregosa, Fabian, et al. (2011) Scikit-learn: Machine learning in Python. Journal of Machine Learning Research """ import numpy as np from sklearn.decomposition import PCA, FastICA, FactorAnalysis, NMF, SparsePCA from sklearn.cross_decomposition import PLSRegression
[docs]def process_dim_reduction(method='pca', n_dim=10): """ Default linear dimensionality reduction method. For each method, return a BaseEstimator instance corresponding to the method given as input. Args: method: str, default to 'pca' Method used for dimensionality reduction. Implemented: 'pca', 'ica', 'fa' (Factor Analysis), 'nmf' (Non-negative matrix factorisation), 'sparsepca' (Sparse PCA). n_dim: int, default to 10 Number of domain-specific factors to compute. results: Classifier, i.e. BaseEstimator instance """ if method.lower() == 'pca': clf = PCA(n_components=n_dim) elif method.lower() == 'ica': print('ICA') clf = FastICA(n_components=n_dim) elif method.lower() == 'fa': clf = FactorAnalysis(n_components=n_dim) elif method.lower() == 'nmf': clf = NMF(n_components=n_dim) elif method.lower() == 'sparsepca': clf = SparsePCA(n_components=n_dim, alpha=10., tol=1e-4, verbose=10, n_jobs=1) elif method.lower() == 'pls': clf = PLS(n_components=n_dim) else: raise NameError('%s is not an implemented method'%(method)) return clf
[docs]class PLS(): """ Implement PLS to make it compliant with the other dimensionality reduction methodology. (Simple class rewritting). """ def __init__(self, n_components=10): self.clf = PLSRegression(n_components)
[docs] def get_components_(self): return self.clf.x_weights_.transpose()
[docs] def set_components_(self, x): pass
components_ = property(get_components_, set_components_)
[docs] def fit(self, X, y): self.clf.fit(X,y) return self
[docs] def transform(self, X): return self.clf.transform(X)
[docs] def predict(self, X): return self.clf.predict(X)
""" Principal Vectors @author: Soufiane Mourragui This module computes the principal vectors from two datasets, i.e.: - perform linear dimensionality reduction independently for both dataset, resulting in set of domain-specific factors. - find the common factors using principal vectors [1] This result in set of pairs of vectors. Each pair has one vector from the source and one from the target. For each pair, a similarity score (cosine similarity) can be computed between the principal vectors and the pairs are naturally ordered by decreasing order of this similarity measure. Example ------- Examples are given in the vignettes. Notes ------- Examples are given in the vignette References ---------- [1] Golub, G.H. and Van Loan, C.F., 2012. "Matrix computations" (Vol. 3). JHU Press. [2] Mourragui, S., Loog, M., Reinders, M.J.T., Wessels, L.F.A. (2019) PRECISE: A domain adaptation approach to transfer predictors of drug response from pre-clinical models to tumors """ import numpy as np import pandas as pd import scipy from pathlib import Path from sklearn.preprocessing import normalize
[docs]class PVComputation: """ Attributes: n_factors: int Number of domain-specific factors to compute. n_pv: int Number of principal vectors. dim_reduction_method_source: str Dimensionality reduction method used for source data. dim_reduction_target: str Dimensionality reduction method used for source data. source_components_ : numpy.ndarray, shape (n_pv, n_features) Loadings of the source principal vectors ranked by similarity to the target. Components are in the row. source_explained_variance_ratio_: numpy.ndarray, shape (n_pv) Explained variance of the source on each source principal vector. target_components_ : numpy.ndarray, shape (n_pv, n_features) Loadings of the target principal vectors ranked by similarity to the source. Components are in the row. target_explained_variance_ratio_: numpy.ndarray, shape (n_pv) Explained variance of the target on each target principal vector. cosine_similarity_matrix_: numpy.ndarray, shape (n_pv, n_pv) Scalar product between the source and the target principal vectors. Source principal vectors are in the rows while target's are in the columns. If the domain adaptation is sensible, a diagonal matrix should be obtained. """ def __init__(self, n_factors,n_pv, dim_reduction='pca', dim_reduction_target=None, project_on=0): """ Args: n_factors : int Number of domain-specific factors to extract from the data (e.g. using PCA, ICA). n_pv : int Number of principal vectors to find from the independently computed factors. dim_reduction : str, default to 'pca' Dimensionality reduction method for the source data, i.e. 'pca', 'ica', 'nmf', 'fa', 'sparsepca', pls'. dim_reduction_target : str, default to None Dimensionality reduction method for the target data, i.e. 'pca', 'ica', 'nmf', 'fa', 'sparsepca', pls'. If None, set to dim_reduction. project_on: int or bool, default to 0 Where data should be projected on. 0 means source PVs, -1 means target PVs and 1 means both PVs. """ self.n_factors = n_factors self.n_pv = n_pv self.dim_reduction_method_source = dim_reduction self.dim_reduction_method_target = dim_reduction_target or dim_reduction self.dim_reduction_source = self._process_dim_reduction(self.dim_reduction_method_source) self.dim_reduction_target = self._process_dim_reduction(self.dim_reduction_method_target) self.source_components_ = None self.source_explained_variance_ratio_ = None self.target_components_ = None self.target_explained_variance_ratio_ = None self.cosine_similarity_matrix_ = None def _process_dim_reduction(self, dim_reduction): if type(dim_reduction) == str: return process_dim_reduction(method=dim_reduction, n_dim=self.n_factors) else: return dim_reduction
[docs] def fit(self, X_source, X_target, y_source=None): """ Compute the common factors between two set of data. IMPORTANT: Same genes have to be given for source and target, and in same order Args: X_source : np.ndarray, shape (n_components, n_genes) Source dataset. X_target : np.ndarray, shape (n_components, n_genes) Target dataset. y_source : np.ndarray, shape (n_components, 1) (optional, default to None) Eventual output, in case one wants to give ouput (for instance PLS). results: self: returns an instance of self. """ # Compute factors independently for source and target. Orthogonalize the basis Ps = self.dim_reduction_source.fit(X_source, y_source).components_ Ps = scipy.linalg.orth(Ps.transpose()).transpose() Pt = self.dim_reduction_target.fit(X_target, y_source).components_ Pt = scipy.linalg.orth(Pt.transpose()).transpose() # Compute the principal factors self.compute_principal_vectors(Ps, Pt) # Compute variance explained self.source_explained_variance_ratio_ = np.var(self.source_components_.dot(X_source.transpose()), axis=1)/\ np.sum(np.var(X_source), axis=0) self.target_explained_variance_ratio_ = np.var(self.target_components_.dot(X_target.transpose()), axis=1)/\ np.sum(np.var(X_target), axis=0) return self
[docs] def compute_principal_vectors(self, source_factors, target_factors): """Compute the principal vectors between the already computed set of domain-specific factors, using approach presented in [1,2]. IMPORTANT: Same genes have to be given for source and target, and in same order Args: source_factors: np.ndarray, shape (n_components, n_genes) Source domain-specific factors. target_factors: np.ndarray, shape (n_components, n_genes) Target domain-specific factors. results: self: returns an instance of self. """ # Find principal vectors using SVD u,sigma,v = np.linalg.svd(source_factors.dot(target_factors.transpose())) self.source_components_ = u.transpose().dot(source_factors)[:self.n_pv] self.target_components_ = v.dot(target_factors)[:self.n_pv] # Normalize to make sure that vectors are unitary self.source_components_ = normalize(self.source_components_, axis = 1) self.target_components_ = normalize(self.target_components_, axis = 1) # Compute cosine similarity matrix self.initial_cosine_similarity_matrix_ = source_factors.dot(target_factors.transpose()) self.cosine_similarity_matrix_ = self.source_components_.dot(self.target_components_.transpose()) # Compute angles self.angles_ = np.arccos(np.diag(self.cosine_similarity_matrix_)) return self
[docs] def transform(self, X, project_on=None): """ Projects data onto principal vectors. Args: X : numpy.ndarray, shape (n_samples, n_genes) Data to project. project_on: int or bool, default to None Where data should be projected on. 0 means source PVs, -1 means target PVs and 1 means both PVs. If None, set to class instance value. results: Projected data as a numpy.ndarray of shape (n_samples, n_factors). """ project_on = project_on or self.project_on # Project on source if project_on == 0: return X.dot(self.source_components_.transpose()) # Project on target elif project_on == -1: return X.dot(self.target_components_.transpose()) # Project on both elif project_on == 1: return X.dot(np.concatenate([self.source_components_.transpose(), self.target_components_.transpose()])) else: raise ValueError('project_on should be 0 (source), -1 (target) or 1 (both). %s not correct value'%(project_on))
[docs]def leave_one_out_validation(adata,sc_adata,genes:list): '''Function to validate the imputation of genes using SpaGe''' RNA_data=sc_adata.to_df() RNA_meta=sc_adata.obs spatial_data=adata.to_df() spatial_meta=adata.obs Correlations = pd.Series(index = genes) plt.style.use('dark_background') spatialcoords=pd.DataFrame(adata.obsm['spatial']) allimp=[] for i in genes: Imp_Genes = SpaGE(spatial_data.drop(i,axis=1),RNA_data,n_pv=30, genes_to_predict = [i]) Correlations[i] = st.spearmanr(spatial_data[i],Imp_Genes[i])[0] fig, (ax1, ax2) = plt.subplots(1, 2) ax1.axis('off') cmap = spatial_data[i] cmap[cmap > np.percentile(cmap,99)] = np.percentile(cmap,99) ax1.scatter(spatialcoords.iloc[:,0],spatialcoords.iloc[:,1],s=1,c=cmap) ax1.set_title('Measured ' + i, fontsize = 12) ax1.set_ylabel(i) ax2.axis('off') cmap = Imp_Genes[i] cmap[cmap > np.percentile(cmap,99)] = np.percentile(cmap,99) ax2.scatter(spatialcoords.iloc[:,0],spatialcoords.iloc[:,1],s=1,c=cmap) ax2.set_title('Predicted ' + i, fontsize = 12) allimp.append(Imp_Genes) return pd.concat(allimp)
[docs]def gene_imputation(adata,sc_adata,new_genes:list): '''Function to impute genes using SpaGe''' RNA_data=sc_adata.to_df() RNA_meta=sc_adata.obs spatial_data=adata.to_df() spatial_meta=adata.obs Correlations = pd.Series(index = new_genes) plt.style.use('dark_background') spatialcoords=pd.DataFrame(adata.obsm['spatial']) Imp_New_Genes = SpaGE(spatial_data,RNA_data,n_pv=30,genes_to_predict = new_genes) return Imp_New_Genes