Source code for xb.formatting

import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
import gzip
import shutil
import os.path
from scipy.io import mmread
import tifffile as tf
from scipy.spatial import ConvexHull, convex_hull_plot_2d
from anndata import AnnData
import json
import scipy as sp
from pathlib import Path
import tifffile
from xb.util import get_image_shape, extract_physical_sizes, convert_polygons_to_label_image_xenium


[docs]def format_xenium_adata(path,tag,output_path): """ Format xenium data (output from the machine) to adata format, using the original Xenium format (pre-release) Args: path(str): path to the folder where the output of the Xenium machine is stored. tag(str): sample tag to be added to be added to all cells formated from the section. output_path(str): path where to store the resulting adata object. results: adata: AnnData object with the formated cells """ #decompress if os.path.isfile(path+'/transcripts.csv')==False: with gzip.open(path+'/transcripts.csv.gz', 'rb') as f_in: with open(path+'/transcripts.csv', 'wb') as f_out: shutil.copyfileobj(f_in, f_out) if os.path.isfile(path+'/cell_feature_matrix/barcodes.tsv')==False: with gzip.open(path+'/cell_feature_matrix/barcodes.tsv.gz', 'rb') as f_in: with open(path+'/cell_feature_matrix/barcodes.tsv', 'wb') as f_out: shutil.copyfileobj(f_in, f_out) if os.path.isfile(path+'/cell_feature_matrix/features.tsv')==False: with gzip.open(path+'/cell_feature_matrix/features.tsv.gz', 'rb') as f_in: with open(path+'/cell_feature_matrix/features.tsv', 'wb') as f_out: shutil.copyfileobj(f_in, f_out) if os.path.isfile(path+'/cell_feature_matrix/matrix.mtx')==False: with gzip.open(path+'/cell_feature_matrix/matrix.mtx.gz', 'rb') as f_in: with open(path+'/cell_feature_matrix/matrix.mtx', 'wb') as f_out: shutil.copyfileobj(f_in, f_out) if os.path.isfile(path+'/cells.csv')==False: with gzip.open(path+'/cells.csv.gz', 'rb') as f_in: with open(path+'/cells.csv', 'wb') as f_out: shutil.copyfileobj(f_in, f_out) a = mmread(path+'/cell_feature_matrix/matrix.mtx') ad=a.todense() cell_info=pd.read_csv(path+r"/cells.csv") features=pd.read_csv(path+'/cell_feature_matrix/features.tsv',sep='\t',header=None,index_col=0) barcodes=pd.read_csv(path+'/cell_feature_matrix/barcodes.tsv',header=None,index_col=0) adata=sc.AnnData(ad.transpose(),obs=cell_info,var=features) adata.var.index.name='index' adata.var.columns=['gene_id','reason_of_inclusion'] panel_info=pd.read_csv(path+'/panel.tsv',sep='\t') adata.obsm['spatial']=np.array(adata.obs.loc[:,['x_centroid','y_centroid']]) try: panel_info['Gene'] except: panel_info['Gene']=panel_info['Name'] dict_annotation=dict(zip(panel_info['Gene'],panel_info['Annotation'])) dict_ENSEMBL=dict(zip(panel_info['Gene'],panel_info['Ensembl ID'])) adata.var['Annotation']=adata.var.index.map(dict_annotation) adata.var['Ensembl ID']=adata.var.index.map(dict_ENSEMBL) adata.var['in_panel']=adata.var.index.isin(panel_info['Gene']) transcripts=pd.read_csv(path+'/transcripts.csv',index_col=0) if os.path.isfile(path+'/background.tiff')==False: format_background(path) IM=tf.TiffFile(path+'/background.tiff') position1_series = IM.series[0] position1_series.axes position1 = position1_series.asarray() image_downsize_fact=1/(2000/np.max(position1.shape)) pos1_resized=np.resize(position1,(position1.shape/image_downsize_fact).astype(int)) adata.uns={"spatial":{tag:{"scalefactors":{"tissue_um_to_pixel":1/0.2125,"tissue_hires_scalef":1/(0.2125*image_downsize_fact)},"images":{"hires":pos1_resized}}}} adata.uns['spots']=transcripts try: UMAP=pd.read_csv(path+'/analysis/umap/gene_expression_2_components/projection.csv',index_col=0) adata.obsm['X_umap']=np.array(UMAP) TSNE=pd.read_csv(path+'/analysis/tsne/gene_expression_2_components/projection.csv',index_col=0) adata.obsm['X_tsne']=np.array(TSNE) PCA=pd.read_csv(path+'/analysis/PCA/gene_expression_10_components/projection.csv',index_col=0) adata.obsm['X_pca']=np.array(PCA) clusters=pd.read_csv(path+'/analysis/clustering/gene_expression_graphclust/clusters.csv',index_col=0) adata.obs['graph_clusters']=list(clusters['Cluster'].astype(str)) kmeans2=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0) adata.obs['kmeans2_clusters']=list(kmeans2['Cluster'].astype(str)) kmeans3=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_3_clusters/clusters.csv',index_col=0) adata.obs['kmeans3_clusters']=list(kmeans3['Cluster'].astype(str)) kmeans4=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_4_clusters/clusters.csv',index_col=0) adata.obs['kmeans4_clusters']=list(kmeans4['Cluster'].astype(str)) kmeans5=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_5_clusters/clusters.csv',index_col=0) adata.obs['kmeans5_clusters']=list(kmeans5['Cluster'].astype(str)) kmeans6=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_6_clusters/clusters.csv',index_col=0) adata.obs['kmeans6_clusters']=list(kmeans6['Cluster'].astype(str)) kmeans7=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_7_clusters/clusters.csv',index_col=0) adata.obs['kmeans7_clusters']=list(kmeans7['Cluster'].astype(str)) kmeans8=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_8_clusters/clusters.csv',index_col=0) adata.obs['kmeans8_clusters']=list(kmeans8['Cluster'].astype(str)) kmeans9=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_9_clusters/clusters.csv',index_col=0) adata.obs['kmeans9_clusters']=list(kmeans9['Cluster'].astype(str)) kmeans10=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_10_clusters/clusters.csv',index_col=0) adata.obs['kmeans10_clusters']=list(kmeans10['Cluster'].astype(str)) except: print('UMAP and clusters_could not be recovered') adata.obs['cell_id']=adata.obs['cell_id'].astype(str) adata.write(output_path+tag+'.h5ad') return adata
[docs]def format_xenium_adata_2023(path,tag,output_path): """ Format xenium data (output from the machine) to adata format, considerin the format used by Xenium in Q1 2023 Args: path(str): path to the folder where the output of the Xenium machine is stored. tag(str): sample tag to be added to be added to all cells formated from the section. output_path(str): path where to store the resulting adata object. results: adata: AnnData object with the formated cells. """ #decompress if os.path.isfile(path+'/transcripts.csv')==False: with gzip.open(path+'/transcripts.csv.gz', 'rb') as f_in: with open(path+'/transcripts.csv', 'wb') as f_out: shutil.copyfileobj(f_in, f_out) if os.path.isfile(path+'/cell_feature_matrix/barcodes.tsv')==False: with gzip.open(path+'/cell_feature_matrix/barcodes.tsv.gz', 'rb') as f_in: with open(path+'/cell_feature_matrix/barcodes.tsv', 'wb') as f_out: shutil.copyfileobj(f_in, f_out) if os.path.isfile(path+'/cell_feature_matrix/features.tsv')==False: with gzip.open(path+'/cell_feature_matrix/features.tsv.gz', 'rb') as f_in: with open(path+'/cell_feature_matrix/features.tsv', 'wb') as f_out: shutil.copyfileobj(f_in, f_out) if os.path.isfile(path+'/cell_feature_matrix/matrix.mtx')==False: with gzip.open(path+'/cell_feature_matrix/matrix.mtx.gz', 'rb') as f_in: with open(path+'/cell_feature_matrix/matrix.mtx', 'wb') as f_out: shutil.copyfileobj(f_in, f_out) if os.path.isfile(path+'/cells.csv')==False: with gzip.open(path+'/cells.csv.gz', 'rb') as f_in: with open(path+'/cells.csv', 'wb') as f_out: shutil.copyfileobj(f_in, f_out) a = mmread(path+'/cell_feature_matrix/matrix.mtx') ad=a.todense() cell_info=pd.read_csv(path+r"/cells.csv") features=pd.read_csv(path+'/cell_feature_matrix/features.tsv',sep='\t',header=None,index_col=0) barcodes=pd.read_csv(path+'/cell_feature_matrix/barcodes.tsv',header=None,index_col=0) adata=sc.AnnData(ad.transpose(),obs=cell_info,var=features) adata.var.index.name='index' adata.var.columns=['gene_id','reason_of_inclusion'] # Opening JSON file f = open(path+'/gene_panel.json') # returns JSON object as # a dictionary data = json.load(f) # Iterating through the json # list geness=[] idss=[] descriptorss=[] for r in range(len(data['payload']['targets'])): geness.append(data['payload']['targets'][r]['type']['data']['name']) try: idss.append(data['payload']['targets'][r]['type']['data']['id']) except: idss.append('newid_'+str(r)) try: descriptorss.append(data['payload']['targets'][r]['type']['descriptor']) except: descriptorss.append('other') # Closing file f.close() dict_inpanel=dict(zip(geness,descriptorss)) dict_ENSEMBL=dict(zip(geness,idss)) adata.var['Ensembl ID']=adata.var['gene_id'].map(dict_ENSEMBL) adata.var['in_panel']=adata.var['gene_id'].map(dict_inpanel) transcripts=pd.read_csv(path+'/transcripts.csv',index_col=0) adata.uns['spots']=transcripts try: UMAP=pd.read_csv(path+'/analysis/umap/gene_expression_2_components/projection.csv',index_col=0) adata.obsm['X_umap']=np.array(UMAP) TSNE=pd.read_csv(path+'/analysis/tsne/gene_expression_2_components/projection.csv',index_col=0) adata.obsm['X_tsne']=np.array(TSNE) PCA=pd.read_csv(path+'/analysis/PCA/gene_expression_10_components/projection.csv',index_col=0) adata.obsm['X_pca']=np.array(PCA) clusters=pd.read_csv(path+'/analysis/clustering/gene_expression_graphclust/clusters.csv',index_col=0) adata.obs['graph_clusters']=list(clusters['Cluster'].astype(str)) kmeans2=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0) adata.obs['kmeans2_clusters']=list(kmeans2['Cluster'].astype(str)) kmeans3=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0) adata.obs['kmeans3_clusters']=list(kmeans3['Cluster'].astype(str)) kmeans4=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0) adata.obs['kmeans4_clusters']=list(kmeans4['Cluster'].astype(str)) kmeans5=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0) adata.obs['kmeans5_clusters']=list(kmeans5['Cluster'].astype(str)) kmeans6=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0) adata.obs['kmeans6_clusters']=list(kmeans6['Cluster'].astype(str)) kmeans7=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0) adata.obs['kmeans7_clusters']=list(kmeans7['Cluster'].astype(str)) kmeans8=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0) adata.obs['kmeans8_clusters']=list(kmeans8['Cluster'].astype(str)) kmeans9=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0) adata.obs['kmeans9_clusters']=list(kmeans9['Cluster'].astype(str)) kmeans10=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0) adata.obs['kmeans10_clusters']=list(kmeans10['Cluster'].astype(str)) except: print('UMAP and clusters_could not be recovered') adata.X=sp.sparse.csr_matrix(adata.X) adata.write(output_path+tag+'.h5ad') return adata
[docs]def format_xenium_adata_mid_2023(path,tag,output_path): """ Format xenium data (output from the machine) to adata format, considerin the format used by Xenium at Q2 2023 Args: path(str): path to the folder where the output of the Xenium machine is stored. tag(str): sample tag to be added to be added to all cells formated from the section. output_path(str): path where to store the resulting adata object. results: adata: AnnData object with the formated cells. """ if os.path.isdir(path+'/cell_feature_matrix')==False: # Path of the file extr=path+'/cell_feature_matrix.tar.gz' # Target directory extract_dir = path # Unzip the file shutil.unpack_archive(extr, extract_dir) print('First decompressing done') if os.path.isfile(path+'/transcripts.csv')==False: with gzip.open(path+'/transcripts.csv.gz', 'rb') as f_in: with open(path+'/transcripts.csv', 'wb') as f_out: shutil.copyfileobj(f_in, f_out) if os.path.isfile(path+'/cell_feature_matrix/barcodes.tsv')==False: with gzip.open(path+'/cell_feature_matrix/barcodes.tsv.gz', 'rb') as f_in: with open(path+'/cell_feature_matrix/barcodes.tsv', 'wb') as f_out: shutil.copyfileobj(f_in, f_out) if os.path.isfile(path+'/cell_feature_matrix/features.tsv')==False: with gzip.open(path+'/cell_feature_matrix/features.tsv.gz', 'rb') as f_in: with open(path+'/cell_feature_matrix/features.tsv', 'wb') as f_out: shutil.copyfileobj(f_in, f_out) if os.path.isfile(path+'/cell_feature_matrix/matrix.mtx')==False: with gzip.open(path+'/cell_feature_matrix/matrix.mtx.gz', 'rb') as f_in: with open(path+'/cell_feature_matrix/matrix.mtx', 'wb') as f_out: shutil.copyfileobj(f_in, f_out) if os.path.isfile(path+'/cells.csv')==False: with gzip.open(path+'/cells.csv.gz', 'rb') as f_in: with open(path+'/cells.csv', 'wb') as f_out: shutil.copyfileobj(f_in, f_out) a = mmread(path+'/cell_feature_matrix/matrix.mtx') ad=a.todense() cell_info=pd.read_csv(path+r"/cells.csv") features=pd.read_csv(path+'/cell_feature_matrix/features.tsv',sep='\t',header=None,index_col=0) barcodes=pd.read_csv(path+'/cell_feature_matrix/barcodes.tsv',header=None,index_col=0) adata=sc.AnnData(ad.transpose(),obs=cell_info,var=features) adata.var.index.name='index' adata.var.columns=['gene_id','reason_of_inclusion'] f = open(path+'/gene_panel.json') data = json.load(f) geness=[] idss=[] descriptorss=[] for r in range(len(data['payload']['targets'])): geness.append(data['payload']['targets'][r]['type']['data']['name']) try: idss.append(data['payload']['targets'][r]['type']['data']['id']) except: idss.append('newid_'+str(r)) try: descriptorss.append(data['payload']['targets'][r]['type']['descriptor']) except: descriptorss.append('other') # Closing file f.close() dict_inpanel=dict(zip(geness,descriptorss)) dict_ENSEMBL=dict(zip(geness,idss)) adata.var['Ensembl ID']=adata.var['gene_id'].map(dict_ENSEMBL) adata.var['in_panel']=adata.var['gene_id'].map(dict_inpanel) transcripts=pd.read_csv(path+'/transcripts.csv',index_col=0) adata.uns['spots']=transcripts if os.path.isdir(path+'/analysis')==False: # Path of the file extr=path+'/analysis.tar.gz' # Target directory extract_dir = path # Unzip the file shutil.unpack_archive(extr, extract_dir) print('Analysis files decompressed') try: UMAP=pd.read_csv(path+'/analysis/umap/gene_expression_2_components/projection.csv',index_col=0) adata.obsm['X_umap']=np.array(UMAP) TSNE=pd.read_csv(path+'/analysis/tsne/gene_expression_2_components/projection.csv',index_col=0) adata.obsm['X_tsne']=np.array(TSNE) PCA=pd.read_csv(path+'/analysis/PCA/gene_expression_10_components/projection.csv',index_col=0) adata.obsm['X_pca']=np.array(PCA) clusters=pd.read_csv(path+'/analysis/clustering/gene_expression_graphclust/clusters.csv',index_col=0) adata.obs['graph_clusters']=list(clusters['Cluster'].astype(str)) kmeans2=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0) adata.obs['kmeans2_clusters']=list(kmeans2['Cluster'].astype(str)) kmeans3=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0) adata.obs['kmeans3_clusters']=list(kmeans3['Cluster'].astype(str)) kmeans4=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0) adata.obs['kmeans4_clusters']=list(kmeans4['Cluster'].astype(str)) kmeans5=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0) adata.obs['kmeans5_clusters']=list(kmeans5['Cluster'].astype(str)) kmeans6=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0) adata.obs['kmeans6_clusters']=list(kmeans6['Cluster'].astype(str)) kmeans7=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0) adata.obs['kmeans7_clusters']=list(kmeans7['Cluster'].astype(str)) kmeans8=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0) adata.obs['kmeans8_clusters']=list(kmeans8['Cluster'].astype(str)) kmeans9=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0) adata.obs['kmeans9_clusters']=list(kmeans9['Cluster'].astype(str)) kmeans10=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0) adata.obs['kmeans10_clusters']=list(kmeans10['Cluster'].astype(str)) except: print('UMAP and clusters_could not be recovered') adata.X=sp.sparse.csr_matrix(adata.X) adata.uns['spots']=adata.uns['spots'].fillna(0) adata.uns['spots']['fov_name']=adata.uns['spots']['fov_name'].astype(str) adata.write(output_path+tag+'.h5ad') return adata
[docs]def format_background(path): """ Format OME-TIFF background mipped image to .tiff image Args: path(str): path to the folder where the output of the Xenium machine is stored. results: None """ IM=tf.TiffFile(path+'/morphology_mip.ome.tif') position1_series = IM.series[0] position1_series.axes position1 = position1_series.asarray() tf.imwrite(path+'/background.tiff',position1)
[docs]def keep_nuclei(adata1,overlaps_nucleus=1): """ Redefine cells in AnnData to keep only nuclear reads Args: adata1(AnnData): AnnData object with the cells of the experiment. overlaps_nucleus(int): whether to keep only nuclear reads only (1) or cytoplasmic reads (0) in the redefinition of cells. results: adata: AnnData object with the formated cells """ subset1=adata1.uns['spots'].loc[adata1.uns['spots']['overlaps_nucleus']==overlaps_nucleus,:] ct1=pd.crosstab(subset1['cell_id'],subset1['feature_name']) adataobs=adata1.obs.loc[adata1.obs['cell_id'].isin(ct1.index),:] ct1=ct1.loc[:,adata1.var.index] adataobs.index=adataobs['cell_id'] adataobs.index.name='ind' ct1=ct1.loc[ct1.index.isin(adataobs['cell_id']),:] adata1nuc=sc.AnnData(np.array(ct1),obs=adataobs,var=adata1.var) return adata1nuc
[docs]def cell_area(adata_sp: AnnData,pipeline_output=True): """Calculates the area of the region imaged using convex hull and divide total number of cells/area. XY position should be in um2 Args: adata_sp : AnnData, annotated ``AnnData`` object with counts from spatial data. pipeline_output : float, optional, boolean for whether to create the pipeline output. results: density : float Cell density (cells/um) """ hull = ConvexHull(np.array(adata_sp.uns['spots'].loc[:,['x','y']])) area=hull.area#*10e6 return area
[docs]def generate_random_color_variation(base_color, deviation=0.17): """ Generate variations of a reference color Args: base_color (str):reference hex color. deviation(float): deviation from the base color that the resulting color should have. results: modified_hex_color(str):resulting hex color. """ base_rgb = mcolors.hex2color(base_color) h, s, v = mcolors.rgb_to_hsv(base_rgb) # Make random adjustments to the hue, saturation, and value h += random.uniform(-deviation, deviation) s = max(0, min(1, s + random.uniform(-deviation, deviation))) v = max(0, min(1, v + random.uniform(-deviation, deviation))) # Ensure the values are within the valid HSV range h = h % 1.0 modified_rgb = mcolors.hsv_to_rgb((h, s, v)) modified_hex_color = mcolors.to_hex(modified_rgb) return modified_hex_color
[docs]def format_data_neighs(adata,sname,condit,neighs=10): """ Redefine the expression of cells in adata by counting the neighnoring cell types of each cell Args: adata (AnnData): AnnData object with the cells of the experiment. sname(str): column in adata.obs where the cluster assigned to each cells are stored. neighs(int): number of neighbors to consider when computing neighboring cells. results: adata1 (AnnData): AnnData object with neighboring cell types included in a cell-by-celltype matrix. """ try: adata.obsm['spatial'] except: adata.obsm["spatial"]=np.array([adata.obs.X,adata.obs.Y]).transpose().astype('float64') adata_copy_int=adata sq.gr.spatial_neighbors(adata_copy_int,n_neighs=neighs) result=np.zeros([adata.shape[0],len(adata_copy_int.obs[sname].unique())]) n=0 tr=adata_copy_int.obsp['spatial_distances'].transpose() tr2=tr>0 from tqdm import tqdm for g in tqdm(adata_copy_int.obs[sname].unique()): epv=adata_copy_int.obs[sname]==g*1 opv=list(epv*1) result[:,n]=tr2.dot(opv) n=n+1 expmat=pd.DataFrame(result,columns=adata_copy_int.obs[sname].unique()) adata1=sc.AnnData(expmat,obs=adata.obs) #adata1.obs['sample']=condit adata1.obs['condition']=condit return adata1
[docs]def format_data_neighs_colapse(adata,sname,condit,neighs=10): """ Redefine the expression of cells in adata by collapsing the expression of its neighbors into each cell (a.k.a pseudobining) Args: adata (AnnData): AnnData object with the cells of the experiment. sname(str): column in adata.obs where sample is stored. condit(str): column in adata.obs where the sample each cell belongs to is stored. neighs(int): number of neighbors to consider when collapsing the expression of neighboring cells. results: adata1 (AnnData): AnnData object with expression of cells collapsed from neighboring cells. """ adata.obsm["spatial"]=np.array([adata.obs.X,adata.obs.Y]).transpose().astype('float64') adata_copy_int=adata sq.gr.spatial_neighbors(adata_copy_int,n_neighs=neighs) result=np.zeros([adata.shape[0],adata.shape[1]]) n=0 tr=adata_copy_int.obsp['spatial_distances'].transpose() tr2=tr>0 exp=adata_copy_int.to_df() from tqdm import tqdm #tdd=tr2.todense() for i in tqdm(range(0,adata_copy_int.to_df().shape[0])): result[i,:]=np.sum(exp[tr2[i,:].todense().transpose()],axis=0) adata1=sc.AnnData(result,obs=adata.obs,var=adata.var) return adata1
[docs]def format_xenium_adata_final(path,tag,output_path,use_parquet=True,save=True): """ Format xenium data (output from the machine) to adata format using the official up-to-date Xenium format Args: path(str): path to the folder where the output of the Xenium machine is stored, if requested. tag(str): sample tag to be added to be added to all cells formated from the section. output_path(str): path where to store the resulting adata object. use_parquet(boolean): whether to use parquet files as an input to generate the AnnData File. (it's way faster). save(boolean): whether to save the resulting object. results: adata: AnnData object with the formated cells """ if os.path.isfile(path+'/cell_feature_matrix/barcodes.tsv')==False: with gzip.open(path+'/cell_feature_matrix/barcodes.tsv.gz', 'rb') as f_in: with open(path+'/cell_feature_matrix/barcodes.tsv', 'wb') as f_out: shutil.copyfileobj(f_in, f_out) if os.path.isfile(path+'/cell_feature_matrix/features.tsv')==False: with gzip.open(path+'/cell_feature_matrix/features.tsv.gz', 'rb') as f_in: with open(path+'/cell_feature_matrix/features.tsv', 'wb') as f_out: shutil.copyfileobj(f_in, f_out) if os.path.isfile(path+'/cell_feature_matrix/matrix.mtx')==False: with gzip.open(path+'/cell_feature_matrix/matrix.mtx.gz', 'rb') as f_in: with open(path+'/cell_feature_matrix/matrix.mtx', 'wb') as f_out: shutil.copyfileobj(f_in, f_out) if os.path.isfile(path+'/cells.csv')==False: with gzip.open(path+'/cells.csv.gz', 'rb') as f_in: with open(path+'/cells.csv', 'wb') as f_out: shutil.copyfileobj(f_in, f_out) a = mmread(path+'/cell_feature_matrix/matrix.mtx') ad=a.todense() cell_info=pd.read_csv(path+r"/cells.csv") features=pd.read_csv(path+'/cell_feature_matrix/features.tsv',sep='\t',header=None,index_col=0) barcodes=pd.read_csv(path+'/cell_feature_matrix/barcodes.tsv',header=None,index_col=0) adata=sc.AnnData(ad.transpose(),obs=cell_info,var=features) adata.var.index.name='index' adata.var.columns=['gene_id','reason_of_inclusion'] f = open(path+'/gene_panel.json') data = json.load(f) geness=[] idss=[] descriptorss=[] for r in range(len(data['payload']['targets'])): geness.append(data['payload']['targets'][r]['type']['data']['name']) try: idss.append(data['payload']['targets'][r]['type']['data']['id']) except: idss.append('newid_'+str(r)) try: descriptorss.append(data['payload']['targets'][r]['type']['descriptor']) except: descriptorss.append('other') f.close() dict_inpanel=dict(zip(geness,descriptorss)) dict_ENSEMBL=dict(zip(geness,idss)) adata.var['Ensembl ID']=adata.var['gene_id'].map(dict_ENSEMBL) adata.var['in_panel']=adata.var['gene_id'].map(dict_inpanel) if use_parquet==True: transcripts=pd.read_parquet(path+'/transcripts.parquet') else: if os.path.isfile(path+'/transcripts.csv')==False: with gzip.open(path+'/transcripts.csv.gz', 'rb') as f_in: with open(path+'/transcripts.csv', 'wb') as f_out: shutil.copyfileobj(f_in, f_out) transcripts=pd.read_csv(path+'/transcripts.csv') adata.uns['spots']=transcripts UMAP=pd.read_csv(path+'/analysis/umap/gene_expression_2_components/projection.csv',index_col=0) id2UMAP1=dict(zip(UMAP.index,UMAP.iloc[:,0])) id2UMAP2=dict(zip(UMAP.index,UMAP.iloc[:,1])) adata.obsm['X_umap_original']=np.array([adata.obs['cell_id'].map(id2UMAP1),adata.obs['cell_id'].map(id2UMAP2)]).transpose() PCA=pd.read_csv(path+'/analysis/pca/gene_expression_10_components/projection.csv',index_col=0) id2PCA1=dict(zip(PCA.index,PCA.iloc[:,0])) id2PCA2=dict(zip(PCA.index,PCA.iloc[:,1])) adata.obsm['X_pca_10_comp']=np.array([adata.obs['cell_id'].map(id2PCA1),adata.obs['cell_id'].map(id2PCA2)]).transpose() dici={'/analysis/clustering/gene_expression_graphclust/clusters.csv':'graph_clusters', '/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv':'kmeans2', '/analysis/clustering/gene_expression_kmeans_3_clusters/clusters.csv':'kmeans3', '/analysis/clustering/gene_expression_kmeans_4_clusters/clusters.csv':'kmeans4', '/analysis/clustering/gene_expression_kmeans_5_clusters/clusters.csv':'kmeans5', '/analysis/clustering/gene_expression_kmeans_6_clusters/clusters.csv':'kmeans6', '/analysis/clustering/gene_expression_kmeans_7_clusters/clusters.csv':'kmeans7', '/analysis/clustering/gene_expression_kmeans_8_clusters/clusters.csv':'kmeans8', '/analysis/clustering/gene_expression_kmeans_9_clusters/clusters.csv':'kmeans9', '/analysis/clustering/gene_expression_kmeans_10_clusters/clusters.csv':'kmeans10' } for subpath in dici.keys(): clust=pd.read_csv(path+subpath,index_col=0) clustdict=dict(zip(clust.index,clust.iloc[:,0])) adata.obs[dici[subpath]]=adata.obs['cell_id'].map(clustdict).astype(str) adata.X=sp.sparse.csr_matrix(adata.X) adata.uns['spots']=adata.uns['spots'].fillna(0) adata.uns['spots']['fov_name']=adata.uns['spots']['fov_name'].astype(str) adata.obsm['spatial']=np.array([adata.obs['x_centroid'],adata.obs['y_centroid']]).transpose() adata.obs['sample']=tag if save==True: adata.write(output_path+tag+'_original.h5ad') return adata
[docs]def keep_nuclei_and_quality(adata1,tag:str,max_nucleus_distance=1,min_quality=20,save=True,output_path=''): """ Redefine cell expression based on nuclei expression an quality of detected reads Args: adata1 (AnnData): AnnData object with the cells of the experiment before filtereing reads based on quality or nuclear/non-nuclear. tag (str): sample tag to added in the name of the saved filed, if needed. save(boolean): whether to save the resulting files. output_path(str): if needed, where to save the resulting files. max_nucleus_distance(float): Maximum distance from the nuclei for reads to be kept in redefined cells. min_quality(float): Define minimum quality (qv) of reads to keep in the analysis. results: adata1nuc(AnnData): AnnData object with the cells redefined based to input parameters. """ if max_nucleus_distance==0: subset1=adata1.uns['spots'].loc[adata1.uns['spots']['overlaps_nucleus']==overlaps_nucleus,:] if max_nucleus_distance>0: subset1=adata1.uns['spots'].loc[adata1.uns['spots']['nucleus_distance']<max_nucleus_distance,:] subset1=subset1[subset1['qv']>min_quality] ct1=pd.crosstab(subset1['cell_id'],subset1['feature_name']) adataobs=adata1.obs.loc[adata1.obs['cell_id'].isin(ct1.index),:] av=adata1.var[adata1.var['gene_id'].isin(ct1.columns)]#.isin(adata1.var.index) ct1=ct1.loc[:,av['gene_id']] adataobs.index=adataobs['cell_id'] adataobs.index.name='ind' ct1=ct1.loc[ct1.index.isin(adataobs['cell_id']),:] adata1nuc=sc.AnnData(np.array(ct1),obs=adataobs,var=av) if save==True: adata1nuc.write(output_path+tag+'_filtered.h5ad') adata1nuc.obsm['spatial']=np.array([adata1nuc.obs['x_centroid'],adata1nuc.obs['y_centroid']]).transpose() return adata1nuc
[docs]def format_to_adata(files:list,output_path:str,use_parquet=True,save=False,max_nucleus_distance=0,min_quality=10): """ Format xenium datasets (outputs from the machine, up to date 2024) to adata files and filter reads based on quality parameters Args: files(list): list including the paths where the Xenium outputs are saved for each sample (output from the machine). output_path(str): path where to store the resulting adata object. use_parquet(boolean): whether to use parquet files as an input to generate the AnnData File. (it's way faster). save(boolean): whether to save the resulting object. max_nucleus_distance: Maximum distance from the nuclei for reads to be kept in redefined cells. min_quality(float): Define minimum quality (qv) of reads to keep in the analysis. results: adata: AnnData object with the formated cells with only reads that passed the filters established. """ if not os.path.exists(output_path): os.mkdir(output_path) alladata=[] for f in files: tag=f.split('/')[-1] print(f'Formatting {tag}') adata=format_xenium_adata_final(f,tag,output_path,use_parquet=use_parquet,save=save) print('Filter reads') adata_f1=keep_nuclei_and_quality(adata,tag=tag,max_nucleus_distance=max_nucleus_distance,min_quality=min_quality,save=save,output_path=output_path) #xf.format_background(f) alladata.append(adata_f1) adata=sc.concat(alladata,join='outer') adata.var=alladata[0].var adata.obs['total_counts']=np.sum(adata.X,axis=1) adata.obs['expressed_genes']=np.sum(adata.X>0,axis=1) adata.obs['unique_cell_id']=adata.obs['cell_id'].astype(str)+'_'+adata.obs['sample'].astype(str) adata.var['ENSMBL_ID']=adata.var.index adata.var.index=adata.var['gene_id'] if save==True: adata.write(output_path+'combined_filtered.h5ad') return adata
[docs]def prep_xenium_data_for_baysor(XENIUM_DIR:str, OUT_DIR:str,CROP=True, COORDS=[15000, 16000, 15000, 16000]): """ Format xenium datasets for its use for baysor segmentation Args: XENIUM_DIR(list): path where the Xenium output is saved for each sample (output from the machine). OUT_DIR(str): path where to store the resulting adata object. CROP(boolean): whether to use a small Region of interest for segmentation. COORDS(list): if CROP is used, coordinates of the crop in the form of [YMIN,YMAX,XMIN,XMAX]. results: None. """ OUT_DIR=Path(str(OUT_DIR)+'/'+str(XENIUM_DIR.split('/')[-1:][0])+'_baysor') XENIUM_DIR=Path(XENIUM_DIR) # Get shape of full image if not CROP: YMAX, XMAX = get_image_shape(XENIUM_DIR / "morphology.ome.tif") YMIN, XMIN = 0, 0 # Or define a crop else: YMIN=COORDS[0] YMAX=COORDS[1] XMIN=COORDS[2] XMAX=COORDS[3] #------------------------------------------------------------- # Create output directory OUT_DIR.mkdir(exist_ok=True) print("CHECKPOINT 1: Output directory created") # Load spots spots = pd.read_parquet(XENIUM_DIR / "transcripts.parquet") print("CHECKPOINT 2: Spots loaded") # Convert physical units of spots to pixel units phys_sizes = extract_physical_sizes(XENIUM_DIR / "morphology.ome.tif") phys_sizes = phys_sizes[list(phys_sizes.keys())[0]] phys_sizes = {k:float(v) for k,v in phys_sizes.items()} assert phys_sizes["PhysicalSizeX"] == phys_sizes["PhysicalSizeY"] resolution = phys_sizes["PhysicalSizeX"] spots["x_location"] /= resolution spots["y_location"] /= resolution # Note: we scale z µm the same way as x and y because the physical length for z pixels is larger than for xy pixels # and Baysor assumes euclidean distances in space. if ("z_location" in spots): spots["z_location"] /= resolution print("CHECKPOINT 3: Spots converted to pixel units") if CROP: # Subset spots to crop spots = spots.loc[ (spots["y_location"] >= YMIN) & (spots["y_location"] < YMAX) & (spots["x_location"] >= XMIN) & (spots["x_location"] < XMAX) ] # Offset positions if YMIN, XMIN are not 0 spots["x_location"] -= XMIN spots["y_location"] -= YMIN print("CHECKPOINT 4: Spots cropped") # Load polygons df_nuc = pd.read_parquet(XENIUM_DIR / "nucleus_boundaries.parquet") print("CHECKPOINT 5: Polygons loaded") # Convert physical units of polygons to pixel units df_nuc["vertex_x"] /= resolution df_nuc["vertex_y"] /= resolution if CROP: # Subset polygons to crop, NOTE: Polygons at border will be cut off df_nuc = df_nuc.loc[ (df_nuc["vertex_y"] >= YMIN) & (df_nuc["vertex_y"] < YMAX) & (df_nuc["vertex_x"] >= XMIN) & (df_nuc["vertex_x"] < XMAX) ] # Offset positions if YMIN, XMIN are not 0 df_nuc["vertex_x"] -= XMIN df_nuc["vertex_y"] -= YMIN print("CHECKPOINT 6: Polygons cropped") # Create label image series = pd.Series(df_nuc['cell_id']) unique_numbers, _ = pd.factorize(series, sort=True) df_nuc['label_id']=unique_numbers+1 label_image = convert_polygons_to_label_image_xenium(df_nuc, (YMAX-YMIN, XMAX-XMIN),label_col='label_id') print("CHECKPOINT 7: Label image created") # Save spots, polygons and label image spots.to_csv(OUT_DIR / "spots.csv", index=False) print("CHECKPOINT 8: Spots saved") df_nuc.to_csv(OUT_DIR / "polygons.csv", index=False) print("CHECKPOINT 9: Polygons saved") tifffile.imwrite(OUT_DIR / "label_image.tif", label_image) print("CHECKPOINT 10: Label image saved")
[docs]def batch_prep_xenium_data_for_baysor(files,outpath,CROP=True, COORDS=[1000, 5000, 1000, 5000]): """ Running the function prep_xenium_data_for_baysor for multiple samples Args: files(list): list including the paths where the Xenium outputs are saved for each sample (output from the machine). outpath(str): path where to store the resulting adata object. CROP(boolean): whether to use a small Region of interest for segmentation. COORDS(list): if CROP is used, coordinates of the crop in the form of [YMIN,YMAX,XMIN,XMAX]. results: None. """ for f in files: prep_xenium_data_for_baysor(f, output_path,CROP=False)
[docs]def format_baysor_output_to_adata(path:str,output_path:str): """Format baysor's output to anndata Args: path (AnnData): path to the folder where baysor's output is stored output_path(str): path where to store the generated adata results: adata (AnnData): AnnData object with the cells of the experiment """ read_info=pd.read_csv(path+'/segmentation.csv') cell_stats=pd.read_csv(path+'/segmentation_cell_stats.csv') read_info['cell_id']=read_info['cell'] expression=pd.crosstab(read_info['cell'],read_info['gene']) cell_stats=cell_stats.loc[cell_stats['cell'].isin(expression.index),:] expression=expression.loc[expression.index.isin(cell_stats['cell']),:] expression=expression.loc[cell_stats['cell'],:] adata=sc.AnnData(expression) adata.obs=cell_stats adata.uns['spots']=read_info adata.obsm['spatial']=np.array(adata.obs.loc[:,['x','y']]) adata.obs['sample']=str(path.split('/')[-1:][0]) adata.uns['spots']['cell_id']=adata.uns['spots']['cell_id'].astype(str) adata.uns['spots']['cell']=adata.uns['spots']['cell'].astype(str) adata.write(output_path+str(path.split('/')[-1:][0])+'.h5ad') return adata