Source code for xb.formatting

import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
import gzip
import shutil
import os.path
from scipy.io import mmread
import tifffile as tf
from scipy.spatial import ConvexHull, convex_hull_plot_2d
from anndata import AnnData
import json
import scipy as sp
from pathlib import Path
import tifffile
from xb.util import get_image_shape, extract_physical_sizes, convert_polygons_to_label_image_xenium


[docs]def format_xenium_adata(path,tag,output_path):
    """ Format xenium data (output from the machine) to adata format, using the original Xenium format (pre-release)  

    Args:
        path(str): path to the folder where the output of the Xenium machine is stored.

        tag(str): sample tag to be added to be added to all cells formated from the section. 

        output_path(str): path where to store the resulting adata object.

    results:
        adata: AnnData object with the formated cells
    """

    #decompress
    if os.path.isfile(path+'/transcripts.csv')==False:
        with gzip.open(path+'/transcripts.csv.gz', 'rb') as f_in:
            with open(path+'/transcripts.csv', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    if os.path.isfile(path+'/cell_feature_matrix/barcodes.tsv')==False:
        with gzip.open(path+'/cell_feature_matrix/barcodes.tsv.gz', 'rb') as f_in:
            with open(path+'/cell_feature_matrix/barcodes.tsv', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    if os.path.isfile(path+'/cell_feature_matrix/features.tsv')==False:
        with gzip.open(path+'/cell_feature_matrix/features.tsv.gz', 'rb') as f_in:
            with open(path+'/cell_feature_matrix/features.tsv', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    if os.path.isfile(path+'/cell_feature_matrix/matrix.mtx')==False:
        with gzip.open(path+'/cell_feature_matrix/matrix.mtx.gz', 'rb') as f_in:
            with open(path+'/cell_feature_matrix/matrix.mtx', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    if os.path.isfile(path+'/cells.csv')==False:
        with gzip.open(path+'/cells.csv.gz', 'rb') as f_in:
            with open(path+'/cells.csv', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
    a = mmread(path+'/cell_feature_matrix/matrix.mtx')
    ad=a.todense()
    cell_info=pd.read_csv(path+r"/cells.csv")
    features=pd.read_csv(path+'/cell_feature_matrix/features.tsv',sep='\t',header=None,index_col=0)
    barcodes=pd.read_csv(path+'/cell_feature_matrix/barcodes.tsv',header=None,index_col=0)
    adata=sc.AnnData(ad.transpose(),obs=cell_info,var=features)
    adata.var.index.name='index'
    adata.var.columns=['gene_id','reason_of_inclusion']
    panel_info=pd.read_csv(path+'/panel.tsv',sep='\t')
    adata.obsm['spatial']=np.array(adata.obs.loc[:,['x_centroid','y_centroid']])
    try:
        panel_info['Gene']
    except:
        panel_info['Gene']=panel_info['Name']
    dict_annotation=dict(zip(panel_info['Gene'],panel_info['Annotation']))
    dict_ENSEMBL=dict(zip(panel_info['Gene'],panel_info['Ensembl ID']))
    adata.var['Annotation']=adata.var.index.map(dict_annotation)
    adata.var['Ensembl ID']=adata.var.index.map(dict_ENSEMBL)
    adata.var['in_panel']=adata.var.index.isin(panel_info['Gene'])
    transcripts=pd.read_csv(path+'/transcripts.csv',index_col=0)
    if os.path.isfile(path+'/background.tiff')==False:
        format_background(path)
    IM=tf.TiffFile(path+'/background.tiff')
    position1_series = IM.series[0]
    position1_series.axes
    position1 = position1_series.asarray()
    image_downsize_fact=1/(2000/np.max(position1.shape))
    pos1_resized=np.resize(position1,(position1.shape/image_downsize_fact).astype(int))
    adata.uns={"spatial":{tag:{"scalefactors":{"tissue_um_to_pixel":1/0.2125,"tissue_hires_scalef":1/(0.2125*image_downsize_fact)},"images":{"hires":pos1_resized}}}} 
    adata.uns['spots']=transcripts
    try:
        UMAP=pd.read_csv(path+'/analysis/umap/gene_expression_2_components/projection.csv',index_col=0)
        adata.obsm['X_umap']=np.array(UMAP)
        TSNE=pd.read_csv(path+'/analysis/tsne/gene_expression_2_components/projection.csv',index_col=0)
        adata.obsm['X_tsne']=np.array(TSNE)
        PCA=pd.read_csv(path+'/analysis/PCA/gene_expression_10_components/projection.csv',index_col=0)
        adata.obsm['X_pca']=np.array(PCA)
        clusters=pd.read_csv(path+'/analysis/clustering/gene_expression_graphclust/clusters.csv',index_col=0)
        adata.obs['graph_clusters']=list(clusters['Cluster'].astype(str))
        kmeans2=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans2_clusters']=list(kmeans2['Cluster'].astype(str))
        kmeans3=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_3_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans3_clusters']=list(kmeans3['Cluster'].astype(str))
        kmeans4=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_4_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans4_clusters']=list(kmeans4['Cluster'].astype(str))
        kmeans5=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_5_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans5_clusters']=list(kmeans5['Cluster'].astype(str))
        kmeans6=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_6_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans6_clusters']=list(kmeans6['Cluster'].astype(str))
        kmeans7=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_7_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans7_clusters']=list(kmeans7['Cluster'].astype(str))
        kmeans8=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_8_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans8_clusters']=list(kmeans8['Cluster'].astype(str))
        kmeans9=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_9_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans9_clusters']=list(kmeans9['Cluster'].astype(str))
        kmeans10=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_10_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans10_clusters']=list(kmeans10['Cluster'].astype(str))
    except:
        print('UMAP and clusters_could not be recovered')
    adata.obs['cell_id']=adata.obs['cell_id'].astype(str)
    adata.write(output_path+tag+'.h5ad')
    return adata
    

[docs]def format_xenium_adata_2023(path,tag,output_path):
    """ Format xenium data (output from the machine) to adata format, considerin the format used by Xenium in Q1 2023  

    Args:
        path(str): path to the folder where the output of the Xenium machine is stored.

        tag(str): sample tag to be added to be added to all cells formated from the section. 

        output_path(str): path where to store the resulting adata object.

    results:
        adata: AnnData object with the formated cells.
    """

    #decompress
    if os.path.isfile(path+'/transcripts.csv')==False:
        with gzip.open(path+'/transcripts.csv.gz', 'rb') as f_in:
            with open(path+'/transcripts.csv', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    if os.path.isfile(path+'/cell_feature_matrix/barcodes.tsv')==False:
        with gzip.open(path+'/cell_feature_matrix/barcodes.tsv.gz', 'rb') as f_in:
            with open(path+'/cell_feature_matrix/barcodes.tsv', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    if os.path.isfile(path+'/cell_feature_matrix/features.tsv')==False:
        with gzip.open(path+'/cell_feature_matrix/features.tsv.gz', 'rb') as f_in:
            with open(path+'/cell_feature_matrix/features.tsv', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    if os.path.isfile(path+'/cell_feature_matrix/matrix.mtx')==False:
        with gzip.open(path+'/cell_feature_matrix/matrix.mtx.gz', 'rb') as f_in:
            with open(path+'/cell_feature_matrix/matrix.mtx', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    if os.path.isfile(path+'/cells.csv')==False:
        with gzip.open(path+'/cells.csv.gz', 'rb') as f_in:
            with open(path+'/cells.csv', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
    a = mmread(path+'/cell_feature_matrix/matrix.mtx')
    ad=a.todense()
    cell_info=pd.read_csv(path+r"/cells.csv")
    features=pd.read_csv(path+'/cell_feature_matrix/features.tsv',sep='\t',header=None,index_col=0)
    barcodes=pd.read_csv(path+'/cell_feature_matrix/barcodes.tsv',header=None,index_col=0)
    adata=sc.AnnData(ad.transpose(),obs=cell_info,var=features)
    adata.var.index.name='index'
    adata.var.columns=['gene_id','reason_of_inclusion']
    # Opening JSON file
    f = open(path+'/gene_panel.json')
    # returns JSON object as 
    # a dictionary
    data = json.load(f)
    # Iterating through the json
    # list
    geness=[]
    idss=[]
    descriptorss=[]
    for r in range(len(data['payload']['targets'])):
        geness.append(data['payload']['targets'][r]['type']['data']['name'])
        try:
            idss.append(data['payload']['targets'][r]['type']['data']['id'])
        except:
            idss.append('newid_'+str(r)) 
        try:
            descriptorss.append(data['payload']['targets'][r]['type']['descriptor'])
        except:
            descriptorss.append('other')
    # Closing file
    f.close()

    dict_inpanel=dict(zip(geness,descriptorss))
    dict_ENSEMBL=dict(zip(geness,idss))
    adata.var['Ensembl ID']=adata.var['gene_id'].map(dict_ENSEMBL)
    adata.var['in_panel']=adata.var['gene_id'].map(dict_inpanel)
    transcripts=pd.read_csv(path+'/transcripts.csv',index_col=0)
    adata.uns['spots']=transcripts
    try:
        UMAP=pd.read_csv(path+'/analysis/umap/gene_expression_2_components/projection.csv',index_col=0)
        adata.obsm['X_umap']=np.array(UMAP)
        TSNE=pd.read_csv(path+'/analysis/tsne/gene_expression_2_components/projection.csv',index_col=0)
        adata.obsm['X_tsne']=np.array(TSNE)
        PCA=pd.read_csv(path+'/analysis/PCA/gene_expression_10_components/projection.csv',index_col=0)
        adata.obsm['X_pca']=np.array(PCA)
        clusters=pd.read_csv(path+'/analysis/clustering/gene_expression_graphclust/clusters.csv',index_col=0)
        adata.obs['graph_clusters']=list(clusters['Cluster'].astype(str))
        kmeans2=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans2_clusters']=list(kmeans2['Cluster'].astype(str))
        kmeans3=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans3_clusters']=list(kmeans3['Cluster'].astype(str))
        kmeans4=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans4_clusters']=list(kmeans4['Cluster'].astype(str))
        kmeans5=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans5_clusters']=list(kmeans5['Cluster'].astype(str))
        kmeans6=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans6_clusters']=list(kmeans6['Cluster'].astype(str))
        kmeans7=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans7_clusters']=list(kmeans7['Cluster'].astype(str))
        kmeans8=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans8_clusters']=list(kmeans8['Cluster'].astype(str))
        kmeans9=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans9_clusters']=list(kmeans9['Cluster'].astype(str))
        kmeans10=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans10_clusters']=list(kmeans10['Cluster'].astype(str))
    except:
        print('UMAP and clusters_could not be recovered')
    adata.X=sp.sparse.csr_matrix(adata.X)
    adata.write(output_path+tag+'.h5ad')
    return adata


[docs]def format_xenium_adata_mid_2023(path,tag,output_path):
    """ Format xenium data (output from the machine) to adata format, considerin the format used by Xenium at Q2 2023  

    Args:
        path(str): path to the folder where the output of the Xenium machine is stored.

        tag(str): sample tag to be added to be added to all cells formated from the section. 

        output_path(str): path where to store the resulting adata object.

    results:
        adata: AnnData object with the formated cells.
    """

    if os.path.isdir(path+'/cell_feature_matrix')==False:
    # Path of the file
        extr=path+'/cell_feature_matrix.tar.gz'
        # Target directory
        extract_dir = path
        # Unzip the file 
        shutil.unpack_archive(extr, extract_dir)
        print('First decompressing done')
    
    if os.path.isfile(path+'/transcripts.csv')==False:
        with gzip.open(path+'/transcripts.csv.gz', 'rb') as f_in:
            with open(path+'/transcripts.csv', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    if os.path.isfile(path+'/cell_feature_matrix/barcodes.tsv')==False:
        with gzip.open(path+'/cell_feature_matrix/barcodes.tsv.gz', 'rb') as f_in:
            with open(path+'/cell_feature_matrix/barcodes.tsv', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    if os.path.isfile(path+'/cell_feature_matrix/features.tsv')==False:
        with gzip.open(path+'/cell_feature_matrix/features.tsv.gz', 'rb') as f_in:
            with open(path+'/cell_feature_matrix/features.tsv', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    if os.path.isfile(path+'/cell_feature_matrix/matrix.mtx')==False:
        with gzip.open(path+'/cell_feature_matrix/matrix.mtx.gz', 'rb') as f_in:
            with open(path+'/cell_feature_matrix/matrix.mtx', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    if os.path.isfile(path+'/cells.csv')==False:
        with gzip.open(path+'/cells.csv.gz', 'rb') as f_in:
            with open(path+'/cells.csv', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
    a = mmread(path+'/cell_feature_matrix/matrix.mtx')
    ad=a.todense()
    cell_info=pd.read_csv(path+r"/cells.csv")
    features=pd.read_csv(path+'/cell_feature_matrix/features.tsv',sep='\t',header=None,index_col=0)
    barcodes=pd.read_csv(path+'/cell_feature_matrix/barcodes.tsv',header=None,index_col=0)
    adata=sc.AnnData(ad.transpose(),obs=cell_info,var=features)
    adata.var.index.name='index'
    adata.var.columns=['gene_id','reason_of_inclusion']
    f = open(path+'/gene_panel.json') 
    data = json.load(f)
    geness=[]
    idss=[]
    descriptorss=[]
    for r in range(len(data['payload']['targets'])):
        geness.append(data['payload']['targets'][r]['type']['data']['name'])
        try:
            idss.append(data['payload']['targets'][r]['type']['data']['id'])
        except:
            idss.append('newid_'+str(r)) 
        try:
            descriptorss.append(data['payload']['targets'][r]['type']['descriptor'])
        except:
            descriptorss.append('other')
    # Closing file
    f.close()

    dict_inpanel=dict(zip(geness,descriptorss))
    dict_ENSEMBL=dict(zip(geness,idss))
    adata.var['Ensembl ID']=adata.var['gene_id'].map(dict_ENSEMBL)
    adata.var['in_panel']=adata.var['gene_id'].map(dict_inpanel)
    transcripts=pd.read_csv(path+'/transcripts.csv',index_col=0)
    adata.uns['spots']=transcripts
    if os.path.isdir(path+'/analysis')==False:
    # Path of the file
        extr=path+'/analysis.tar.gz'
        # Target directory
        extract_dir = path
        # Unzip the file 
        shutil.unpack_archive(extr, extract_dir)
        print('Analysis files decompressed')
    try:
        UMAP=pd.read_csv(path+'/analysis/umap/gene_expression_2_components/projection.csv',index_col=0)
        adata.obsm['X_umap']=np.array(UMAP)
        TSNE=pd.read_csv(path+'/analysis/tsne/gene_expression_2_components/projection.csv',index_col=0)
        adata.obsm['X_tsne']=np.array(TSNE)
        PCA=pd.read_csv(path+'/analysis/PCA/gene_expression_10_components/projection.csv',index_col=0)
        adata.obsm['X_pca']=np.array(PCA)
        clusters=pd.read_csv(path+'/analysis/clustering/gene_expression_graphclust/clusters.csv',index_col=0)
        adata.obs['graph_clusters']=list(clusters['Cluster'].astype(str))
        kmeans2=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans2_clusters']=list(kmeans2['Cluster'].astype(str))
        kmeans3=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans3_clusters']=list(kmeans3['Cluster'].astype(str))
        kmeans4=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans4_clusters']=list(kmeans4['Cluster'].astype(str))
        kmeans5=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans5_clusters']=list(kmeans5['Cluster'].astype(str))
        kmeans6=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans6_clusters']=list(kmeans6['Cluster'].astype(str))
        kmeans7=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans7_clusters']=list(kmeans7['Cluster'].astype(str))
        kmeans8=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans8_clusters']=list(kmeans8['Cluster'].astype(str))
        kmeans9=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans9_clusters']=list(kmeans9['Cluster'].astype(str))
        kmeans10=pd.read_csv(path+'/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv',index_col=0)
        adata.obs['kmeans10_clusters']=list(kmeans10['Cluster'].astype(str))
    except:
        print('UMAP and clusters_could not be recovered')
    adata.X=sp.sparse.csr_matrix(adata.X)
    adata.uns['spots']=adata.uns['spots'].fillna(0)
    adata.uns['spots']['fov_name']=adata.uns['spots']['fov_name'].astype(str)
    adata.write(output_path+tag+'.h5ad')
    return adata


[docs]def format_background(path):
    """ Format OME-TIFF background mipped image to .tiff image

    Args:
        path(str): path to the folder where the output of the Xenium machine is stored.

    results:
        None
    """
    
    IM=tf.TiffFile(path+'/morphology_mip.ome.tif')
    position1_series = IM.series[0]
    position1_series.axes
    position1 = position1_series.asarray()
    tf.imwrite(path+'/background.tiff',position1)
    
    
[docs]def keep_nuclei(adata1,overlaps_nucleus=1):
    """ Redefine cells in AnnData to keep only nuclear reads   

    Args:
        adata1(AnnData): AnnData object with the cells of the experiment.

        overlaps_nucleus(int): whether to keep only nuclear reads only (1) or cytoplasmic reads (0) in the redefinition of cells.

    results:
        adata: AnnData object with the formated cells
    """

    subset1=adata1.uns['spots'].loc[adata1.uns['spots']['overlaps_nucleus']==overlaps_nucleus,:]
    ct1=pd.crosstab(subset1['cell_id'],subset1['feature_name'])
    adataobs=adata1.obs.loc[adata1.obs['cell_id'].isin(ct1.index),:]
    ct1=ct1.loc[:,adata1.var.index]
    adataobs.index=adataobs['cell_id']
    adataobs.index.name='ind'
    ct1=ct1.loc[ct1.index.isin(adataobs['cell_id']),:]
    adata1nuc=sc.AnnData(np.array(ct1),obs=adataobs,var=adata1.var)
    return adata1nuc
    
[docs]def cell_area(adata_sp: AnnData,pipeline_output=True):
    """Calculates the area of the region imaged using convex hull and divide total number of cells/area. XY position should be in um2  

    Args:
        adata_sp : AnnData, annotated ``AnnData`` object with counts from spatial data.

        pipeline_output : float, optional, boolean for whether to create the pipeline output. 

    results:
        density : float

        Cell density (cells/um)
    """   

    hull = ConvexHull(np.array(adata_sp.uns['spots'].loc[:,['x','y']]))
    area=hull.area#*10e6
    return area


[docs]def generate_random_color_variation(base_color, deviation=0.17):
    """ Generate variations of a reference color   

    Args:
        base_color (str):reference hex color.

        deviation(float): deviation from the base color that the resulting color should have.

    results:
        modified_hex_color(str):resulting hex color.
    """
    
    base_rgb = mcolors.hex2color(base_color)
    h, s, v = mcolors.rgb_to_hsv(base_rgb)

    # Make random adjustments to the hue, saturation, and value
    h += random.uniform(-deviation, deviation)
    s = max(0, min(1, s + random.uniform(-deviation, deviation)))
    v = max(0, min(1, v + random.uniform(-deviation, deviation)))

    # Ensure the values are within the valid HSV range
    h = h % 1.0

    modified_rgb = mcolors.hsv_to_rgb((h, s, v))
    modified_hex_color = mcolors.to_hex(modified_rgb)

    return modified_hex_color


[docs]def format_data_neighs(adata,sname,condit,neighs=10):
    """ Redefine the expression of cells in adata by counting the neighnoring cell types of each cell  

    Args:
        adata (AnnData): AnnData object with the cells of the experiment.

        sname(str): column in adata.obs where the cluster assigned to each cells are stored.

        neighs(int): number of neighbors to consider when computing neighboring cells.

    results:
        adata1 (AnnData): AnnData object with neighboring cell types included in a cell-by-celltype matrix.
    """

    try:
        adata.obsm['spatial']
    except:
        adata.obsm["spatial"]=np.array([adata.obs.X,adata.obs.Y]).transpose().astype('float64')
    adata_copy_int=adata
    sq.gr.spatial_neighbors(adata_copy_int,n_neighs=neighs)
    result=np.zeros([adata.shape[0],len(adata_copy_int.obs[sname].unique())])
    n=0
    tr=adata_copy_int.obsp['spatial_distances'].transpose()
    tr2=tr>0
    from tqdm import tqdm
    for g in tqdm(adata_copy_int.obs[sname].unique()):
        epv=adata_copy_int.obs[sname]==g*1
        opv=list(epv*1)
        result[:,n]=tr2.dot(opv)
        n=n+1
    expmat=pd.DataFrame(result,columns=adata_copy_int.obs[sname].unique())
    adata1=sc.AnnData(expmat,obs=adata.obs)
    #adata1.obs['sample']=condit
    adata1.obs['condition']=condit
    return adata1

[docs]def format_data_neighs_colapse(adata,sname,condit,neighs=10):
    """ Redefine the expression of cells in adata by collapsing the expression of its neighbors into each cell (a.k.a pseudobining)    

    Args:
        adata (AnnData): AnnData object with the cells of the experiment.

        sname(str): column in adata.obs where sample is stored.

        condit(str): column in adata.obs where the sample each cell belongs to is stored.

        neighs(int): number of neighbors to consider when collapsing the expression of neighboring cells.

    results:
        adata1 (AnnData): AnnData object with expression of cells collapsed from neighboring cells.
    """

    adata.obsm["spatial"]=np.array([adata.obs.X,adata.obs.Y]).transpose().astype('float64')
    adata_copy_int=adata
    sq.gr.spatial_neighbors(adata_copy_int,n_neighs=neighs)
    result=np.zeros([adata.shape[0],adata.shape[1]])
    n=0
    tr=adata_copy_int.obsp['spatial_distances'].transpose()
    tr2=tr>0
    exp=adata_copy_int.to_df()
    from tqdm import tqdm
    #tdd=tr2.todense()
    for i in tqdm(range(0,adata_copy_int.to_df().shape[0])):
        result[i,:]=np.sum(exp[tr2[i,:].todense().transpose()],axis=0)
    adata1=sc.AnnData(result,obs=adata.obs,var=adata.var)
    return adata1



[docs]def format_xenium_adata_final(path,tag,output_path,use_parquet=True,save=True):
    """ Format xenium data (output from the machine) to adata format using the official up-to-date Xenium format   

    Args:
        path(str): path to the folder where the output of the Xenium machine is stored, if requested.

        tag(str): sample tag to be added to be added to all cells formated from the section.

        output_path(str): path where to store the resulting adata object.

        use_parquet(boolean): whether to use parquet files as an input to generate the AnnData File. (it's way faster).

        save(boolean): whether to save the resulting object.

    results:
        adata: AnnData object with the formated cells

    """
    if os.path.isfile(path+'/cell_feature_matrix/barcodes.tsv')==False:
        with gzip.open(path+'/cell_feature_matrix/barcodes.tsv.gz', 'rb') as f_in:
            with open(path+'/cell_feature_matrix/barcodes.tsv', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    if os.path.isfile(path+'/cell_feature_matrix/features.tsv')==False:
        with gzip.open(path+'/cell_feature_matrix/features.tsv.gz', 'rb') as f_in:
            with open(path+'/cell_feature_matrix/features.tsv', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    if os.path.isfile(path+'/cell_feature_matrix/matrix.mtx')==False:
        with gzip.open(path+'/cell_feature_matrix/matrix.mtx.gz', 'rb') as f_in:
            with open(path+'/cell_feature_matrix/matrix.mtx', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    if os.path.isfile(path+'/cells.csv')==False:
        with gzip.open(path+'/cells.csv.gz', 'rb') as f_in:
            with open(path+'/cells.csv', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
    a = mmread(path+'/cell_feature_matrix/matrix.mtx')
    ad=a.todense()
    cell_info=pd.read_csv(path+r"/cells.csv")
    features=pd.read_csv(path+'/cell_feature_matrix/features.tsv',sep='\t',header=None,index_col=0)
    barcodes=pd.read_csv(path+'/cell_feature_matrix/barcodes.tsv',header=None,index_col=0)
    adata=sc.AnnData(ad.transpose(),obs=cell_info,var=features)
    adata.var.index.name='index'
    adata.var.columns=['gene_id','reason_of_inclusion']
    f = open(path+'/gene_panel.json')
    data = json.load(f)
    geness=[]
    idss=[]
    descriptorss=[]
    for r in range(len(data['payload']['targets'])):
        geness.append(data['payload']['targets'][r]['type']['data']['name'])
        try:
            idss.append(data['payload']['targets'][r]['type']['data']['id'])
        except:
            idss.append('newid_'+str(r)) 
        try:
            descriptorss.append(data['payload']['targets'][r]['type']['descriptor'])
        except:
            descriptorss.append('other')
    f.close()

    dict_inpanel=dict(zip(geness,descriptorss))
    dict_ENSEMBL=dict(zip(geness,idss))
    adata.var['Ensembl ID']=adata.var['gene_id'].map(dict_ENSEMBL)
    adata.var['in_panel']=adata.var['gene_id'].map(dict_inpanel)
    if use_parquet==True:
        transcripts=pd.read_parquet(path+'/transcripts.parquet')
    else:
        if os.path.isfile(path+'/transcripts.csv')==False:
            with gzip.open(path+'/transcripts.csv.gz', 'rb') as f_in:
                with open(path+'/transcripts.csv', 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
        transcripts=pd.read_csv(path+'/transcripts.csv')
    adata.uns['spots']=transcripts
    
    UMAP=pd.read_csv(path+'/analysis/umap/gene_expression_2_components/projection.csv',index_col=0)
    id2UMAP1=dict(zip(UMAP.index,UMAP.iloc[:,0]))
    id2UMAP2=dict(zip(UMAP.index,UMAP.iloc[:,1]))
    adata.obsm['X_umap_original']=np.array([adata.obs['cell_id'].map(id2UMAP1),adata.obs['cell_id'].map(id2UMAP2)]).transpose()
    PCA=pd.read_csv(path+'/analysis/pca/gene_expression_10_components/projection.csv',index_col=0)
    id2PCA1=dict(zip(PCA.index,PCA.iloc[:,0]))
    id2PCA2=dict(zip(PCA.index,PCA.iloc[:,1]))
    adata.obsm['X_pca_10_comp']=np.array([adata.obs['cell_id'].map(id2PCA1),adata.obs['cell_id'].map(id2PCA2)]).transpose()
    dici={'/analysis/clustering/gene_expression_graphclust/clusters.csv':'graph_clusters',
           '/analysis/clustering/gene_expression_kmeans_2_clusters/clusters.csv':'kmeans2',
           '/analysis/clustering/gene_expression_kmeans_3_clusters/clusters.csv':'kmeans3',
           '/analysis/clustering/gene_expression_kmeans_4_clusters/clusters.csv':'kmeans4',
           '/analysis/clustering/gene_expression_kmeans_5_clusters/clusters.csv':'kmeans5',
           '/analysis/clustering/gene_expression_kmeans_6_clusters/clusters.csv':'kmeans6',
           '/analysis/clustering/gene_expression_kmeans_7_clusters/clusters.csv':'kmeans7',
           '/analysis/clustering/gene_expression_kmeans_8_clusters/clusters.csv':'kmeans8',
           '/analysis/clustering/gene_expression_kmeans_9_clusters/clusters.csv':'kmeans9',
          '/analysis/clustering/gene_expression_kmeans_10_clusters/clusters.csv':'kmeans10'
    }
    for subpath in dici.keys():
        clust=pd.read_csv(path+subpath,index_col=0)
        clustdict=dict(zip(clust.index,clust.iloc[:,0]))
        adata.obs[dici[subpath]]=adata.obs['cell_id'].map(clustdict).astype(str)
    adata.X=sp.sparse.csr_matrix(adata.X)
    adata.uns['spots']=adata.uns['spots'].fillna(0)
    adata.uns['spots']['fov_name']=adata.uns['spots']['fov_name'].astype(str)
    adata.obsm['spatial']=np.array([adata.obs['x_centroid'],adata.obs['y_centroid']]).transpose()
    adata.obs['sample']=tag
    if save==True:
        adata.write(output_path+tag+'_original.h5ad')
    return adata

[docs]def keep_nuclei_and_quality(adata1,tag:str,max_nucleus_distance=1,min_quality=20,save=True,output_path=''):
    """ Redefine cell expression based on nuclei expression an quality of detected reads  

    Args:
        adata1 (AnnData): AnnData object with the cells of the experiment before filtereing reads based on quality or nuclear/non-nuclear.

        tag (str): sample tag to added in the name of the saved filed, if needed.

        save(boolean): whether to save the resulting files.

        output_path(str): if needed, where to save the resulting files.

        max_nucleus_distance(float): Maximum distance from the nuclei for reads to be kept in redefined cells.

        min_quality(float): Define minimum quality (qv) of reads to keep in the analysis. 

    results:
        adata1nuc(AnnData): AnnData object with the cells redefined based to input parameters.
    """

    if max_nucleus_distance==0:
        subset1=adata1.uns['spots'].loc[adata1.uns['spots']['overlaps_nucleus']==overlaps_nucleus,:]
    if max_nucleus_distance>0:
        subset1=adata1.uns['spots'].loc[adata1.uns['spots']['nucleus_distance']<max_nucleus_distance,:]
    subset1=subset1[subset1['qv']>min_quality]
    ct1=pd.crosstab(subset1['cell_id'],subset1['feature_name'])
    adataobs=adata1.obs.loc[adata1.obs['cell_id'].isin(ct1.index),:]
    av=adata1.var[adata1.var['gene_id'].isin(ct1.columns)]#.isin(adata1.var.index)
    ct1=ct1.loc[:,av['gene_id']]
    adataobs.index=adataobs['cell_id']
    adataobs.index.name='ind'
    ct1=ct1.loc[ct1.index.isin(adataobs['cell_id']),:]
    adata1nuc=sc.AnnData(np.array(ct1),obs=adataobs,var=av)
    if save==True:
        adata1nuc.write(output_path+tag+'_filtered.h5ad')
    adata1nuc.obsm['spatial']=np.array([adata1nuc.obs['x_centroid'],adata1nuc.obs['y_centroid']]).transpose()
    return adata1nuc

[docs]def format_to_adata(files:list,output_path:str,use_parquet=True,save=False,max_nucleus_distance=0,min_quality=10):
    """ Format xenium datasets (outputs from the machine, up to date 2024) to adata files and filter reads based on quality parameters  

    Args:
        files(list): list including the paths where  the Xenium outputs are saved for each sample (output from the machine).

        output_path(str): path where to store the resulting adata object.

        use_parquet(boolean): whether to use parquet files as an input to generate the AnnData File. (it's way faster).

        save(boolean): whether to save the resulting object.

        max_nucleus_distance: Maximum distance from the nuclei for reads to be kept in redefined cells.

        min_quality(float): Define minimum quality (qv) of reads to keep in the analysis. 

    results:
        adata: AnnData object with the formated cells with only reads that passed the filters established.
    """
    
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    alladata=[]
    for f in files:
        tag=f.split('/')[-1]
        print(f'Formatting {tag}')
        adata=format_xenium_adata_final(f,tag,output_path,use_parquet=use_parquet,save=save)
        print('Filter reads')
        adata_f1=keep_nuclei_and_quality(adata,tag=tag,max_nucleus_distance=max_nucleus_distance,min_quality=min_quality,save=save,output_path=output_path)
        #xf.format_background(f)
        alladata.append(adata_f1)
    adata=sc.concat(alladata,join='outer')
    adata.var=alladata[0].var
    adata.obs['total_counts']=np.sum(adata.X,axis=1)
    adata.obs['expressed_genes']=np.sum(adata.X>0,axis=1)
    adata.obs['unique_cell_id']=adata.obs['cell_id'].astype(str)+'_'+adata.obs['sample'].astype(str)
    adata.var['ENSMBL_ID']=adata.var.index
    adata.var.index=adata.var['gene_id']
    if save==True:
        adata.write(output_path+'combined_filtered.h5ad')
    return adata


[docs]def prep_xenium_data_for_baysor(XENIUM_DIR:str, OUT_DIR:str,CROP=True, COORDS=[15000, 16000, 15000, 16000]):
    """ Format xenium datasets for its use for baysor segmentation   

    Args:
        XENIUM_DIR(list): path where  the Xenium output is saved for each sample (output from the machine).

        OUT_DIR(str): path where to store the resulting adata object.

        CROP(boolean): whether to use a small Region of interest for segmentation.

        COORDS(list): if CROP is used, coordinates of the crop in the form of [YMIN,YMAX,XMIN,XMAX].

    results:
        None.
    """
    
    OUT_DIR=Path(str(OUT_DIR)+'/'+str(XENIUM_DIR.split('/')[-1:][0])+'_baysor')
    XENIUM_DIR=Path(XENIUM_DIR)
    # Get shape of full image
    if not CROP:
        YMAX, XMAX = get_image_shape(XENIUM_DIR / "morphology.ome.tif") 
        YMIN, XMIN = 0, 0
    # Or define a crop
    else:
        YMIN=COORDS[0]
        YMAX=COORDS[1]
        XMIN=COORDS[2]
        XMAX=COORDS[3]


    #-------------------------------------------------------------

    # Create output directory
    OUT_DIR.mkdir(exist_ok=True)
    print("CHECKPOINT 1: Output directory created")

    # Load spots
    spots = pd.read_parquet(XENIUM_DIR / "transcripts.parquet")
    print("CHECKPOINT 2: Spots loaded")

    # Convert physical units of spots to pixel units
    phys_sizes = extract_physical_sizes(XENIUM_DIR / "morphology.ome.tif")
    phys_sizes = phys_sizes[list(phys_sizes.keys())[0]]
    phys_sizes = {k:float(v) for k,v in phys_sizes.items()}
    assert phys_sizes["PhysicalSizeX"] == phys_sizes["PhysicalSizeY"]
    resolution = phys_sizes["PhysicalSizeX"]
    spots["x_location"] /= resolution
    spots["y_location"] /= resolution
    # Note: we scale z µm the same way as x and y because the physical length for z pixels is larger than for xy pixels
    # and Baysor assumes euclidean distances in space.
    if ("z_location" in spots): 
        spots["z_location"] /= resolution
    print("CHECKPOINT 3: Spots converted to pixel units")  
    if CROP:
        # Subset spots to crop
        spots = spots.loc[
            (spots["y_location"] >= YMIN) & (spots["y_location"] < YMAX) & 
            (spots["x_location"] >= XMIN) & (spots["x_location"] < XMAX)
        ]
        
        # Offset positions if YMIN, XMIN are not 0
        spots["x_location"] -= XMIN
        spots["y_location"] -= YMIN
    print("CHECKPOINT 4: Spots cropped")

    # Load polygons
    df_nuc = pd.read_parquet(XENIUM_DIR / "nucleus_boundaries.parquet")
    print("CHECKPOINT 5: Polygons loaded")

    # Convert physical units of polygons to pixel units
    df_nuc["vertex_x"] /= resolution
    df_nuc["vertex_y"] /= resolution

    if CROP:
        # Subset polygons to crop, NOTE: Polygons at border will be cut off
        df_nuc = df_nuc.loc[
            (df_nuc["vertex_y"] >= YMIN) & (df_nuc["vertex_y"] < YMAX) & 
            (df_nuc["vertex_x"] >= XMIN) & (df_nuc["vertex_x"] < XMAX)
        ]
        # Offset positions if YMIN, XMIN are not 0
        df_nuc["vertex_x"] -= XMIN
        df_nuc["vertex_y"] -= YMIN
    print("CHECKPOINT 6: Polygons cropped")

    # Create label image
    
    series = pd.Series(df_nuc['cell_id'])
    unique_numbers, _ = pd.factorize(series, sort=True)
    df_nuc['label_id']=unique_numbers+1
    
    label_image = convert_polygons_to_label_image_xenium(df_nuc, (YMAX-YMIN, XMAX-XMIN),label_col='label_id')
    print("CHECKPOINT 7: Label image created")

    # Save spots, polygons and label image
    spots.to_csv(OUT_DIR / "spots.csv", index=False)
    print("CHECKPOINT 8: Spots saved")
    df_nuc.to_csv(OUT_DIR / "polygons.csv", index=False)
    print("CHECKPOINT 9: Polygons saved")
    tifffile.imwrite(OUT_DIR / "label_image.tif", label_image)
    print("CHECKPOINT 10: Label image saved")

[docs]def batch_prep_xenium_data_for_baysor(files,outpath,CROP=True, COORDS=[1000, 5000, 1000, 5000]):
    """ Running the function prep_xenium_data_for_baysor for multiple samples   

    Args:
        files(list): list including the paths where  the Xenium outputs are saved for each sample (output from the machine).

        outpath(str): path where to store the resulting adata object.

        CROP(boolean): whether to use a small Region of interest for segmentation.

        COORDS(list): if CROP is used, coordinates of the crop in the form of [YMIN,YMAX,XMIN,XMAX].
    results:
        None.
    """

    for f in files:
        prep_xenium_data_for_baysor(f, output_path,CROP=False)
        
[docs]def format_baysor_output_to_adata(path:str,output_path:str):
    """Format baysor's output to anndata  
    
    Args:
        path (AnnData): path to the folder where baysor's output is stored
        output_path(str): path where to store the generated adata

    results:
        adata (AnnData): AnnData object with the cells of the experiment
    """

    read_info=pd.read_csv(path+'/segmentation.csv')
    cell_stats=pd.read_csv(path+'/segmentation_cell_stats.csv')
    read_info['cell_id']=read_info['cell']
    expression=pd.crosstab(read_info['cell'],read_info['gene'])
    cell_stats=cell_stats.loc[cell_stats['cell'].isin(expression.index),:]
    expression=expression.loc[expression.index.isin(cell_stats['cell']),:]
    expression=expression.loc[cell_stats['cell'],:]
    adata=sc.AnnData(expression)
    adata.obs=cell_stats
    adata.uns['spots']=read_info
    adata.obsm['spatial']=np.array(adata.obs.loc[:,['x','y']])
    adata.obs['sample']=str(path.split('/')[-1:][0])
    adata.uns['spots']['cell_id']=adata.uns['spots']['cell_id'].astype(str)
    adata.uns['spots']['cell']=adata.uns['spots']['cell'].astype(str)
    adata.write(output_path+str(path.split('/')[-1:][0])+'.h5ad')
    return adata