python/salus/openstmerge.py

   1 #!/usr/bin/env python3
   2
   3 import sys
   4 import os
   5 import glob
   6 from xopen import xopen
   7 import fast_matrix_market as fmm
   8 import scipy.sparse
   9 import numpy as np
  10 import scanpy as sc
  11 import anndata as ad
  12 import pandas as pd
  13
  14 import logging
  15 logging.basicConfig(format='%(asctime)s.%(msecs)03d %(levelname)-8s|%(message)s\t|%(module)s:%(funcName)s:%(lineno)d|%(process)d:%(thread)d|%(pathname)s',
  16     datefmt='%Y-%m-%dT%H:%M:%S',level=logging.INFO)
  17 logger = logging.getLogger(__name__)
  18
  19 from concurrent.futures import ThreadPoolExecutor, as_completed
  20 def export_adata(adata, path):
  21     """
  22     Export AnnData object to 10x Genomics format with gzip compression using xopen.
  23     Parameters:
  24     adata : AnnData
  25         The AnnData object to export.
  26     path : str
  27         The directory where the files should be saved.
  28     """
  29     # Create destination directory if it doesn't exist
  30     os.makedirs(path, exist_ok=True)
  31     # Step 1: Append spatial coordinates directly to obs
  32     if 'spatial' in adata.obsm:
  33         adata.obs['m_spatial_x'] = adata.obsm['spatial'][:, 0]  # X coordinates
  34         adata.obs['m_spatial_y'] = adata.obsm['spatial'][:, 1]  # Y coordinates
  35     # Prepare file paths
  36     genes_path = os.path.join(path, "features.tsv.gz")
  37     barcodes_path = os.path.join(path, "barcodes.tsv.gz")
  38     spatial_path = os.path.join(path, "spatial.txt.gz")
  39     matrix_path = os.path.join(path, "matrix.mtx.gz")
  40     # Prepare dataframes for writing
  41     genes_df = pd.DataFrame({
  42         'gene_name': adata.var.index,
  43         'feature_types': "Gene Expression"
  44     }, index=['g' + str(i + 1) for i in range(len(adata.var.index))])  # Set gene_id as index
  45     barcodes_with_spatial = adata.obs[['m_spatial_x', 'm_spatial_y']]
  46     obs_df = adata.obs
  47     # Ensure that adata.X is a sparse matrix
  48     if not isinstance(adata.X, scipy.sparse.spmatrix):
  49         adata.X = scipy.sparse.csr_matrix(adata.X)
  50     # Check equality of float and int representations
  51     float_matrix = adata.X
  52     int_matrix = float_matrix.astype(int)
  53     # If the two matrices are equal, convert to integer
  54     if (float_matrix != int_matrix).nnz == 0:
  55         adata.X = int_matrix  # Set X to its integer representation
  56
  57     compression_parameters = {'method': 'gzip', 'mtime': 1}
  58     def write_genes():
  59         genes_df.to_csv(
  60             genes_path,
  61             sep="\t",
  62             index=True,  # Write index as well, which will now be gene_id
  63             header=False,
  64             compression=compression_parameters
  65         )
  66     def write_barcodes():
  67         # Write only the index (barcodes) to the file, without a header
  68         barcodes_with_spatial.index.to_series().to_csv(
  69             barcodes_path,
  70             sep="\t",
  71             index=False,
  72             header=False,
  73             compression=compression_parameters
  74         )
  75     def write_spatial():
  76         barcodes_with_spatial.to_csv(
  77             spatial_path,
  78             sep=" ",
  79             index=True,
  80             header=False,
  81             compression={**compression_parameters, 'compresslevel': 1}
  82         )
  83     def write_matrix():
  84         with xopen(matrix_path, 'wb') as f:
  85             fmm.mmwrite(f, adata.X.T)
  86
  87     # Use ThreadPoolExecutor to write files concurrently
  88     with ThreadPoolExecutor(max_workers=4) as executor:
  89         futures = {
  90             executor.submit(write_genes): "genes",
  91             executor.submit(write_barcodes): "barcodes",
  92             executor.submit(write_spatial): "spatial",
  93             executor.submit(write_matrix): "matrix"
  94         }
  95         for future in as_completed(futures):
  96             task_name = futures[future]
  97             future.result()  # Get result to raise exceptions if any
  98 # export_adata(combined_adata, 'path/to/output_directory')
  99
 100 def cs_read(incsv, dgepath):
 101     csdf = pd.read_csv(incsv)
 102     csdf['h5ad_path'] = pd.NA
 103     pattern = 'dge.all.polyA_adapter_trimmed.mm_included.spatial_beads_*.h5ad'
 104     search_files = glob.glob(os.path.join(dgepath, pattern))
 105     for file_path in search_files:
 106         base_name = os.path.basename(file_path)
 107         puck_id = base_name.split('.')[-2].replace('spatial_beads_', '')
 108         if puck_id in csdf['puck_id'].values:
 109             # Get the corresponding row
 110             row_index = csdf[csdf['puck_id'] == puck_id].index[0]
 111             csdf.at[row_index, 'h5ad_path'] = file_path
 112             x_offset = csdf.at[row_index, 'x_offset']
 113             y_offset = csdf.at[row_index, 'y_offset']
 114             #print(f"[{puck_id}, {x_offset}, {y_offset}] {file_path}")
 115     csdfused = csdf[csdf['h5ad_path'].notna()].drop(columns=['z_offset'])
 116     return csdfused
 117
 118 def sp_concat(csdf):
 119     adatas = []
 120     for index, row in csdf.iterrows():
 121         h5ad_path = row['h5ad_path']
 122         adata = sc.read_h5ad(h5ad_path)
 123         logger.info(f'Load anndata {row["puck_id"]}: {adata.shape}')
 124         # Apply x_offset and y_offset
 125         x_offset = row['x_offset']
 126         y_offset = row['y_offset']
 127         # Shift the spatial coordinates
 128         if 'spatial' in adata.obsm.keys():
 129             # Create a 1D array for offsets
 130             offsets = np.array([x_offset, y_offset])
 131             adata.obsm['spatial'] += offsets  # Adjust the spatial coordinates
 132             logger.info(f'Shift anndata {row["puck_id"]}: x+={x_offset}, y+={y_offset}')
 133         # Append the modified AnnData object to the list
 134         adatas.append(adata)
 135     # Step 2: Use anndata.concat to merge all AnnData objects with outer join on .var
 136     combined_adata = ad.concat(adatas, join='outer', label='source')
 137     return combined_adata
 138
 139 def main() -> None:
 140     if len(sys.argv) < 4:
 141         print(f'Usage: {sys.argv[0]} <coordinate_system.csv> <spacemake dge path> <outpath>', file=sys.stderr, flush=True)
 142         exit(0);
 143     elif len(sys.argv) >= 4:
 144         incsv = sys.argv[1]
 145         dgepath = sys.argv[2]
 146         outpath = sys.argv[3]
 147         logger.info(f'Coordinate:[{incsv}],DGE:[{dgepath}] => [{outpath}]/xxx')
 148     csdf = cs_read(incsv,dgepath)
 149     logger.info(f'Load Coordinate_system: {len(csdf)}')
 150     print(csdf)
 151     os.makedirs(outpath, exist_ok=True)
 152     combined_adata = sp_concat(csdf)
 153     logger.info(f'Merged anndata: {combined_adata.shape}')
 154     h5adfilename = os.path.join(outpath, 'raw_puck_collection.h5ad')
 155     combined_adata.write_h5ad(h5adfilename,compression='lzf')
 156     logger.info(f'Saved anndata: {h5adfilename}')
 157     mtxpath = os.path.join(outpath, 'raw')
 158     export_adata(combined_adata, mtxpath)
 159     logger.info(f'Exported anndata: {mtxpath}')
 160
 161 if __name__ == "__main__":
 162     main()  # ./openstmerge.py puck_data/fc_2_coordinate_system.csv projects/mbrain/processed_data/openst_musbrain/illumina/complete_data/dge combined_mtx4