Skip to content

Commit 7152867

Browse files
committed
Update to v0.2.0
1 parent 0925377 commit 7152867

31 files changed

+272
-1237
lines changed

DEVELOPER.md

Lines changed: 0 additions & 39 deletions
This file was deleted.

README.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,7 @@ install_pyg_dependencies
3737
(Optional) You can install [RAPIDS](https://docs.rapids.ai/install) to accelerate visualization.
3838

3939
```sh
40-
mamba create -n decipher -c conda-forge -c rapidsai -c nvidia python=3.11 rapids=24.12 'cuda-version>=12.0,<=12.2' -y && conda activate decipher
40+
mamba create -n decipher -c conda-forge -c rapidsai -c nvidia python=3.11 rapids=25.06 'cuda-version>=12.0,<=12.8' -y && conda activate decipher
4141
pip install cell-decipher
4242
install_pyg_dependencies
4343
```

decipher/cls.py

Lines changed: 5 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -8,10 +8,8 @@
88

99
import numpy as np
1010
import pandas as pd
11-
import scanpy as sc
1211
import scipy.sparse as sps
1312
import torch
14-
import torch.nn.functional as F
1513
import yaml
1614
from addict import Dict
1715
from anndata import AnnData
@@ -26,8 +24,7 @@
2624
from .explain.gene.mixin import GeneSelectMixin
2725
from .explain.regress.mixin import RegressMixin
2826
from .graphic.build import build_graph
29-
from .plot import plot_sc
30-
from .utils import CFG, estimate_spot_size, global_seed, scanpy_viz, sync_config
27+
from .utils import CFG, global_seed, l2norm, sync_config
3128

3229

3330
class DECIPHER(RegressMixin, GeneSelectMixin, MNNMixin, DDPMixin):
@@ -157,60 +154,24 @@ def fit_omics(self) -> None:
157154
mnn_dataset = MNNDataset(self.x, self.valid_cellidx, self.mnn_dict)
158155
logger.info(f"Using MNN with {len(np.unique(self.batch))} batches.")
159156
# train model
160-
sc_model, center_emb_pretrain = sc_emb(
161-
self.x, self.cfg.omics, mnn_dataset, self.meta, self.batch
162-
)
157+
sc_model, center_emb_pretrain = sc_emb(self.x, self.cfg.omics, mnn_dataset, self.batch)
163158
center_emb, self.nbr_emb = spatial_emb(
164159
self.x,
165160
self.edge_index,
166161
self.cfg.omics,
167162
mnn_dataset,
168-
self.meta,
169163
sc_model,
170164
self.batch,
171165
)
172166
self.center_emb = center_emb_pretrain if center_emb_pretrain else center_emb
173-
# as float
174-
self.center_emb = self.center_emb.astype(np.float32)
175-
self.nbr_emb = self.nbr_emb.astype(np.float32)
167+
# norm
168+
self.center_emb = l2norm(self.center_emb.astype(np.float32))
169+
self.nbr_emb = l2norm(self.nbr_emb.astype(np.float32))
176170
# save embeddings
177171
np.save(self.work_dir / "center_emb.npy", self.center_emb)
178172
np.save(self.work_dir / "nbr_emb.npy", self.nbr_emb)
179173
logger.info(f"Results saved to {self.work_dir}")
180174

181-
def visualize(self, resolution: float = 0.5) -> None:
182-
r"""
183-
Visualize results, should run after `fit_omics`
184-
185-
Parameters
186-
----------
187-
resolution
188-
resolution for clustering
189-
"""
190-
if (self.work_dir / "embedding.h5ad").exists():
191-
adata = sc.read_h5ad(self.work_dir / "embedding.h5ad")
192-
else:
193-
norm_center = F.normalize(torch.tensor(self.center_emb)).numpy()
194-
norm_nbr = F.normalize(torch.tensor(self.nbr_emb)).numpy()
195-
adata = sc.AnnData(
196-
X=np.zeros((self.center_emb.shape[0], 1)),
197-
obsm={
198-
"X_center": self.center_emb,
199-
"X_nbr": self.nbr_emb,
200-
"X_merge": np.hstack([norm_center, norm_nbr]),
201-
"spatial": self.coords,
202-
},
203-
obs=self.meta.astype(str),
204-
)
205-
adata.uns["spot_size"] = estimate_spot_size(adata.obsm["spatial"])
206-
adata = scanpy_viz(adata, resolution=resolution)
207-
adata.write_h5ad(self.work_dir / "embedding.h5ad")
208-
color_vars = ["leiden_center", "leiden_nbr"]
209-
for var in ["_celltype", "_batch"]:
210-
if var in adata.obs.columns:
211-
color_vars.append(var)
212-
plot_sc(adata, color_vars)
213-
214175
def load(self, from_dir: str | Path = None) -> None:
215176
r"""
216177
Load saved results, should run after `register_data`

decipher/data/mnn_dataset.py

Lines changed: 11 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -19,13 +19,6 @@
1919
from torch_geometric.data import Data
2020
from torch_geometric.data.lightning import LightningNodeData
2121

22-
try:
23-
import cupy as cp
24-
25-
CUPY_AVAILABLE = True
26-
except ImportError:
27-
CUPY_AVAILABLE = False
28-
2922
from ..graphic.knn import knn
3023
from ..utils import l2norm
3124

@@ -175,7 +168,7 @@ def train_dataloader(self):
175168
combined_loader = CombinedLoader(loaders, mode="max_size_cycle")
176169
return combined_loader
177170

178-
def val_dataloader(self):
171+
def test_dataloader(self):
179172
val_cfg = deepcopy(self.loader_config)
180173
val_cfg.update({"batch_size": 1024, "shuffle": False, "drop_last": False})
181174
return DataLoader(self.val_dataset, **val_cfg)
@@ -275,7 +268,7 @@ def svd(x: np.ndarray, y: np.ndarray, k_components: int = 20) -> tuple[np.ndarra
275268
"""
276269
logger.debug(f"x shape: {x.shape}, y shape: {y.shape}")
277270
if x.shape[0] > 1_000_000 or y.shape[0] > 1_000_000:
278-
logger.debug("Use harmony for large dataset.")
271+
logger.debug("Use harmony-based SVD for large dataset.")
279272
from harmony import harmonize
280273

281274
# batch
@@ -287,22 +280,16 @@ def svd(x: np.ndarray, y: np.ndarray, k_components: int = 20) -> tuple[np.ndarra
287280
# harmonize
288281
z_norm = harmonize(z, batch, "batch", use_gpu=True)
289282
return z_norm
290-
elif x.shape[0] > 200_000 or y.shape[0] > 200_000:
291-
logger.debug("Use CPU for middle dataset")
292-
dot = torch.from_numpy(x) @ torch.from_numpy(y).T # faster than np
283+
284+
try:
285+
dot = torch.from_numpy(x).cuda().half() @ torch.from_numpy(y).T.cuda().half()
286+
dot = dot.cpu().float().numpy()
287+
logger.info("Use CUDA for small dataset")
288+
except: # noqa
289+
logger.error(f"CUDA failed: {x.shape}, {y.shape}, use CPU instead.")
290+
dot = torch.from_numpy(x) @ torch.from_numpy(y).T
293291
dot = dot.numpy()
294-
else:
295-
try:
296-
dot = torch.from_numpy(x).cuda().half() @ torch.from_numpy(y).T.cuda().half()
297-
if CUPY_AVAILABLE:
298-
dot = cp.asarray(dot.to(torch.float32)).get()
299-
else:
300-
dot = dot.cpu().float().numpy()
301-
logger.info("Use CUDA for small dataset")
302-
except: # noqa
303-
logger.error("CUDA failed")
304-
dot = torch.from_numpy(x) @ torch.from_numpy(y).T
305-
dot = dot.numpy()
292+
torch.cuda.empty_cache()
306293
u, s, vh = randomized_svd(dot, n_components=k_components, random_state=0)
307294
z = np.vstack([u, vh.T]) # gene x k_components
308295
z = z @ np.sqrt(np.diag(s)) # will reduce the MNN pairs number greatly

decipher/ddp.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,7 @@ def fit_ddp(self, gpus: int = -1, ddp_pretrain: bool = False) -> None:
4141
logger.warning("Using DDP with < 500k cells is not recommended.")
4242

4343
max_gpus = torch.cuda.device_count()
44+
assert max_gpus > 1, "DDP requires at least 2 GPUs."
4445
gpus = min(gpus, max_gpus) if gpus > 0 else max_gpus
4546

4647
if ddp_pretrain:

decipher/emb.py

Lines changed: 9 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,6 @@
66
from pathlib import Path
77

88
import numpy as np
9-
import pandas as pd
109
import torch
1110
from addict import Dict
1211
from loguru import logger
@@ -58,7 +57,6 @@ def spatial_emb(
5857
spatial_edge: Tensor,
5958
config: Dict,
6059
mnn_dataset: Dataset = None,
61-
meta: pd.DataFrame = None,
6260
pretrained_model: ScSimCLR = None,
6361
batch: np.ndarray = None,
6462
DDP: bool = False,
@@ -76,8 +74,6 @@ def spatial_emb(
7674
model config
7775
mnn_dataset
7876
mnn dataset
79-
meta
80-
meta of cells
8177
pretrained_model
8278
pre-trained single cell model
8379
batch
@@ -99,9 +95,9 @@ def spatial_emb(
9995
datamodule = get_graph_datamodule(graph, config, mnn_dataset)
10096

10197
if mnn_dataset is None:
102-
model = OmicsSpatialSimCLR(config.model, meta)
98+
model = OmicsSpatialSimCLR(config.model)
10399
else:
104-
model = OmicsSpatialSimCLRMNN(config.model, meta)
100+
model = OmicsSpatialSimCLRMNN(config.model)
105101

106102
if pretrained_model is not None:
107103
model.center_encoder = deepcopy(pretrained_model.center_encoder)
@@ -121,7 +117,6 @@ def sc_emb(
121117
x: np.ndarray,
122118
config: Dict,
123119
mnn_dataset: Dataset = None,
124-
meta: pd.DataFrame = None,
125120
batch: np.ndarray = None,
126121
) -> tuple[ScSimCLR, np.ndarray | None]:
127122
r"""
@@ -135,8 +130,6 @@ def sc_emb(
135130
model config
136131
mnn_dataset:
137132
mnn dataset
138-
meta:
139-
meta of cells
140133
batch:
141134
batch index
142135
@@ -152,7 +145,7 @@ def sc_emb(
152145
mnn_flag = True if mnn_dataset is not None else False
153146
if not config.pretrain.force:
154147
try:
155-
return load_sc_model(config, mnn_flag, meta), None
148+
return load_sc_model(config, mnn_flag), None
156149
except Exception as e: # noqa
157150
logger.info(f"Not found pre-trained model: {e}")
158151

@@ -166,9 +159,9 @@ def sc_emb(
166159
datamodule = LightningScMNNData(config.loader, train_dataset, val_dataset, mnn_dataset)
167160

168161
if mnn_flag:
169-
model = ScSimCLRMNN(meta, config.model)
162+
model = ScSimCLRMNN(config.model)
170163
else:
171-
model = ScSimCLR(meta, config.model)
164+
model = ScSimCLR(config.model)
172165

173166
if config.model.fix_sc:
174167
fit_and_inference(model, datamodule, config.model, show_name="single cell")
@@ -179,7 +172,7 @@ def sc_emb(
179172
return model, center_emb
180173

181174

182-
def load_sc_model(config, mnn_flag: bool, meta: pd.DataFrame = None):
175+
def load_sc_model(config, mnn_flag: bool):
183176
r"""
184177
Load omics encoder model
185178
@@ -189,14 +182,12 @@ def load_sc_model(config, mnn_flag: bool, meta: pd.DataFrame = None):
189182
model config
190183
mnn_flag
191184
whether use mnn
192-
meta
193-
meta of cells
194185
"""
195186
model_path = Path(config.model.work_dir) / "pretrain"
196187
# sort by modification time
197188
model_path = sorted(model_path.glob("*.ckpt"), key=os.path.getmtime)[-1]
198189
logger.info(f"Loading model from {model_path}")
199-
kwargs = {"meta": meta, "config": config.model}
190+
kwargs = {"config": config.model}
200191
if mnn_flag:
201192
sc_model = ScSimCLRMNN.load_from_checkpoint(model_path, **kwargs)
202193
else:
@@ -205,7 +196,7 @@ def load_sc_model(config, mnn_flag: bool, meta: pd.DataFrame = None):
205196
return sc_model
206197

207198

208-
def load_spatial_model(config, mnn_flag: bool, meta: pd.DataFrame = None):
199+
def load_spatial_model(config, mnn_flag: bool):
209200
r"""
210201
Load decipher spatial model
211202
@@ -215,14 +206,12 @@ def load_spatial_model(config, mnn_flag: bool, meta: pd.DataFrame = None):
215206
model config
216207
mnn_flag
217208
whether use mnn
218-
meta
219-
meta of cells
220209
"""
221210
model_path = Path(config.model.work_dir) / "model"
222211
model_path = sorted(model_path.glob("*.ckpt"), key=os.path.getmtime)[-1]
223212
logger.info(f"Loading model from {model_path}")
224213
config.model.device_num = 1
225-
kwargs = {"config": config.model, "meta": meta}
214+
kwargs = {"config": config.model}
226215
if mnn_flag:
227216
model = OmicsSpatialSimCLRMNN.load_from_checkpoint(model_path, **kwargs)
228217
else:

decipher/graphic/knn.py

Lines changed: 4 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -6,13 +6,7 @@
66
from annoy import AnnoyIndex
77
from loguru import logger
88

9-
try:
10-
from cuml.neighbors import NearestNeighbors as cuNearestNeighbors
11-
12-
CUML_FLAG = True
13-
except ImportError:
14-
CUML_FLAG = False
15-
logger.warning("cuML is not available.")
9+
from ..utils import RSC_FLAG
1610

1711

1812
def knn(
@@ -52,7 +46,7 @@ def knn(
5246
if method == "auto":
5347
method = ["cuml", "faiss", "annoy"]
5448
method = method if isinstance(method, list) else [method]
55-
if not CUML_FLAG and "cuml" in method:
49+
if not RSC_FLAG and "cuml" in method:
5650
method.remove("cuml")
5751
if not approx and "annoy" in method:
5852
method.remove("annoy")
@@ -123,6 +117,8 @@ def knn_cuml(
123117
r"""
124118
Build k-NN graph by cuML
125119
"""
120+
from cuml.neighbors import NearestNeighbors as cuNearestNeighbors
121+
126122
model = cuNearestNeighbors(n_neighbors=k, metric=metric)
127123
model.fit(ref)
128124
distances, indices = model.kneighbors(query)

0 commit comments

Comments (0)