Menu

Helper Module for Deep Learning.

Source code for pynet.datasets.kang

# -*- coding: utf-8 -*-
##########################################################################
# NSAp - Copyright (C) CEA, 2021
# Distributed under the terms of the CeCILL-B license, as published by
# the CEA-CNRS-INRIA. Refer to the LICENSE file or to
# http://www.cecill.info/licences/Licence_CeCILL-B_V1-en.html
# for details.
##########################################################################

"""
Module that provides functions to prepare the Kang dataset.

H. M. Kang, et al., Multiplexed droplet single-cell rna-sequencing
using natural genetic variation. Nature biotechnology, 2018.
"""

# Imports
import os
import json
import logging
import subprocess
import requests
from collections import namedtuple
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from pynet.datasets import Fetchers


# Global parameters
Item = namedtuple("Item", ["data", "trainset", "testset", "membership_mask"])
logger = logging.getLogger("pynet")


def _run_wget(cmd, destination):
    """ Run a wget command and remove the output file if the command fails.

    Parameters
    ----------
    cmd: list of str
        the full wget command line.
    destination: str
        the file wget writes to ('-O' target); deleted on failure so that a
        partial download is not mistaken for a cached file on the next call.
    """
    try:
        subprocess.check_call(cmd)
    except (subprocess.CalledProcessError, OSError):
        if os.path.isfile(destination):
            os.remove(destination)
        raise


@Fetchers.register
def fetch_kang(datasetdir, random_state=None):
    """ Download the Kang dataset described in [1].

    [1] H. M. Kang, et al., Multiplexed droplet single-cell rna-sequencing
    using natural genetic variation. Nature biotechnology, 2018.

    Files already present in 'datasetdir' are reused and not downloaded
    again.

    Parameters
    ----------
    datasetdir: str
        the dataset destination folder (created if missing).
    random_state: int, default None
        controls the shuffling applied to the data before applying the split.

    Returns
    -------
    item: namedtuple
        a named tuple containing 'data' (the annotated data object read from
        the cleaned count file), 'trainset' and 'testset' (a 75% / 25% split
        of 'data.X'), and 'membership_mask' (the transposed boolean gene
        set annotations).

    Raises
    ------
    ImportError
        if the 'anndata' package is not installed.
    subprocess.CalledProcessError
        if a wget download fails.
    requests.HTTPError
        if the gene set (GMT) file cannot be retrieved.
    """
    logger.info("Loading Kang dataset.")

    # First import specific modules
    try:
        import anndata
    except ImportError as exc:
        raise ImportError(
            "Please install anndata to use 'fetch_kang'.") from exc

    # Download resources
    url_data = ("https://docs.google.com/uc?export=download&id=1-N7wPpYUf_"
                "QcG5566WVZlaxVC90M7NNE")
    url_gt = ("https://public.bmi.inf.ethz.ch/projects/2020/pmvae/"
              "kang_recons.h5ad")
    url_gmt = ("https://raw.githubusercontent.com/ratschlab/pmvae/main/data/"
               "c2.cp.reactome.v4.0.symbols.gmt")
    origdatapath = os.path.join(datasetdir, "orig_kang_count.h5ad")
    datapath = os.path.join(datasetdir, "kang_count.h5ad")
    gtpath = os.path.join(datasetdir, "kang_recons.h5ad")
    gmtpath = os.path.join(datasetdir, "c2.cp.reactome.v4.0.symbols.gmt")
    # Also creates missing parent folders and tolerates an existing folder.
    os.makedirs(datasetdir, exist_ok=True)
    if not os.path.isfile(origdatapath):
        cmd = ["wget", "--no-check-certificate", url_data, "-O", origdatapath]
        _run_wget(cmd, origdatapath)
    if not os.path.isfile(datapath):
        # Keep only the two observation columns of interest and drop the
        # unstructured / multi-dimensional annotations before caching.
        data = anndata.read(origdatapath)
        data.obs = data.obs[["condition", "cell_type"]]
        data.uns = dict()
        data.obsm = None
        data.varm = None
        data.write(datapath)
    if not os.path.isfile(gtpath):
        cmd = ["wget", url_gt, "-O", gtpath]
        _run_wget(cmd, gtpath)
    if not os.path.isfile(gmtpath):
        response = requests.get(url_gmt)
        # Fail loudly instead of caching an HTTP error page as the GMT file.
        response.raise_for_status()
        with open(gmtpath, "wt") as open_file:
            open_file.write(response.text)

    # Build dataset
    data = anndata.read(datapath)
    data.varm["annotations"] = load_annotations(
        gmtpath, data.var_names, min_genes=13)
    membership_mask = data.varm["annotations"].astype(bool).T
    logger.info("-- membership mask: {0}".format(
        membership_mask.values.shape))
    trainset, testset = train_test_split(
        data.X, test_size=0.25, shuffle=True, random_state=random_state)
    logger.info("-- trainset: {0}".format(trainset.shape))
    logger.info("-- testset: {0}".format(testset.shape))

    return Item(data=data, trainset=trainset, testset=testset,
                membership_mask=membership_mask)
def load_annotations(gmt, genes, min_genes=10):
    """ Build a boolean gene / gene set membership table from a GMT file.

    Parameters
    ----------
    gmt: str
        the path to the GMT file, forwarded to 'parse_gmt'.
    genes: index-like
        the gene symbols used as the table index and forwarded to
        'parse_gmt' as the allowed symbols (presumably a pandas Index such
        as 'var_names' - TODO confirm against callers).
    min_genes: int, default 10
        the minimum gene set size, forwarded to 'parse_gmt'.

    Returns
    -------
    annotations: pandas.DataFrame
        a table indexed by 'genes' with one column per retained gene set,
        True where the gene is listed in the gene set and False elsewhere.
    """
    pathways = parse_gmt(gmt, genes, min_genes)
    table = pd.DataFrame(False, index=genes, columns=list(pathways))
    for name in pathways:
        members = pathways[name]
        table.loc[members, name] = True
    return table
def parse_gmt(path, symbols=None, min_genes=10):
    """ Parse a GMT gene set file into a look-up table.

    Each non-empty line is split on whitespace: the first field is the gene
    set name, the second field is ignored and the remaining fields are the
    gene symbols.

    Parameters
    ----------
    path: str
        the path to the GMT file.
    symbols: pandas.Index or iterable of str, default None
        if set, keep in each gene set only the genes listed in 'symbols'.
        A pandas Index is intersected with its own 'intersection' method;
        any other iterable is filtered preserving the order of the file.
    min_genes: int, default 10
        gene sets with fewer than 'min_genes' (remaining) genes are skipped.

    Returns
    -------
    lut: dict
        a mapping from gene set name to the list of its genes.
    """
    # Build the membership set once for non-Index inputs.
    allowed = None
    if symbols is not None and not isinstance(symbols, pd.Index):
        allowed = set(symbols)

    lut = dict()
    # The context manager guarantees the file handle is closed.
    with open(path, "r") as open_file:
        for line in open_file:
            fields = line.strip().split()
            # Skip blank or malformed lines instead of failing on unpacking.
            if len(fields) < 2:
                continue
            key, _, *genes = fields
            if symbols is not None:
                if allowed is None:
                    genes = symbols.intersection(genes).tolist()
                else:
                    # dict.fromkeys drops duplicates while keeping order.
                    genes = [gene for gene in dict.fromkeys(genes)
                             if gene in allowed]
            if len(genes) < min_genes:
                continue
            lut[key] = genes
    return lut

Follow us

© 2019, pynet developers.
Inspired by AZMIND template.