Helper Module for Deep Learning.
Source code for pynet.datasets.kang
# -*- coding: utf-8 -*-
##########################################################################
# NSAp - Copyright (C) CEA, 2021
# Distributed under the terms of the CeCILL-B license, as published by
# the CEA-CNRS-INRIA. Refer to the LICENSE file or to
# http://www.cecill.info/licences/Licence_CeCILL-B_V1-en.html
# for details.
##########################################################################
"""
Module that provides functions to prepare the Kang dataset.
H. M. Kang, et al., Multiplexed droplet single-cell rna-sequencing
using natural genetic variation. Nature biotechnology, 2018.
"""
# Imports
import os
import json
import logging
import subprocess
import requests
from collections import namedtuple
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from pynet.datasets import Fetchers
# Global parameters
Item = namedtuple("Item", ["data", "trainset", "testset", "membership_mask"])
logger = logging.getLogger("pynet")
[docs]@Fetchers.register
def fetch_kang(datasetdir, random_state=None):
""" Download the Kang dataset described in [1].
[1] H. M. Kang, et al., Multiplexed droplet single-cell rna-sequencing
using natural genetic variation.Nature biotechnology, 2018.
Parameters
----------
datasetdir: str
the dataset destination folder.
random_state: int, default None
controls the shuffling applied to the data before applying the split.
Returns
-------
item: namedtuple
a named tuple containing 'data', 'trainset', 'testset' and
'membership_mask'.
"""
logger.info("Loading Kang dataset.")
# Fisrt import specific modules
try:
import anndata
except:
raise ImportError("Please install anndata to use 'fetch_kang'.")
# Download resources
url_data = ("https://docs.google.com/uc?export=download&id=1-N7wPpYUf_"
"QcG5566WVZlaxVC90M7NNE")
url_gt = ("https://public.bmi.inf.ethz.ch/projects/2020/pmvae/"
"kang_recons.h5ad")
url_gmt = ("https://raw.githubusercontent.com/ratschlab/pmvae/main/data/"
"c2.cp.reactome.v4.0.symbols.gmt")
origdatapath = os.path.join(datasetdir, "orig_kang_count.h5ad")
datapath = os.path.join(datasetdir, "kang_count.h5ad")
gtpath = os.path.join(datasetdir, "kang_recons.h5ad")
gmtpath = os.path.join(datasetdir, "c2.cp.reactome.v4.0.symbols.gmt")
if not os.path.isdir(datasetdir):
os.mkdir(datasetdir)
if not os.path.isfile(origdatapath):
cmd = ["wget", "--no-check-certificate", url_data, "-O", origdatapath]
subprocess.check_call(cmd)
if not os.path.isfile(datapath):
data = anndata.read(origdatapath)
data.obs = data.obs[["condition", "cell_type"]]
data.uns = dict()
data.obsm = None
data.varm = None
data.write(datapath)
if not os.path.isfile(gtpath):
cmd = ["wget", url_gt, "-O", gtpath]
subprocess.check_call(cmd)
if not os.path.isfile(gmtpath):
response = requests.get(url_gmt)
with open(gmtpath, "wt") as open_file:
open_file.write(response.text)
# Build dataset
data = anndata.read(datapath)
data.varm["annotations"] = load_annotations(
gmtpath, data.var_names, min_genes=13)
membership_mask = data.varm["annotations"].astype(bool).T
logger.info("-- membership mask: {0}".format(
membership_mask.values.shape))
trainset, testset = train_test_split(
data.X, test_size=0.25, shuffle=True, random_state=random_state)
logger.info("-- trainset: {0}".format(trainset.shape))
logger.info("-- testset: {0}".format(testset.shape))
return Item(data=data, trainset=trainset, testset=testset,
membership_mask=membership_mask)
[docs]def load_annotations(gmt, genes, min_genes=10):
genesets = parse_gmt(gmt, genes, min_genes)
annotations = pd.DataFrame(False, index=genes, columns=genesets.keys())
for key, genes in genesets.items():
annotations.loc[genes, key] = True
return annotations
[docs]def parse_gmt(path, symbols=None, min_genes=10):
lut = dict()
for line in open(path, "r"):
key, _, *genes = line.strip().split()
if symbols is not None:
genes = symbols.intersection(genes).tolist()
if len(genes) < min_genes:
continue
lut[key] = genes
return lut
Follow us
© 2019, pynet developers .
Inspired by AZMIND template.
Inspired by AZMIND template.