# Helper Module for Deep Learning.
# Source code for pynet.datasets.ukb
# -*- coding: utf-8 -*-
########################################################################
# NSAp - Copyright (C) CEA, 2019
# Distributed under the terms of the CeCILL-B license, as published by
# the CEA-CNRS-INRIA. Refer to the LICENSE file or to
# http://www.cecill.info/licences/Licence_CeCILL-B_V1-en.html
# for details.
########################################################################
"""
This module provides functions to prepare toy datasets from the
UK Biobank (UKB).
1) A toy height example from UKB with no NaN values and known
significant SNPs.
2)
"""
# Imports
import os
import json
import urllib
import shutil
import requests
import logging
import numpy as np
from collections import namedtuple
import pandas as pd
import sklearn
from pynet.datasets import Fetchers
# Global parameters

# Record returned by the fetchers of this module.
Item = namedtuple("Item", "input_path output_path metadata_path labels")

# Source files copied into the dataset folder before processing: the
# phenotype table, the covariates table, the genotype archive and the
# reference table used by the optional consistency check.
FILES = [
    "/neurospin/ukb/derivatives/brainomics_multivariate/" + basename
    for basename in (
        "toy_height.phe",
        "toy_age_sex.cov",
        "toy_chr19_chunk7_nonan.npz",
        "toy_chr19_chunk7_nonan.check",
    )
]

# Pointer to the notebook associated with this dataset.
MSG = " ".join([
    "See https://gitlab.com/brainomics/brainomics_notebooks",
    "and the notebook",
    "notebooks/studies/HEIGHT_UKB_DeepLearning.ipynb",
])

# Logger shared under the 'pynet' name.
logger = logging.getLogger("pynet")
@Fetchers.register
def fetch_height_biobank(datasetdir, to_categorical=False, check=False):
    """ Fetch/prepare the height biobank prediction dataset for pynet.

    Matrix X contains the genotypes read from the 'genotype' entry of the
    'toy_chr19_chunk7_nonan.npz' archive.
    Matrix Y contains the 'toy_height.phe' phenotype residualized, with an
    ordinary least squares fit, against the 'toy_age_sex.cov' covariates.
    The residuals are also binned in three quantile classes that are
    one-hot encoded in the 'Height_1', 'Height_2' and 'Height_3' columns.

    The generated files are reused on later calls; they are regenerated
    as soon as one of them is missing from the destination folder.

    Parameters
    ----------
    datasetdir: str
        the dataset destination folder.
    to_categorical: bool, default False
        if set convert the observation to categories.
    check: bool, default False
        if set check results against the downloaded .check file data

    Returns
    -------
    item: namedtuple
        a named tuple containing 'input_path', 'output_path',
        'metadata_path' and 'labels' ('output_path' and 'labels' are
        always None).

    Raises
    ------
    AssertionError
        if 'check' is set and the data shapes are inconsistent or the
        computed p-values differ from the reference ones.
    """
    logger.info("Loading UK BioBank height dataset.")
    if not os.path.isdir(datasetdir):
        # makedirs also creates the missing parent folders
        os.makedirs(datasetdir)

    desc_path = os.path.join(datasetdir, "pynet_bb_height_pred.tsv")
    desc_categorical_path = os.path.join(
        datasetdir, "pynet_bb_height_categorical_pred.tsv")
    input_path = os.path.join(datasetdir, "pynet_bb_height_pred_inputs.npy")
    generated = (desc_path, desc_categorical_path, input_path)
    file_todel = []
    try:
        # Regenerate when any output is missing, not only the first one:
        # an interrupted run must not leave dangling returned paths.
        if not all(os.path.isfile(path) for path in generated):
            for fname in FILES:
                logger.debug("Processing {0}...".format(fname))
                datafile = os.path.join(datasetdir, os.path.basename(fname))
                if not os.path.isfile(datafile):
                    # registered first so that a partial copy is cleaned
                    file_todel.append(datafile)
                    shutil.copy(fname, datafile)
                else:
                    logger.debug(
                        "Data '{0}' already downloaded.".format(datafile))

            # Get data_x, read from the chunk #7 nan filteredout
            # NOTE(review): allow_pickle=True unpickles arbitrary objects
            # on load; acceptable only because the archive is trusted.
            archive_path = os.path.join(
                datasetdir, "toy_chr19_chunk7_nonan.npz")
            # the context manager closes the archive handle, which is
            # needed to delete the temporary copy afterwards
            with np.load(archive_path, allow_pickle=True) as archive:
                data_x = archive['genotype']
            logger.info("Data X: {0}".format(data_x.shape))

            # Get data_y
            cov = pd.read_csv(
                os.path.join(datasetdir, "toy_age_sex.cov"), sep="\t")
            data_y = pd.read_csv(
                os.path.join(datasetdir, "toy_height.phe"), sep="\t")
            # Cosmetics: drop the identifier columns
            data_y.drop(['FID', 'IID'], axis=1, inplace=True)
            cov.drop(['FID', 'IID'], axis=1, inplace=True)
            logger.info("Data Y: {0}".format(data_y.shape))

            # residualize
            logger.info("Residualize Data Y")
            # local import: statsmodels is only required when the
            # dataset has to be (re)generated
            import statsmodels.api as sm
            y = data_y.values
            X = sm.add_constant(cov.values)
            results = sm.OLS(y, X, missing='drop').fit()
            y_res = y - results.predict(X).reshape(-1, 1)
            data_y['Height'] = y_res
            data_y['HeightCat'] = pd.qcut(
                data_y.Height, q=3, labels=[1, 2, 3])
            # explicit dtype: recent pandas default to boolean dummies,
            # which would be saved as True/False instead of 0/1
            tmpdf = pd.get_dummies(data_y.HeightCat, dtype=np.uint8)
            tmpdf = tmpdf.rename(
                columns={col: "Height_{}".format(col)
                         for col in tmpdf.columns})
            data_y = pd.concat([data_y, tmpdf], axis=1)

            if check:
                # check data coherence
                # explicit raises: unlike 'assert' statements they are
                # not stripped when Python runs with -O
                if data_x.shape[0] != cov.shape[0]:
                    raise AssertionError(
                        "Genotypes and covariates have different numbers "
                        "of samples.")
                if data_x.shape[0] != data_y.shape[0]:
                    raise AssertionError(
                        "Genotypes and phenotypes have different numbers "
                        "of samples.")
                # check univariate SNP p wrt check file
                pvals_res = []
                for idx in range(data_x.shape[1]):
                    snp = sm.add_constant(data_x[:, idx].reshape(-1, 1))
                    results_res = sm.OLS(y_res, snp, missing='drop').fit()
                    # NOTE(review): index 0 is the first design column,
                    # presumably the constant added by add_constant rather
                    # than the SNP - confirm this matches how the .check
                    # file was produced.
                    pvals_res.append(results_res.pvalues[0])
                ref = pd.read_csv(
                    os.path.join(datasetdir,
                                 "toy_chr19_chunk7_nonan.check"),
                    sep="\t")
                ref['runtimeP'] = np.array(pvals_res)
                np.testing.assert_almost_equal(
                    ref['runtimeP'].tolist(),
                    ref['P from residual'].tolist()
                )

            # data_y now holds Height, HeightCat and the one-hot
            # Height_1, Height_2, Height_3 columns
            maskcolumns = data_y.columns.tolist()
            maskcolumns.remove('Height')
            logger.info("Save Data Y")
            data_y[['Height']].to_csv(desc_path, sep="\t", index=False)
            logger.info("Save Data Y (categorical)")
            data_y[maskcolumns].to_csv(desc_categorical_path,
                                       sep="\t", index=False)
            logger.info("Save Data X")
            np.save(input_path, data_x.astype(float))
    finally:
        # Housekeeping: remove the temporary copies even on failure
        for path in file_todel:
            if os.path.isfile(path):
                os.remove(path)

    desc_path = desc_categorical_path if to_categorical else desc_path
    return Item(input_path=input_path, output_path=None,
                metadata_path=desc_path, labels=None)
# Follow us
# © 2019, pynet developers.
# Inspired by AZMIND template.