Menu

Helper Module for Deep Learning.

Source code for pynet.datasets.ukb

# -*- coding: utf-8 -*-
########################################################################
# NSAp - Copyright (C) CEA, 2019
# Distributed under the terms of the CeCILL-B license, as published by
# the CEA-CNRS-INRIA. Refer to the LICENSE file or to
# http://www.cecill.info/licences/Licence_CeCILL-B_V1-en.html
# for details.
########################################################################

"""
This module provides functions to prepare different toy datasets from UKB.
  1) toy example about height in UKB with no NaN and known significant SNPs
  2)
"""

# Imports
import os
import json
import urllib
import shutil
import requests
import logging
import numpy as np
from collections import namedtuple
import pandas as pd
import sklearn
from pynet.datasets import Fetchers


# Global parameters
# Record returned by the fetchers of this module: it carries the fields
# 'input_path', 'output_path', 'metadata_path' and 'labels'.
Item = namedtuple(
    "Item", "input_path output_path metadata_path labels")

# Absolute locations of the toy UKB source files (phenotype, covariates,
# genotype archive and reference check table).
FILES = [
    "/neurospin/ukb/derivatives/brainomics_multivariate/" + basename
    for basename in (
        "toy_height.phe",
        "toy_age_sex.cov",
        "toy_chr19_chunk7_nonan.npz",
        "toy_chr19_chunk7_nonan.check",
    )
]

# Pointer to the notebook associated with this dataset.
MSG = " ".join([
    "See https://gitlab.com/brainomics/brainomics_notebooks",
    "and the notebook",
    "notebooks/studies/HEIGHT_UKB_DeepLearning.ipynb",
])

# Logger shared under the 'pynet' name.
logger = logging.getLogger("pynet")


@Fetchers.register
def fetch_height_biobank(datasetdir, to_categorical=False, check=False):
    """ Fetch/prepare the height biobank prediction dataset for pynet.

    The source files listed in FILES are copied in the dataset folder, then:

    - data X is the 'genotype' array of 'toy_chr19_chunk7_nonan.npz',
      saved as floats in 'pynet_bb_height_pred_inputs.npy'.
    - data Y is the 'toy_height.phe' phenotype residualized on the
      'toy_age_sex.cov' covariates (plus an intercept) with an ordinary
      least squares fit, saved in 'pynet_bb_height_pred.tsv'.
    - a categorical version of data Y (three quantile-based classes in
      'HeightCat' and the associated 'Height_<class>' indicator columns) is
      saved in 'pynet_bb_height_categorical_pred.tsv'.

    The preparation is skipped when the three generated files are already
    available in the dataset folder. The source files copied by this call
    are removed once the preparation is finished.

    Parameters
    ----------
    datasetdir: str
        the dataset destination folder.
    to_categorical: bool, default False
        if set convert the observation to categories.
    check: bool, default False
        if set check results against the downloaded .check file data. The
        check is only performed when the dataset is (re)generated.

    Returns
    -------
    item: namedtuple
        a named tuple containing 'input_path', 'output_path',
        'metadata_path' and 'labels' ('output_path' and 'labels' are always
        None).

    Raises
    ------
    AssertionError
        if 'check' is set and the data are not coherent with the reference
        .check file.
    """
    logger.info("Loading UK BioBank height dataset.")
    # makedirs also creates the missing parent folders.
    os.makedirs(datasetdir, exist_ok=True)

    desc_path = os.path.join(datasetdir, "pynet_bb_height_pred.tsv")
    desc_categorical_path = os.path.join(
        datasetdir, "pynet_bb_height_categorical_pred.tsv")
    input_path = os.path.join(datasetdir, "pynet_bb_height_pred_inputs.npy")

    # Regenerate the dataset as soon as one of the outputs is missing: a
    # previous interrupted run may have written only some of them.
    outputs = (desc_path, desc_categorical_path, input_path)
    if not all(os.path.isfile(path) for path in outputs):
        # Function-local import, as in the original code: fail before any
        # file is copied if the package is not available.
        import statsmodels.api as sm

        file_todel = []
        try:
            for fname in FILES:
                logger.debug("Processing {0}...".format(fname))
                datafile = os.path.join(datasetdir, os.path.basename(fname))
                if not os.path.isfile(datafile):
                    shutil.copy(fname, datafile)
                    file_todel.append(datafile)
                else:
                    logger.debug(
                        "Data '{0}' already downloaded.".format(datafile))

            # Get data_x, read from the chunk #7 nan filteredout
            # The archive is closed explicitly since it may be deleted
            # during the housekeeping step.
            # NOTE(review): allow_pickle=True executes pickled content, keep
            # it only for trusted source files.
            npz_path = os.path.join(datasetdir, "toy_chr19_chunk7_nonan.npz")
            with np.load(npz_path, allow_pickle=True) as archive:
                data_x = archive["genotype"]
            logger.info("Data X: {0}".format(data_x.shape))

            # Get data_y
            cov = pd.read_csv(
                os.path.join(datasetdir, "toy_age_sex.cov"), sep="\t")
            data_y = pd.read_csv(
                os.path.join(datasetdir, "toy_height.phe"), sep="\t")
            # Cosmetics: remove the subject identifiers
            data_y.drop(["FID", "IID"], axis=1, inplace=True)
            cov.drop(["FID", "IID"], axis=1, inplace=True)
            logger.info("Data Y: {0}".format(data_y.shape))

            # Residualize: remove the covariates (and intercept) effect
            logger.info("Residualize Data Y")
            y = data_y.values
            X = sm.add_constant(cov.values)
            results = sm.OLS(y, X, missing="drop").fit()
            y_res = y - results.predict(X).reshape(-1, 1)
            data_y["Height"] = y_res

            # Three quantile-based classes and their indicator columns
            data_y["HeightCat"] = pd.qcut(
                data_y.Height, q=3, labels=[1, 2, 3])
            tmpdf = pd.get_dummies(data_y.HeightCat)
            tmpdf.columns = [
                "Height_{}".format(name) for name in tmpdf.columns]
            data_y = pd.concat([data_y, tmpdf], axis=1)

            if check:
                # Check data coherence: explicit raises are used since
                # 'assert' statements are stripped with python -O.
                if data_x.shape[0] != cov.shape[0]:
                    raise AssertionError(
                        "Data X and covariates have a different number of "
                        "samples: {0} vs {1}.".format(
                            data_x.shape[0], cov.shape[0]))
                if data_x.shape[0] != data_y.shape[0]:
                    raise AssertionError(
                        "Data X and data Y have a different number of "
                        "samples: {0} vs {1}.".format(
                            data_x.shape[0], data_y.shape[0]))

                # Check univariate SNP p-values wrt check file
                pvals_res = []
                for idx in range(data_x.shape[1]):
                    snp = sm.add_constant(data_x[:, idx].reshape(-1, 1))
                    results_res = sm.OLS(y_res, snp, missing="drop").fit()
                    # NOTE(review): add_constant prepends the intercept by
                    # default, so index 0 looks like the intercept p-value
                    # and the SNP p-value would be at index 1 - confirm
                    # against the way the .check file was generated.
                    pvals_res.append(results_res.pvalues[0])
                pvals_res = np.array(pvals_res)
                ref = pd.read_csv(
                    os.path.join(datasetdir, "toy_chr19_chunk7_nonan.check"),
                    sep="\t")
                ref["runtimeP"] = pvals_res
                np.testing.assert_almost_equal(
                    ref["runtimeP"].tolist(),
                    ref["P from residual"].tolist()
                )

            # Now data_y columns are Height, HeightCat, Height_1, ..
            maskcolumns = [
                name for name in data_y.columns if name != "Height"]
            logger.info("Save Data Y")
            data_y[["Height"]].to_csv(desc_path, sep="\t", index=False)
            logger.info("Save Data Y (categorical)")
            data_y[maskcolumns].to_csv(
                desc_categorical_path, sep="\t", index=False)
            logger.info("Save Data X")
            np.save(input_path, data_x.astype(float))
        finally:
            # Housekeeping: remove the files copied by this call, even if
            # the preparation failed.
            for path in file_todel:
                os.remove(path)

    metadata_path = desc_categorical_path if to_categorical else desc_path

    return Item(input_path=input_path, output_path=None,
                metadata_path=metadata_path, labels=None)

Follow us

© 2019, pynet developers.
Inspired by AZMIND template.