# Helper Module for Deep Learning.
# Source code for pynet.datasets.genomic
# -*- coding: utf-8 -*-
##########################################################################
# NSAp - Copyright (C) CEA, 2019
# Distributed under the terms of the CeCILL-B license, as published by
# the CEA-CNRS-INRIA. Refer to the LICENSE file or to
# http://www.cecill.info/licences/Licence_CeCILL-B_V1-en.html
# for details.
##########################################################################
"""
Module that provides functions to prepare the genomic dataset.
"""
# Imports
import os
import json
import urllib
import shutil
import requests
import logging
import numpy as np
from collections import namedtuple
import pandas as pd
from pynet.datasets import Fetchers
# Global parameters
# Container returned by the fetchers: paths to the prepared input/output
# arrays and to the metadata table, plus the optional labels.
Item = namedtuple(
    "Item", "input_path output_path metadata_path labels")

# Raw wheat files to download: marker genotypes (X) and grain yields (Y).
URLS = [
    "https://raw.githubusercontent.com/miguelperezenciso/DLpipeline/master/"
    "DATA/wheat.X",
    "https://raw.githubusercontent.com/miguelperezenciso/DLpipeline/master/"
    "DATA/wheat.Y",
]

# Logger shared under the 'pynet' name.
logger = logging.getLogger("pynet")
@Fetchers.register
def fetch_genomic_pred(datasetdir, to_categorical=False):
    """ Fetch/prepare the genomic prediction dataset for pynet.

    Matrix Y contains the average grain yield, column 1: Grain yield for
    environment 1 and so on.
    Matrix X contains marker genotypes.

    The raw 'wheat.X' and 'wheat.Y' files are downloaded in 'datasetdir'
    when not already there, then three files are generated in the same
    folder:

    - 'pynet_genomic_pred_inputs.npy': the X matrix saved as floats.
    - 'pynet_genomic_pred.tsv': one 'env<idx>' column per Y column.
    - 'pynet_genomic_categorical_pred.tsv': for each Y column, the binned
      'env<idx>' values and one 'env<idx>_cat<label>' indicator column per
      observed bin.

    The generation step is skipped when the three files already exist.

    Parameters
    ----------
    datasetdir: str
        the dataset destination folder.
    to_categorical: bool, default False
        if set convert the observation to categories.

    Returns
    -------
    item: namedtuple
        a named tuple containing 'input_path', 'output_path',
        'metadata_path' and 'labels'. The 'output_path' and 'labels'
        fields are always None, and 'metadata_path' points to the
        categorical table when 'to_categorical' is set.

    Raises
    ------
    requests.exceptions.RequestException
        if one of the raw files cannot be downloaded.
    """
    logger.info("Loading genomic prediction dataset.")
    if not os.path.isdir(datasetdir):
        # makedirs also creates the missing parent folders.
        os.makedirs(datasetdir)
    desc_path = os.path.join(datasetdir, "pynet_genomic_pred.tsv")
    desc_categorical_path = os.path.join(
        datasetdir, "pynet_genomic_categorical_pred.tsv")
    input_path = os.path.join(datasetdir, "pynet_genomic_pred_inputs.npy")

    # Regenerate everything if any of the generated files is missing, so that
    # the returned paths always point to existing files.
    generated_paths = (desc_path, desc_categorical_path, input_path)
    if not all(os.path.isfile(path) for path in generated_paths):
        for url in URLS:
            logger.debug("Processing {0}...".format(url))
            # URLs are always '/'-separated, whatever the platform os.sep is.
            basename = url.rsplit("/", 1)[-1]
            datafile = os.path.join(datasetdir, basename)
            if not os.path.isfile(datafile):
                response = requests.get(url, timeout=60)
                # Fail loudly rather than caching an HTTP error page as data.
                response.raise_for_status()
                with open(datafile, "wt") as out_file:
                    out_file.write(response.text)
                del response
            else:
                logger.debug(
                    "Data '{0}' already downloaded.".format(datafile))

        # Both raw files are whitespace-separated tables without header.
        data_x = pd.read_csv(
            os.path.join(datasetdir, "wheat.X"), header=None, sep=r"\s+")
        data_y = pd.read_csv(
            os.path.join(datasetdir, "wheat.Y"), header=None, sep=r"\s+")
        logger.debug("Data X: {0}".format(data_x.shape))
        logger.debug("Data Y: {0}".format(data_y.shape))
        np.save(input_path, data_x.values.astype(float))

        # Transpose in order to iterate over the Y columns (environments).
        data_y = data_y.values.T
        metadata = {
            "env{0}".format(idx): val for idx, val in enumerate(data_y)}
        df = pd.DataFrame.from_dict(metadata)

        # Bin each environment: shift by the minimum, round, halve, and
        # truncate to int.
        # NOTE(review): the rounding is applied before the division by 2 -
        # confirm that this binning is the intended one.
        data_y_cat = [
            (np.round(val - np.min(val)) / 2).astype(int) for val in data_y]
        metadata = {}
        for idx, env in enumerate(data_y_cat):
            metadata["env{0}".format(idx)] = env
            labels = np.unique(env)
            # Pin the dtype: pandas >= 2.0 returns booleans by default, which
            # would write True/False instead of 0/1 in the TSV file.
            env_cat = pd.get_dummies(env, dtype=np.uint8).values
            for key, val in zip(labels, env_cat.T):
                metadata["env{0}_cat{1}".format(idx, key)] = val
        cat_df = pd.DataFrame.from_dict(metadata)

        df.to_csv(desc_path, sep="\t", index=False)
        cat_df.to_csv(desc_categorical_path, sep="\t", index=False)

    desc_path = desc_categorical_path if to_categorical else desc_path
    return Item(input_path=input_path, output_path=None,
                metadata_path=desc_path, labels=None)
# Follow us
# © 2019, pynet developers.
# Inspired by AZMIND template.