Menu

Helper Module for Deep Learning.

Source code for pynet.datasets.euaims

# -*- coding: utf-8 -*-
########################################################################
# NSAp - Copyright (C) CEA, 2021
# Distributed under the terms of the CeCILL-B license, as published by
# the CEA-CNRS-INRIA. Refer to the LICENSE file or to
# http://www.cecill.info/licences/Licence_CeCILL-B_V1-en.html
# for details.
########################################################################

"""
Module provides functions to prepare different datasets from EUAIMS.
"""

# Imports
import os
import json
import time
import urllib
import shutil
import pickle
import requests
import logging
import numpy as np
from collections import namedtuple
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from pynet.datasets import Fetchers
from neurocombat_sklearn import CombatModel as fortin_combat
from nibabel.freesurfer.mghformat import load as surface_loader


# Global parameters
# Container returned by the fetchers that save their outputs on disk: paths
# of the train / test arrays and of the matching train / test metadata tables.
Item = namedtuple("Item", ["train_input_path", "test_input_path",
                           "train_metadata_path", "test_metadata_path"])
COHORT_NAME = "EUAIMS"
# Folder containing the input tables listed in FILES.
FOLDER = "/neurospin/brainomics/2020_deepint/data"
# Default folder in which the fetchers save their outputs.
SAVING_FOLDER = "/tmp/EUAIMS"
# Default input tables (tab separated files located in FOLDER).
FILES = {
    "stratification": os.path.join(FOLDER, "EUAIMS_stratification.tsv"),
    "rois_mapper": os.path.join(FOLDER, "EUAIMS_rois.tsv"),
    "surf_stratification": os.path.join(
        FOLDER, "EUAIMS_surf_stratification.tsv")
}

# Default keyword values of each fetcher, keyed by block name. Every "qc"
# entry maps a column name (without prefix) to a {relation: value} dict as
# expected by apply_qc.
DEFAULTS = {
    "clinical": {
        "test_size": 0.2, "seed": 42,
        "return_data": False, "z_score": True,
        "drop_cols": ["t1:site", "t1:ageyrs", "t1:sex", "t1:fsiq",
                      "t1:group", "t1:diagnosis", "mri", "t1:group:name",
                      "qc", "labels", "subgroups"],
        "qc": {"t1:fsiq": {"gte": 70},
               "mri": {"eq": 1},
               "qc": {"eq": "include"}}
    },
    "rois": {
        "test_size": 0.2, "seed": 42,
        "return_data": False, "z_score": True, "adjust_sites": True,
        "metrics": ["lgi:avg", "thick:avg", "surf:area"],
        "roi_types": ["cortical"],
        "residualize_by": {"continuous": ["t1:ageyrs", "t1:fsiq"],
                           "discrete": ["t1:sex"]},
        "qc": {"t1:fsiq": {"gte": 70},
               "mri": {"eq": 1},
               "qc": {"eq": "include"}}
    },
    "genetic": {
        "test_size": 0.2, "seed": 42,
        "return_data": False, "z_score": True, "scores": None,
        "qc": {"t1:fsiq": {"gte": 70},
               "mri": {"eq": 1},
               "qc": {"eq": "include"}}
    },
    "surface": {
        "test_size": 0.2, "seed": 42,
        "return_data": False, "z_score": True, "adjust_sites": True,
        "metrics": ["pial_lgi", "thickness"],
        "residualize_by": {"continuous": ["t1:ageyrs", "t1:fsiq"],
                           "discrete": ["t1:sex"]},
        "qc": {"t1:fsiq": {"gte": 70},
               "mri": {"eq": 1},
               "qc": {"eq": "include"}}
    },
    "multiblock": {
        "test_size": 0.2, "seed": 42,
        "blocks": ["clinical", "surface-lh", "surface-rh", "genetic"],
        "qc": {"t1:fsiq": {"gte": 70},
               "mri": {"eq": 1},
               "qc": {"eq": "include"}}
    }
}
# Logger shared with the rest of the pynet package.
logger = logging.getLogger("pynet")


def apply_qc(data, prefix, qc):
    """ Applies quality control to the data.

    Parameters
    ----------
    data: pandas DataFrame
        data for which we control the quality
    prefix: string
        prefix of the column names
    qc: dict
        quality control dict. Keys are the names (without prefix) of the
        columns to control on, and values are dicts mapping an order
        relationship ("gt", "lt", "gte", "lte" or "eq") to the value to
        compare with. Controls on columns that are not in the data are
        ignored.

    Returns
    -------
    data: pandas DataFrame
        rows of the data satisfying all the quality controls

    Raises
    ------
    ValueError
        if a relationship is not one of the supported ones.
    """
    relation_mapper = {
        "gt": lambda x, y: x > y,
        "lt": lambda x, y: x < y,
        "gte": lambda x, y: x >= y,
        "lte": lambda x, y: x <= y,
        "eq": lambda x, y: x == y,
    }
    # The mask must share the index of the data: a mask built on a fresh
    # 0..n-1 index gets misaligned as soon as the data were filtered or
    # re-indexed beforehand.
    idx_to_keep = pd.Series(True, index=data.index, dtype=bool)
    for name, controls in qc.items():
        column = "{}{}".format(prefix, name)
        for relation, value in controls.items():
            if relation not in relation_mapper:
                raise ValueError(
                    "The relationship {} provided is not a valid "
                    "one".format(relation))
            if column in data.columns:
                new_idx = relation_mapper[relation](data[column], value)
                idx_to_keep = idx_to_keep & new_idx
    return data[idx_to_keep]
def fetch_clinical_wrapper(datasetdir=SAVING_FOLDER, files=FILES,
                           cohort=COHORT_NAME, defaults=DEFAULTS['clinical']):
    """ Fetcher wrapper for clinical data

    Parameters
    ----------
    datasetdir: string, default SAVING_FOLDER
        path to the folder in which to save the data
    files: dict, default FILES
        contains the paths to the different files
    cohort: string, default COHORT_NAME,
        name of the cohort, only used to build the name of the fetcher
    defaults: dict, default DEFAULTS
        default values for the wrapped function

    Returns
    -------
    fetcher: function
        corresponding fetcher.
    """
    fetcher_name = "fetcher_clinical_{}".format(cohort)

    # @Fetchers.register
    def fetch_clinical(
            test_size=defaults["test_size"], seed=defaults["seed"],
            return_data=defaults["return_data"], z_score=defaults["z_score"],
            drop_cols=defaults["drop_cols"], qc=defaults["qc"]):
        """ Fetches and preprocesses clinical data

        The preprocessed data are cached in the dataset folder: when all
        the expected files are already there they are reused as is,
        whatever the other parameters. Delete them to force a new
        computation.

        Parameters
        ----------
        test_size: float, default 0.2
            proportion of the dataset to keep for testing. Preprocessing
            models will only be fitted on the training part and applied to
            the test set. You can specify not to use a testing set by
            setting it to 0
        seed: int, default 42
            random seed to split the data into train / test
        return_data: bool, default False
            If false, saves the data in the specified folder, and return the
            path. Otherwise, returns the preprocessed data and the
            corresponding subjects
        z_score: bool, default True
            whether or not to scale the data (with a RobustScaler fitted on
            the training set)
        drop_cols: list of string, see default
            names of the columns to drop before saving the data.
        qc: dict, see default
            keys are the name of the features the control on, values are the
            requirements on their values (see the function apply_qc)

        Returns
        -------
        item: namedtuple
            a named tuple containing 'train_input_path',
            'train_metadata_path', and 'test_input_path',
            'test_metadata_path' (None if test_size is 0), if return_data
            is False
        X_train: numpy array,
            Training data, if return_data is True
        X_test: numpy array,
            Test data, if return_data is True (None if test_size is 0)
        subj_train: numpy array,
            Training subjects, if return_data is True
        subj_test: numpy array,
            Test subjects, if return_data is True (None if test_size is 0)
        """
        clinical_prefix = "bloc-clinical_score-"
        subject_column_name = "participant_id"
        path = os.path.join(datasetdir, "clinical_X_train.npy")
        meta_path = os.path.join(datasetdir, "clinical_X_train.tsv")
        path_test = None
        meta_path_test = None
        expected_files = [path, meta_path]
        if test_size > 0:
            path_test = os.path.join(datasetdir, "clinical_X_test.npy")
            meta_path_test = os.path.join(datasetdir, "clinical_X_test.tsv")
            expected_files += [path_test, meta_path_test]

        # The cache is only usable when every requested file exists:
        # checking the train array alone would hand back missing test files
        # after a previous run without test set.
        if not all(os.path.isfile(_file) for _file in expected_files):
            os.makedirs(datasetdir, exist_ok=True)
            data = pd.read_csv(files["stratification"], sep="\t")
            clinical_cols = [subject_column_name]
            clinical_cols += [col for col in data.columns
                              if col.startswith(clinical_prefix)]
            data = data[clinical_cols]
            data_train = apply_qc(data, clinical_prefix, qc).sort_values(
                subject_column_name)
            data_train.columns = [elem.replace(clinical_prefix, "")
                                  for elem in data_train.columns]
            X_train = data_train.drop(columns=drop_cols)

            # Splits in train and test and removes nans
            X_test, subj_test = (None, None)
            if test_size > 0:
                X_train, X_test = train_test_split(
                    X_train, test_size=test_size, random_state=seed)
                na_idx_test = (X_test.isna().sum(1) == 0)
                X_test = X_test[na_idx_test]
                subj_test = X_test[subject_column_name].values
                X_test = X_test.drop(columns=[subject_column_name]).values
            na_idx_train = (X_train.isna().sum(1) == 0)
            X_train = X_train[na_idx_train]
            subj_train = X_train[subject_column_name].values
            X_train = X_train.drop(columns=[subject_column_name])
            cols = X_train.columns
            X_train = X_train.values

            # Standardizes and scales: the scaler is fitted on the training
            # set only and saved to be reusable
            if z_score:
                scaler = RobustScaler()
                X_train = scaler.fit_transform(X_train)
                _path = os.path.join(datasetdir, "clinical_scaler.pkl")
                with open(_path, "wb") as f:
                    pickle.dump(scaler, f)
                if test_size > 0:
                    X_test = scaler.transform(X_test)

            # Return data and subjects
            X_train_df = pd.DataFrame(data=X_train, columns=cols)
            X_train_df.insert(0, subject_column_name, subj_train)
            X_test_df = None
            if test_size > 0:
                X_test_df = pd.DataFrame(data=X_test, columns=cols)
                X_test_df.insert(0, subject_column_name, subj_test)

            # Saving
            np.save(path, X_train)
            X_train_df.to_csv(meta_path, index=False, sep="\t")
            if test_size > 0:
                np.save(path_test, X_test)
                X_test_df.to_csv(meta_path_test, index=False, sep="\t")

        if return_data:
            # Always reload from disk so that cached and freshly computed
            # data follow the same path
            X_train = np.load(path)
            subj_train = pd.read_csv(meta_path, sep="\t")[
                subject_column_name].values
            X_test, subj_test = (None, None)
            if test_size > 0:
                X_test = np.load(path_test)
                subj_test = pd.read_csv(meta_path_test, sep="\t")[
                    subject_column_name].values
            return X_train, X_test, subj_train, subj_test
        else:
            return Item(train_input_path=path, test_input_path=path_test,
                        train_metadata_path=meta_path,
                        test_metadata_path=meta_path_test)
    return fetch_clinical
def fetch_rois_wrapper(datasetdir=SAVING_FOLDER, files=FILES,
                       cohort=COHORT_NAME, site_column_name="t1:site",
                       defaults=DEFAULTS['rois']):
    """ Fetcher wrapper for rois data

    Parameters
    ----------
    datasetdir: string, default SAVING_FOLDER
        path to the folder in which to save the data
    files: dict, default FILES
        contains the paths to the different files
    cohort: string, default COHORT_NAME,
        name of the cohort, only used to build the name of the fetcher
    site_column_name: string, default "t1:site"
        name of the column containing the site of MRI acquisition
    defaults: dict, default DEFAULTS
        default values for the wrapped function

    Returns
    -------
    fetcher: function
        corresponding fetcher
    """
    fetcher_name = "fetcher_rois_{}".format(cohort)
    clinical_prefix = "bloc-clinical_score-"
    roi_prefix = "bloc-t1w_roi-"
    subject_column_name = "participant_id"

    def _covariates(frame, residualize_by, kind):
        """ Returns the covariates of the requested kind ("continuous" or
        "discrete"), or None when there is none to use.
        """
        names = (residualize_by or {}).get(kind) or []
        if len(names) == 0:
            return None
        return frame[["{}{}".format(clinical_prefix, name)
                      for name in names]]

    def _design_matrix(frame, residualize_by, encoder):
        """ Builds the regressors of the residualization: continuous
        covariates followed by the one hot encoded discrete covariates.
        """
        parts = []
        continuous = _covariates(frame, residualize_by, "continuous")
        if continuous is not None:
            parts.append(continuous.values)
        discrete = _covariates(frame, residualize_by, "discrete")
        if discrete is not None:
            encoded = encoder.transform(discrete)
            # The encoder outputs a sparse matrix by default
            if hasattr(encoded, "toarray"):
                encoded = encoded.toarray()
            parts.append(encoded)
        return np.concatenate(parts, axis=1)

    # @Fetchers.register
    def fetch_rois(
            metrics=defaults["metrics"], roi_types=defaults["roi_types"],
            test_size=defaults["test_size"], seed=defaults["seed"],
            return_data=defaults["return_data"], z_score=defaults["z_score"],
            adjust_sites=defaults["adjust_sites"],
            residualize_by=defaults["residualize_by"], qc=defaults["qc"]):
        """ Fetches and preprocesses roi data

        The preprocessed data are cached in the dataset folder: when all
        the expected files are already there they are reused as is,
        whatever the other parameters. Delete them to force a new
        computation.

        Parameters
        ----------
        metrics: list of strings, see default
            metrics to fetch
        roi_types: list of strings, default ["cortical"]
            type of rois to fetch. Must be one of "cortical", "subcortical"
            and "other"
        test_size: float, default 0.2
            proportion of the dataset to keep for testing. Preprocessing
            models will only be fitted on the training part and applied to
            the test set. You can specify not to use a testing set by
            setting it to 0
        seed: int, default 42
            random seed to split the data into train / test
        return_data: bool, default False
            If false, saves the data in the specified folder, and return the
            path. Otherwise, returns the preprocessed data and the
            corresponding subjects
        z_score: bool, default True
            whether or not to scale the data (with a RobustScaler fitted on
            the training set)
        adjust_sites: bool, default True
            whether or not to correct site effects via the Combat algorithm
        residualize_by: dict, see default
            variables to residualize the data. Two keys, "continuous" and
            "discrete", and the values are a list of the variable names.
            None or empty lists disable the residualization.
        qc: dict, see default
            keys are the name of the features the control on, values are the
            requirements on their values (see the function apply_qc)

        Returns
        -------
        item: namedtuple
            a named tuple containing 'train_input_path',
            'train_metadata_path', and 'test_input_path',
            'test_metadata_path' (None if test_size is 0), if return_data
            is False
        X_train: numpy array,
            Training data, if return_data is True
        X_test: numpy array,
            Test data, if return_data is True (None if test_size is 0)
        subj_train: numpy array,
            Training subjects, if return_data is True
        subj_test: numpy array,
            Test subjects, if return_data is True (None if test_size is 0)

        Raises
        ------
        ValueError
            if a roi type is not one of the supported ones.
        """
        path = os.path.join(datasetdir, "rois_X_train.npy")
        meta_path = os.path.join(datasetdir, "rois_X_train.tsv")
        path_test = None
        meta_path_test = None
        expected_files = [path, meta_path]
        if test_size > 0:
            path_test = os.path.join(datasetdir, "rois_X_test.npy")
            meta_path_test = os.path.join(datasetdir, "rois_X_test.tsv")
            expected_files += [path_test, meta_path_test]

        # The cache is only usable when every requested file exists
        if not all(os.path.isfile(_file) for _file in expected_files):
            os.makedirs(datasetdir, exist_ok=True)
            data = pd.read_csv(files["stratification"], sep="\t")
            roi_mapper = pd.read_csv(files["rois_mapper"], sep="\t")

            # ROI selection
            roi_label_range = pd.Series(
                False, index=roi_mapper.index, dtype=bool)
            for roi_type in roi_types:
                if roi_type == "cortical":
                    roi_label_range = roi_label_range | (
                        (roi_mapper["labels"] > 11000) &
                        (roi_mapper["labels"] < 13000))
                elif roi_type == "subcortical":
                    roi_label_range = roi_label_range | (
                        roi_mapper["labels"] > 13000)
                elif roi_type == "other":
                    roi_label_range = roi_label_range | (
                        roi_mapper["labels"] < 11000)
                else:
                    raise ValueError(
                        "Roi types must be either 'cortical', "
                        "'subcortical' or 'other'")
            roi_labels = roi_mapper.loc[roi_label_range, "labels"]

            # Feature selection
            features_list = []
            for column in data.columns:
                if column.startswith(roi_prefix):
                    roi = int(column.split(":")[1].split("_")[0])
                    metric = column.split("-")[-1]
                    if roi in roi_labels.values and metric in metrics:
                        features_list.append(column.replace(roi_prefix, ""))
            data_train = apply_qc(data, clinical_prefix, qc).sort_values(
                subject_column_name)
            data_train.columns = [elem.replace(roi_prefix, "")
                                  for elem in data_train.columns]
            X_train = data_train[features_list].copy()

            # Splits in train and test and removes nans
            X_test, data_test, subj_test = (None, None, None)
            if test_size > 0:
                X_train, X_test, data_train, data_test = train_test_split(
                    X_train, data_train, test_size=test_size,
                    random_state=seed)
                na_idx_test = (X_test.isna().sum(1) == 0)
                # Copies as the columns are overwritten by Combat below
                X_test = X_test[na_idx_test].copy()
                data_test = data_test[na_idx_test]
                subj_test = data_test[subject_column_name].values
            na_idx_train = (X_train.isna().sum(1) == 0)
            X_train = X_train[na_idx_train].copy()
            data_train = data_train[na_idx_train]
            subj_train = data_train[subject_column_name].values
            cols = X_train.columns

            # Correction for site effects, one Combat model per metric,
            # fitted on the training set only
            if adjust_sites:
                site_column = "{}{}".format(clinical_prefix, site_column_name)
                for metric in metrics:
                    adjuster = fortin_combat()
                    features = [feature for feature in features_list
                                if metric in feature]
                    X_train[features] = adjuster.fit_transform(
                        X_train[features],
                        data_train[[site_column]],
                        _covariates(data_train, residualize_by, "discrete"),
                        _covariates(data_train, residualize_by,
                                    "continuous"))
                    _path = os.path.join(
                        datasetdir, "rois_combat_{0}.pkl".format(metric))
                    with open(_path, "wb") as of:
                        pickle.dump(adjuster, of)
                    if test_size > 0:
                        X_test[features] = adjuster.transform(
                            X_test[features],
                            data_test[[site_column]],
                            _covariates(data_test, residualize_by,
                                        "discrete"),
                            _covariates(data_test, residualize_by,
                                        "continuous"))

            # Standardizes
            if z_score:
                scaler = RobustScaler()
                X_train = scaler.fit_transform(X_train)
                _path = os.path.join(datasetdir, "rois_scaler.pkl")
                with open(_path, "wb") as f:
                    pickle.dump(scaler, f)
                if test_size > 0:
                    X_test = scaler.transform(X_test)
            else:
                X_train = X_train.values
                if test_size > 0:
                    X_test = X_test.values

            # Residualizes, only if there is at least one covariate
            discrete_train = _covariates(
                data_train, residualize_by, "discrete")
            continuous_train = _covariates(
                data_train, residualize_by, "continuous")
            if discrete_train is not None or continuous_train is not None:
                # The encoder is fitted on the training set only so that the
                # train and test regressors share the same columns. Unseen
                # test categories are encoded as all zeros.
                encoder = None
                if discrete_train is not None:
                    encoder = OneHotEncoder(handle_unknown="ignore").fit(
                        discrete_train)
                regressor = LinearRegression()
                y_train = _design_matrix(data_train, residualize_by, encoder)
                regressor.fit(y_train, X_train)
                X_train = X_train - regressor.predict(y_train)
                _path = os.path.join(datasetdir, "rois_residualizer.pkl")
                with open(_path, "wb") as f:
                    pickle.dump(regressor, f)

                if test_size > 0:
                    y_test = _design_matrix(
                        data_test, residualize_by, encoder)
                    X_test = X_test - regressor.predict(y_test)

            # Return data and subjects
            X_train_df = pd.DataFrame(data=X_train, columns=cols)
            X_train_df.insert(0, subject_column_name, subj_train)
            X_test_df = None
            if test_size > 0:
                X_test_df = pd.DataFrame(data=X_test, columns=cols)
                X_test_df.insert(0, subject_column_name, subj_test)

            # Saving
            np.save(path, X_train)
            X_train_df.to_csv(meta_path, index=False, sep="\t")
            if test_size > 0:
                np.save(path_test, X_test)
                X_test_df.to_csv(meta_path_test, index=False, sep="\t")

        if return_data:
            X_train = np.load(path)
            subj_train = pd.read_csv(meta_path, sep="\t")[
                subject_column_name].values
            X_test, subj_test = (None, None)
            if test_size > 0:
                X_test = np.load(path_test)
                subj_test = pd.read_csv(meta_path_test, sep="\t")[
                    subject_column_name].values
            return X_train, X_test, subj_train, subj_test
        else:
            return Item(train_input_path=path, test_input_path=path_test,
                        train_metadata_path=meta_path,
                        test_metadata_path=meta_path_test)
    return fetch_rois
def fetch_surface_wrapper(hemisphere, datasetdir=SAVING_FOLDER,
                          files=FILES, cohort=COHORT_NAME,
                          site_column_name="t1:site",
                          defaults=DEFAULTS["surface"]):
    """ Fetcher wrapper for surface data

    Parameters
    ----------
    hemisphere: string
        name of the hemisphere data fetcher, one of "rh" or "lh"
    datasetdir: string, default SAVING_FOLDER
        path to the folder in which to save the data
    files: dict, default FILES
        contains the paths to the different files. The surface table is
        read from the "clinical_surface" key if present, otherwise from
        the "surf_stratification" key.
    cohort: string, default COHORT_NAME,
        name of the cohort, only used to build the name of the fetcher
    site_column_name: string, default "t1:site"
        name of the column containing the site of MRI acquisition
    defaults: dict, default DEFAULTS
        default values for the wrapped function

    Returns
    -------
    fetcher: function
        corresponding fetcher
    """
    assert(hemisphere in ["rh", "lh"])
    fetcher_name = "fetcher_surface_{}_{}".format(hemisphere, cohort)
    clinical_prefix = "bloc-clinical_score-"

    def _covariates(frame, residualize_by, kind):
        """ Returns the covariates of the requested kind ("continuous" or
        "discrete"), or None when there is none to use.
        """
        names = (residualize_by or {}).get(kind) or []
        if len(names) == 0:
            return None
        return frame[["{}{}".format(clinical_prefix, name)
                      for name in names]]

    def _design_matrix(frame, residualize_by, encoder):
        """ Builds the regressors of the residualization: continuous
        covariates followed by the one hot encoded discrete covariates.
        """
        parts = []
        continuous = _covariates(frame, residualize_by, "continuous")
        if continuous is not None:
            parts.append(continuous.values)
        discrete = _covariates(frame, residualize_by, "discrete")
        if discrete is not None:
            encoded = encoder.transform(discrete)
            # The encoder outputs a sparse matrix by default
            if hasattr(encoded, "toarray"):
                encoded = encoded.toarray()
            parts.append(encoded)
        return np.concatenate(parts, axis=1)

    def _load_surface(surface_path):
        """ Loads the array stored in a surface file. Reading 'dataobj' is
        the replacement nibabel documents for the removed 'get_data'.
        """
        return np.asanyarray(surface_loader(surface_path).dataobj)

    # @Fetchers.register
    def fetch_surface(
            metrics=defaults["metrics"],
            test_size=defaults["test_size"], seed=defaults["seed"],
            return_data=defaults["return_data"],
            z_score=defaults["z_score"],
            adjust_sites=defaults["adjust_sites"],
            residualize_by=defaults["residualize_by"], qc=defaults["qc"]):
        """ Fetches and preprocesses surface data

        Parameters
        ----------
        metrics: list of strings, see defaults
            metrics to fetch
        test_size: float, default 0.2
            proportion of the dataset to keep for testing. Preprocessing
            models will only be fitted on the training part and applied to
            the test set. You can specify not to use a testing set by
            setting it to 0
        seed: int, default 42
            random seed to split the data into train / test
        return_data: bool, default False
            If false, saves the data in the specified folder, and return the
            path. Otherwise, returns the preprocessed data and the
            corresponding subjects
        z_score: bool, default True
            whether or not to scale the data (with a RobustScaler fitted on
            the training set)
        adjust_sites: bool, default True
            whether or not to correct site effects via the Combat algorithm
        residualize_by: dict, see default
            variables to residualize the data. Two keys, "continuous" and
            "discrete", and the values are a list of the variable names.
            None or empty lists disable the residualization.
        qc: dict, see default
            keys are the name of the features the control on, values are the
            requirements on their values (see the function apply_qc)

        Returns
        -------
        path: string
            path to the saved training array, if return_data is False and
            test_size is 0
        path, path_test: tuple of strings
            paths to the saved training and test arrays, if return_data is
            False and test_size > 0
        X_train: numpy array,
            Training data, if return_data is True
        X_test: numpy array,
            Test data, if return_data is True (None if test_size is 0)
        subj_train: numpy array,
            Training subjects, if return_data is True
        subj_test: numpy array,
            Test subjects, if return_data is True (None if test_size is 0)

        Raises
        ------
        ValueError
            if no surface file is available for the requested metrics.
        """
        os.makedirs(datasetdir, exist_ok=True)
        surf_prefix = "bloc-t1w_hemi-{}_metric".format(hemisphere)

        # The default FILES registers the table as "surf_stratification".
        # "clinical_surface" is still honoured for custom dicts.
        if "clinical_surface" in files:
            table_key = "clinical_surface"
        else:
            table_key = "surf_stratification"
        data = pd.read_csv(files[table_key], sep="\t").drop(
            columns=["bloc-t1w_hemi-lh_metric-area",
                     "bloc-t1w_hemi-rh_metric-area"])

        # Feature selection
        features_list = []
        for metric in metrics:
            for column in data.columns:
                if column.startswith(surf_prefix):
                    m = column.split('-')[-1]
                    if m == metric:
                        features_list.append(column)

        data_train = apply_qc(data, clinical_prefix, qc).sort_values(
            "participant_id")

        # Loads surface data. Missing surfaces are left as NaN so that the
        # corresponding subjects are removed below instead of being
        # silently kept with all-zero data.
        first_path = next(
            (surface_path for feature in features_list
             for surface_path in data_train[feature]
             if pd.notnull(surface_path)), None)
        if first_path is None:
            raise ValueError(
                "No surface file available for the requested metrics.")
        n_vertices = len(_load_surface(first_path))
        X_train = np.full(
            (len(data_train), n_vertices, len(features_list)), np.nan)
        for i in range(len(data_train)):
            for j, feature in enumerate(features_list):
                surface_path = data_train[feature].iloc[i]
                if pd.notnull(surface_path):
                    X_train[i, :, j] = _load_surface(surface_path).squeeze()

        # Splits in train and test and removes nans
        X_test, data_test, subj_test = (None, None, None)
        if test_size > 0:
            X_train, X_test, data_train, data_test = train_test_split(
                X_train, data_train, test_size=test_size, random_state=seed)

            na_idx_test = (np.isnan(X_test).sum((1, 2)) == 0)
            X_test = X_test[na_idx_test]
            data_test = data_test[na_idx_test]
            subj_test = data_test["participant_id"].values
        na_idx_train = (np.isnan(X_train).sum((1, 2)) == 0)
        X_train = X_train[na_idx_train]
        data_train = data_train[na_idx_train]
        subj_train = data_train["participant_id"].values

        # The regressors of the residualization do not depend on the
        # feature: they are built once, with an encoder fitted on the
        # training set only so that train and test share the same columns.
        discrete_train = _covariates(data_train, residualize_by, "discrete")
        continuous_train = _covariates(
            data_train, residualize_by, "continuous")
        residualize = (
            discrete_train is not None or continuous_train is not None)
        if residualize:
            encoder = None
            if discrete_train is not None:
                encoder = OneHotEncoder(handle_unknown="ignore").fit(
                    discrete_train)
            y_train = _design_matrix(data_train, residualize_by, encoder)
            if test_size > 0:
                y_test = _design_matrix(data_test, residualize_by, encoder)
        site_column = "{}{}".format(clinical_prefix, site_column_name)

        # Applies feature-wise preprocessing
        for i, feature in enumerate(features_list):
            # Correction for site effects, restricted to the vertices with
            # at least one strictly positive training value
            if adjust_sites:
                non_zeros_idx = (X_train[:, :, i] > 0).sum(0) >= 1
                adjuster = fortin_combat()
                X_train[:, non_zeros_idx, i] = adjuster.fit_transform(
                    X_train[:, non_zeros_idx, i],
                    data_train[[site_column]],
                    discrete_train,
                    continuous_train)
                path = os.path.join(
                    datasetdir,
                    "surface_{}_combat_feature{}.pkl".format(hemisphere, i))
                with open(path, "wb") as f:
                    pickle.dump(adjuster, f)

                if test_size > 0:
                    X_test[:, non_zeros_idx, i] = adjuster.transform(
                        X_test[:, non_zeros_idx, i],
                        data_test[[site_column]],
                        _covariates(data_test, residualize_by, "discrete"),
                        _covariates(data_test, residualize_by,
                                    "continuous"))

            # Standardizes and scales
            if z_score:
                scaler = RobustScaler()
                X_train[:, :, i] = scaler.fit_transform(X_train[:, :, i])
                path = os.path.join(
                    datasetdir,
                    "surface_{}_scaler_feature{}.pkl".format(hemisphere, i))
                with open(path, "wb") as f:
                    pickle.dump(scaler, f)
                if test_size > 0:
                    X_test[:, :, i] = scaler.transform(X_test[:, :, i])

            # Residualizes
            if residualize:
                regressor = LinearRegression()
                regressor.fit(y_train, X_train[:, :, i])
                X_train[:, :, i] = X_train[:, :, i] - regressor.predict(
                    y_train)
                path = os.path.join(
                    datasetdir,
                    "surface_{}_residualizer_feature{}.pkl".format(
                        hemisphere, i))
                with open(path, "wb") as f:
                    pickle.dump(regressor, f)

                if test_size > 0:
                    X_test[:, :, i] = X_test[:, :, i] - regressor.predict(
                        y_test)

        # Returns data and subjects. Always four values, like the other
        # fetchers, so that the multiblock fetcher can unpack the result
        # when no test set is requested.
        if return_data:
            return X_train, X_test, subj_train, subj_test

        # Saving
        path = os.path.join(
            datasetdir, "surface_{}_X_train.npy".format(hemisphere))
        np.save(path, X_train)
        if test_size > 0:
            path_test = os.path.join(
                datasetdir, "surface_{}_X_test.npy".format(hemisphere))
            np.save(path_test, X_test)
            return path, path_test
        return path
    return fetch_surface
def fetch_genetic_wrapper(datasetdir=SAVING_FOLDER, files=FILES,
                          cohort=COHORT_NAME, defaults=DEFAULTS['genetic']):
    """ Fetcher wrapper for genetic data

    Parameters
    ----------
    datasetdir: string, default SAVING_FOLDER
        path to the folder in which to save the data
    files: dict, default FILES
        contains the paths to the different files
    cohort: string, default COHORT_NAME,
        name of the cohort, only used to build the name of the fetcher
    defaults: dict, default DEFAULTS
        default values for the wrapped function

    Returns
    -------
    fetcher: function
        corresponding fetcher
    """
    fetcher_name = "fetcher_genetic_{}".format(cohort)

    # @Fetchers.register
    def fetch_genetic(
            scores=defaults["scores"], test_size=defaults["test_size"],
            seed=defaults["seed"], return_data=defaults["return_data"],
            z_score=defaults["z_score"], qc=defaults["qc"]):
        """ Fetches and preprocesses genetic data

        The preprocessed data are cached in the dataset folder: when all
        the expected files are already there they are reused as is,
        whatever the other parameters. Delete them to force a new
        computation.

        Parameters
        ----------
        scores: list of strings, see defaults
            scores to fetch, None mean it fetches all the available scores
        test_size: float, see defaults
            proportion of the dataset to keep for testing. Preprocessing
            models will only be fitted on the training part and applied to
            the test set. You can specify not to use a testing set by
            setting it to 0
        seed: int, see default
            random seed to split the data into train / test
        return_data: bool, default False
            If false, saves the data in the specified folder, and return the
            path. Otherwise, returns the preprocessed data and the
            corresponding subjects
        z_score: bool, see defaults
            whether or not to scale the data (with a RobustScaler fitted on
            the training set)
        qc: dict, see defaults
            keys are the name of the features the control on, values are the
            requirements on their values (see the function apply_qc)

        Returns
        -------
        item: namedtuple
            a named tuple containing 'train_input_path',
            'train_metadata_path', and 'test_input_path',
            'test_metadata_path' (None if test_size is 0), if return_data
            is False
        X_train: numpy array
            Training data, if return_data is True
        X_test: numpy array
            Test data, if return_data is True (None if test_size is 0)
        subj_train: numpy array
            Training subjects, if return_data is True
        subj_test: numpy array
            Test subjects, if return_data is True (None if test_size is 0)
        """
        clinical_prefix = "bloc-clinical_score-"
        genetic_prefix = "bloc-genetic_score-"
        subject_column_name = "participant_id"
        path = os.path.join(datasetdir, "genetic_X_train.npy")
        meta_path = os.path.join(datasetdir, "genetic_X_train.tsv")
        path_test = None
        meta_path_test = None
        expected_files = [path, meta_path]
        if test_size > 0:
            path_test = os.path.join(datasetdir, "genetic_X_test.npy")
            meta_path_test = os.path.join(datasetdir, "genetic_X_test.tsv")
            expected_files += [path_test, meta_path_test]

        # The cache is only usable when every requested file exists:
        # checking the train array alone would hand back missing test files
        # after a previous run without test set.
        if not all(os.path.isfile(_file) for _file in expected_files):
            os.makedirs(datasetdir, exist_ok=True)
            data = pd.read_csv(files["stratification"], sep="\t")

            # Feature selection: all the genetic columns, or only the
            # requested scores
            features_list = [
                column.replace(genetic_prefix, "")
                for column in data.columns
                if column.startswith(genetic_prefix)
                and (scores is None or column.split("-")[-1] in scores)]
            data_train = apply_qc(data, clinical_prefix, qc).sort_values(
                subject_column_name)
            data_train.columns = [elem.replace(genetic_prefix, "")
                                  for elem in data_train.columns]
            X_train = data_train[features_list].copy()

            # Splits in train and test and removes nans
            if test_size > 0:
                X_train, X_test, data_train, data_test = train_test_split(
                    X_train, data_train, test_size=test_size,
                    random_state=seed)
                na_idx_test = (X_test.isna().sum(1) == 0)
                X_test = X_test[na_idx_test]
                data_test = data_test[na_idx_test]
                subj_test = data_test[subject_column_name].values
            na_idx_train = (X_train.isna().sum(1) == 0)
            X_train = X_train[na_idx_train]
            data_train = data_train[na_idx_train]
            subj_train = data_train[subject_column_name].values
            cols = X_train.columns

            # Standardizes and scales: the scaler is fitted on the training
            # set only and saved to be reusable
            if z_score:
                scaler = RobustScaler()
                X_train = scaler.fit_transform(X_train)
                _path = os.path.join(datasetdir, "genetic_scaler.pkl")
                with open(_path, "wb") as f:
                    pickle.dump(scaler, f)
                if test_size > 0:
                    X_test = scaler.transform(X_test)
            else:
                X_train = X_train.values
                if test_size > 0:
                    X_test = X_test.values

            # Return data and subjects
            X_train_df = pd.DataFrame(data=X_train, columns=cols)
            X_train_df.insert(0, subject_column_name, subj_train)
            X_test_df = None
            if test_size > 0:
                X_test_df = pd.DataFrame(data=X_test, columns=cols)
                X_test_df.insert(0, subject_column_name, subj_test)

            # Saving
            np.save(path, X_train)
            X_train_df.to_csv(meta_path, index=False, sep="\t")
            if test_size > 0:
                np.save(path_test, X_test)
                X_test_df.to_csv(meta_path_test, index=False, sep="\t")

        if return_data:
            X_train = np.load(path)
            subj_train = pd.read_csv(meta_path, sep="\t")[
                subject_column_name].values
            X_test, subj_test = (None, None)
            if test_size > 0:
                X_test = np.load(path_test)
                subj_test = pd.read_csv(meta_path_test, sep="\t")[
                    subject_column_name].values
            return X_train, X_test, subj_train, subj_test
        else:
            return Item(train_input_path=path, test_input_path=path_test,
                        train_metadata_path=meta_path,
                        test_metadata_path=meta_path_test)
    return fetch_genetic
def make_fetchers(datasetdir=SAVING_FOLDER):
    """ Builds one fetcher per block of data.

    Parameters
    ----------
    datasetdir: string, default SAVING_FOLDER
        path to the folder in which to save the data, forwarded to every
        fetcher wrapper

    Returns
    -------
    fetchers: dict
        the fetchers, with keys "clinical", "rois", "surface-rh",
        "surface-lh" and "genetic"
    """
    fetchers = {}
    fetchers["clinical"] = fetch_clinical_wrapper(datasetdir=datasetdir)
    fetchers["rois"] = fetch_rois_wrapper(datasetdir=datasetdir)
    # One surface fetcher per hemisphere
    for hemi in ("rh", "lh"):
        fetchers["surface-{}".format(hemi)] = fetch_surface_wrapper(
            hemisphere=hemi, datasetdir=datasetdir)
    fetchers["genetic"] = fetch_genetic_wrapper(datasetdir=datasetdir)
    return fetchers
def fetch_multiblock_wrapper(datasetdir=SAVING_FOLDER, files=FILES,
                             cohort=COHORT_NAME,
                             subject_column_name="subjects",
                             defaults=DEFAULTS["multiblock"],
                             make_fetchers_func=make_fetchers):
    """ Fetcher wrapper for multiblock data

    Parameters
    ----------
    datasetdir: string, default SAVING_FOLDER
        path to the folder in which to save the data
    files: dict, default FILES
        contains the paths to the different files
    cohort: string, default COHORT_NAME,
        name of the cohort, only used to build the name of the fetcher
    subject_column_name: string, default "subjects"
        name of the metadata column containing the subjects id.
        NOTE(review): the metadata are restricted to the "participant_id",
        "labels" and "subgroups" columns before this column is read, so
        any other value (including the default) raises a KeyError;
        fetch_multiblock_euaims passes "participant_id".
    defaults: dict, default DEFAULTS
        default values for the wrapped function
    make_fetchers_func: function, default make_fetchers
        function to build the fetchers from their wrappers. Must return
        a dict containing as keys the name of the channels, and values the
        corresponding fetcher

    Returns
    -------
    fetcher: function
        corresponding fetcher
    """
    fetcher_name = "fetcher_multiblock_{}".format(cohort)
    FETCHERS = make_fetchers_func(datasetdir)
    # Arguments imposed to every block fetcher
    imposed_arguments = ("qc", "test_size", "seed", "return_data")

    def _align_blocks(arrays, subjects):
        """ Restricts, in place, every block to the subjects available in
        all the blocks, sorted the same way, and returns these subjects.
        Subject ids are expected to be unique within a block.
        """
        common_subjects = sorted(set.intersection(
            *[set(np.asarray(block_subjects).tolist())
              for block_subjects in subjects.values()]))
        for block, block_subjects in subjects.items():
            assert(len(block_subjects) == len(arrays[block]))
            position = {
                subject: idx for idx, subject
                in enumerate(np.asarray(block_subjects).tolist())}
            arrays[block] = arrays[block][
                [position[subject] for subject in common_subjects]]
        return common_subjects

    def _select_metadata(metadata, common_subjects):
        """ Returns the metadata of the given subjects, in the same order.
        """
        rank = {subject: idx for idx, subject in enumerate(common_subjects)}
        selected = metadata[
            metadata[subject_column_name].isin(common_subjects)]
        order = np.argsort(
            selected[subject_column_name].map(rank).values, kind="stable")
        return selected.iloc[order]

    # @Fetchers.register
    def fetch_multiblock(
            blocks=defaults["blocks"],
            test_size=defaults["test_size"], seed=defaults["seed"],
            qc=defaults["qc"],
            **kwargs):
        """ Fetches and preprocesses multi block data

        The outputs are cached in the dataset folder: when all the expected
        files are already there they are reused as is, whatever the other
        parameters. Delete them to force a new computation.

        Parameters
        ----------
        blocks: list of strings, see default
            blocks of data to fetch, all must be in the key list of FETCHERS
        test_size: float, default 0.2
            proportion of the dataset to keep for testing. Preprocessing
            models will only be fitted on the training part and applied to
            the test set. You can specify not to use a testing set by
            setting it to 0
        seed: int, default 42
            random seed to split the data into train / test
        qc: dict, see default
            keys are the name of the features the control on, values are the
            requirements on their values (see the function apply_qc)
        kwargs: dict
            additional arguments to be passed to each fetcher individually.
            Keys are the name of the fetchers, and values are a dictionary
            containing arguments and the values for this fetcher. The "qc",
            "test_size", "seed" and "return_data" arguments are ignored as
            they are imposed to all the blocks.

        Returns
        -------
        item: namedtuple
            a named tuple containing 'train_input_path',
            'train_metadata_path', and 'test_input_path',
            'test_metadata_path' (None if test_size is 0). The rows of the
            arrays of all the blocks and of the metadata table are
            aligned.
        """
        path = os.path.join(datasetdir, "multiblock_X_train.npz")
        metadata_path = os.path.join(datasetdir, "metadata_train.tsv")
        path_test = None
        metadata_path_test = None
        expected_files = [path, metadata_path]
        if test_size > 0:
            path_test = os.path.join(datasetdir, "multiblock_X_test.npz")
            metadata_path_test = os.path.join(
                datasetdir, "metadata_test.tsv")
            expected_files += [path_test, metadata_path_test]

        # The cache is only usable when every requested file exists
        if not all(os.path.isfile(_file) for _file in expected_files):
            os.makedirs(datasetdir, exist_ok=True)
            X_train = {}
            subj_train = {}
            X_test = {}
            subj_test = {}
            for block in blocks:
                assert block in FETCHERS.keys()
                # Impose to have the same qc steps and splitting train/test
                # over all the blocks to have the same subjects. A filtered
                # copy is built: the dict of the caller is left untouched
                # and is not modified while being iterated.
                local_kwargs = {
                    key: value
                    for key, value in kwargs.get(block, {}).items()
                    if key not in imposed_arguments}

                new_X_train, new_X_test, new_subj_train, new_subj_test = \
                    FETCHERS[block](
                        qc=qc, test_size=test_size, seed=seed,
                        return_data=True, **local_kwargs)
                if test_size > 0:
                    X_test[block] = new_X_test
                    subj_test[block] = new_subj_test
                X_train[block] = new_X_train
                subj_train[block] = new_subj_train

            # Remove subjects that arent in all the channels, and sort the
            # remaining ones the same way in every channel
            common_subjects_train = _align_blocks(X_train, subj_train)
            if test_size > 0:
                common_subjects_test = _align_blocks(X_test, subj_test)

            # Loads metadata
            clinical_prefix = "bloc-clinical_score-"
            metadata_cols = ["participant_id", "labels", "subgroups"]
            metadata = pd.read_csv(files["stratification"], sep="\t")
            clinical_cols = ["participant_id"]
            clinical_cols += [col for col in metadata.columns
                              if col.startswith(clinical_prefix)]
            metadata = metadata[clinical_cols]
            metadata.columns = [elem.replace(clinical_prefix, "")
                                for elem in metadata.columns]
            metadata = metadata[metadata_cols]
            # The metadata rows follow the order of the rows of the arrays
            metadata_train = _select_metadata(
                metadata, common_subjects_train)
            if test_size > 0:
                metadata_test = _select_metadata(
                    metadata, common_subjects_test)

            # Saving
            np.savez(path, **X_train)
            metadata_train.to_csv(metadata_path, index=False, sep="\t")
            if test_size > 0:
                np.savez(path_test, **X_test)
                metadata_test.to_csv(metadata_path_test, index=False,
                                     sep="\t")
        return Item(train_input_path=path, test_input_path=path_test,
                    train_metadata_path=metadata_path,
                    test_metadata_path=metadata_path_test)
    return fetch_multiblock
# Fetcher wrappers by kind of data. The surface wrapper additionally
# requires the name of the hemisphere as first argument.
WRAPPERS = { "clinical": fetch_clinical_wrapper, "rois": fetch_rois_wrapper, "genetic": fetch_genetic_wrapper, "surface": fetch_surface_wrapper, "multiblock": fetch_multiblock_wrapper, }
def fetch_multiblock_euaims(datasetdir=SAVING_FOLDER, fetchers=make_fetchers,
                            surface=False):
    """ Fetches the EUAIMS multiblock dataset with the default parameters.

    Parameters
    ----------
    datasetdir: string, default SAVING_FOLDER
        path to the folder in which to save the data
    fetchers: function, default make_fetchers
        function building the block fetchers from the dataset folder. Must
        return a dict containing as keys the name of the blocks, and values
        the corresponding fetcher
    surface: bool, default False
        if True the blocks are "clinical", "surface-lh", "surface-rh" and
        "genetic", otherwise "clinical", "rois" and "genetic"

    Returns
    -------
    item: namedtuple
        the output of the multiblock fetcher
    """
    if surface:
        blocks = ["clinical", "surface-lh", "surface-rh", "genetic"]
    else:
        blocks = ["clinical", "rois", "genetic"]
    # Works on a copy: the module level DEFAULTS must not be altered by a
    # call, otherwise the selection would leak to later users of DEFAULTS.
    defaults = dict(DEFAULTS["multiblock"], blocks=blocks)
    return WRAPPERS["multiblock"](
        datasetdir=datasetdir, files=FILES, cohort=COHORT_NAME,
        subject_column_name="participant_id",
        defaults=defaults,
        make_fetchers_func=fetchers)()
def inverse_normalization(data, scalers):
    """ De-normalize a dataset.

    Parameters
    ----------
    data: object
        data passed to the 'inverse_transform' method of the first scaler
    scalers: list of strings
        paths to the pickled scalers, applied one after the other in the
        given order. NOTE: unpickling executes code stored in the file,
        only pass trusted files.

    Returns
    -------
    data: object
        output of the last inverse transformation, or the input data if
        no scaler is given
    """
    restored = data
    for location in scalers:
        with open(location, "rb") as stream:
            fitted = pickle.load(stream)
        restored = fitted.inverse_transform(restored)
    return restored

Follow us

© 2019, pynet developers.
Inspired by AZMIND template.