Helper Module for Deep Learning.

Source code for pynet.datasets.euaims

# -*- coding: utf-8 -*-
########################################################################
# NSAp - Copyright (C) CEA, 2021
# Distributed under the terms of the CeCILL-B license, as published by
# the CEA-CNRS-INRIA. Refer to the LICENSE file or to
# http://www.cecill.info/licences/Licence_CeCILL-B_V1-en.html
# for details.
########################################################################

"""
Module provides functions to prepare different datasets from EUAIMS.
"""

# Imports
import os
import json
import time
import urllib
import shutil
import pickle
import requests
import logging
import numpy as np
from collections import namedtuple
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from pynet.datasets import Fetchers
from neurocombat_sklearn import CombatModel as fortin_combat
from nibabel.freesurfer.mghformat import load as surface_loader


# Global parameters
# Container returned by the fetchers when they hand back file paths.
Item = namedtuple("Item", [
    "train_input_path",
    "test_input_path",
    "train_metadata_path",
    "test_metadata_path",
])

COHORT_NAME = "EUAIMS"
FOLDER = "/neurospin/brainomics/2020_deepint/data"
SAVING_FOLDER = "/tmp/EUAIMS"

# Input tables, all located in FOLDER.
FILES = {}
FILES["stratification"] = os.path.join(FOLDER, "EUAIMS_stratification.tsv")
FILES["rois_mapper"] = os.path.join(FOLDER, "EUAIMS_rois.tsv")
FILES["surf_stratification"] = os.path.join(
    FOLDER, "EUAIMS_surf_stratification.tsv")

# Default arguments of each fetcher, keyed by data block. Every block
# owns its own "qc" dictionary: nothing is shared between the entries.
DEFAULTS = {}
DEFAULTS["clinical"] = {
    "test_size": 0.2,
    "seed": 42,
    "return_data": False,
    "z_score": True,
    "drop_cols": [
        "t1:site", "t1:ageyrs", "t1:sex", "t1:fsiq", "t1:group",
        "t1:diagnosis", "mri", "t1:group:name", "qc", "labels",
        "subgroups",
    ],
    "qc": {
        "t1:fsiq": {"gte": 70},
        "mri": {"eq": 1},
        "qc": {"eq": "include"},
    },
}
DEFAULTS["rois"] = {
    "test_size": 0.2,
    "seed": 42,
    "return_data": False,
    "z_score": True,
    "adjust_sites": True,
    "metrics": ["lgi:avg", "thick:avg", "surf:area"],
    "roi_types": ["cortical"],
    "residualize_by": {
        "continuous": ["t1:ageyrs", "t1:fsiq"],
        "discrete": ["t1:sex"],
    },
    "qc": {
        "t1:fsiq": {"gte": 70},
        "mri": {"eq": 1},
        "qc": {"eq": "include"},
    },
}
DEFAULTS["genetic"] = {
    "test_size": 0.2,
    "seed": 42,
    "return_data": False,
    "z_score": True,
    "scores": None,
    "qc": {
        "t1:fsiq": {"gte": 70},
        "mri": {"eq": 1},
        "qc": {"eq": "include"},
    },
}
DEFAULTS["surface"] = {
    "test_size": 0.2,
    "seed": 42,
    "return_data": False,
    "z_score": True,
    "adjust_sites": True,
    "metrics": ["pial_lgi", "thickness"],
    "residualize_by": {
        "continuous": ["t1:ageyrs", "t1:fsiq"],
        "discrete": ["t1:sex"],
    },
    "qc": {
        "t1:fsiq": {"gte": 70},
        "mri": {"eq": 1},
        "qc": {"eq": "include"},
    },
}
DEFAULTS["multiblock"] = {
    "test_size": 0.2,
    "seed": 42,
    "blocks": ["clinical", "surface-lh", "surface-rh", "genetic"],
    "qc": {
        "t1:fsiq": {"gte": 70},
        "mri": {"eq": 1},
        "qc": {"eq": "include"},
    },
}

logger = logging.getLogger("pynet")


def apply_qc(data, prefix, qc):
    """ Applies quality control to the data.

    Rows are kept only if they satisfy every requested relation. A control
    whose column (``prefix + name``) is absent from ``data`` is ignored.

    Parameters
    ----------
    data: pandas DataFrame
        data for which we control the quality
    prefix: string
        prefix of the column names
    qc: dict
        quality control dict. keys are the name of the columns
        to control on (without the prefix), and values are dicts mapping an
        order relationship ("gt", "lt", "gte", "lte" or "eq") to the value
        to compare with

    Returns
    -------
    data: pandas DataFrame
        selected data by the quality control

    Raises
    ------
    ValueError
        if a relationship is not one of the supported ones
    """
    relation_mapper = {
        "gt": lambda x, y: x > y,
        "lt": lambda x, y: x < y,
        "gte": lambda x, y: x >= y,
        "lte": lambda x, y: x <= y,
        "eq": lambda x, y: x == y,
    }

    # The mask must share the index of ``data``: a mask built on a fresh
    # RangeIndex is misaligned as soon as ``data`` was filtered, sorted or
    # carries custom labels.
    idx_to_keep = pd.Series(True, index=data.index, dtype=bool)

    for name, controls in qc.items():
        column = "{}{}".format(prefix, name)
        for relation, value in controls.items():
            if relation not in relation_mapper:
                raise ValueError(
                    "The relationship {} provided is not a valid one".format(
                        relation))
            if column in data.columns:
                new_idx = relation_mapper[relation](data[column], value)
                idx_to_keep = idx_to_keep & new_idx
    return data[idx_to_keep]


def fetch_clinical_wrapper(datasetdir=SAVING_FOLDER, files=FILES,
                           cohort=COHORT_NAME, defaults=DEFAULTS['clinical']):
    """ Fetcher wrapper for clinical data

    Builds a ``fetch_clinical`` function bound to the given output folder,
    input files and default arguments.

    Parameters
    ----------
    datasetdir: string, default SAVING_FOLDER
        path to the folder in which to save the data
    files: dict, default FILES
        contains the paths to the different files. The "stratification"
        entry is the table read by the fetcher
    cohort: string, default COHORT_NAME,
        name of the cohort. Kept for interface compatibility: it was only
        used to name the fetcher for the ``Fetchers.register`` decorator,
        which is commented out
    defaults: dict, default DEFAULTS['clinical']
        default values for the wrapped function

    Returns
    -------
    fetcher: function
        corresponding fetcher.

    """

    # @Fetchers.register
    def fetch_clinical(
            test_size=defaults["test_size"], seed=defaults["seed"],
            return_data=defaults["return_data"], z_score=defaults["z_score"],
            drop_cols=defaults["drop_cols"], qc=defaults["qc"]):
        """ Fetches and preprocesses clinical data

        The preprocessed arrays are cached in ``datasetdir``. The cache is
        keyed on file presence only, not on the arguments: delete the files
        to force a new preprocessing with different arguments.

        Parameters
        ----------
        test_size: float, default 0.2
            proportion of the dataset to keep for testing. Preprocessing models
            will only be fitted on the training part and applied to the test
            set. You can specify not to use a testing set by setting it to 0
        seed: int, default 42
            random seed to split the data into train / test
        return_data: bool, default False
            If false, saves the data in the specified folder, and return the
            path. Otherwise, returns the preprocessed data and the
            corresponding subjects
        z_score: bool, default True
            wether or not to scale the data. The scaling is done with a
            sklearn RobustScaler fitted on the training set
        drop_cols: list of string, see default
            names of the columns to drop before saving the data.
        qc: dict, see default
            keys are the name of the features the control on, values are the
            requirements on their values (see the function apply_qc)

        Returns
        -------
        item: namedtuple
            if return_data is False, a named tuple containing
            'train_input_path', 'train_metadata_path', 'test_input_path' and
            'test_metadata_path' (the test paths are None if test_size is 0)
        X_train, X_test, subj_train, subj_test: numpy arrays
            if return_data is True, the training data, test data, training
            subjects and test subjects (the test entries are None if
            test_size is 0)
        """
        clinical_prefix = "bloc-clinical_score-"
        subject_column_name = "participant_id"
        path = os.path.join(datasetdir, "clinical_X_train.npy")
        meta_path = os.path.join(datasetdir, "clinical_X_train.tsv")
        path_test = None
        meta_path_test = None
        required_paths = [path, meta_path]
        if test_size > 0:
            path_test = os.path.join(datasetdir, "clinical_X_test.npy")
            meta_path_test = os.path.join(datasetdir, "clinical_X_test.tsv")
            required_paths += [path_test, meta_path_test]

        # Recompute as soon as one of the files needed by this call is
        # missing, e.g. the test split after a previous run without it.
        if not all(os.path.isfile(_path) for _path in required_paths):
            os.makedirs(datasetdir, exist_ok=True)

            data = pd.read_csv(files["stratification"], sep="\t")
            clinical_cols = [subject_column_name]
            clinical_cols += [col for col in data.columns
                              if col.startswith(clinical_prefix)]
            data = data[clinical_cols]
            data_train = apply_qc(data, clinical_prefix, qc).sort_values(
                subject_column_name)
            data_train.columns = [elem.replace(clinical_prefix, "")
                                  for elem in data_train.columns]
            X_train = data_train.drop(columns=drop_cols)

            # Splits in train and test and removes nans
            X_test, subj_test = (None, None)
            if test_size > 0:
                X_train, X_test = train_test_split(
                    X_train, test_size=test_size, random_state=seed)
                na_idx_test = (X_test.isna().sum(1) == 0)
                X_test = X_test[na_idx_test]
                subj_test = X_test[subject_column_name].values
                X_test = X_test.drop(columns=[subject_column_name]).values
            na_idx_train = (X_train.isna().sum(1) == 0)
            X_train = X_train[na_idx_train]
            subj_train = X_train[subject_column_name].values
            X_train = X_train.drop(columns=[subject_column_name])
            cols = X_train.columns
            X_train = X_train.values

            # Scales, fitting the scaler on the training set only
            if z_score:
                scaler = RobustScaler()
                X_train = scaler.fit_transform(X_train)
                _path = os.path.join(datasetdir, "clinical_scaler.pkl")
                with open(_path, "wb") as f:
                    pickle.dump(scaler, f)
                if test_size > 0:
                    X_test = scaler.transform(X_test)

            # Metadata tables: subjects followed by the feature columns
            X_train_df = pd.DataFrame(data=X_train, columns=cols)
            X_train_df.insert(0, subject_column_name, subj_train)
            X_test_df = None
            if test_size > 0:
                X_test_df = pd.DataFrame(data=X_test, columns=cols)
                X_test_df.insert(0, subject_column_name, subj_test)

            # Saving
            np.save(path, X_train)
            X_train_df.to_csv(meta_path, index=False, sep="\t")
            if test_size > 0:
                np.save(path_test, X_test)
                X_test_df.to_csv(meta_path_test, index=False, sep="\t")

        if return_data:
            X_train = np.load(path)
            subj_train = pd.read_csv(meta_path, sep="\t")[
                subject_column_name].values
            X_test, subj_test = (None, None)
            if test_size > 0:
                X_test = np.load(path_test)
                subj_test = pd.read_csv(meta_path_test, sep="\t")[
                    subject_column_name].values
            return X_train, X_test, subj_train, subj_test
        else:
            return Item(train_input_path=path, test_input_path=path_test,
                        train_metadata_path=meta_path,
                        test_metadata_path=meta_path_test)

    return fetch_clinical


def fetch_rois_wrapper(datasetdir=SAVING_FOLDER, files=FILES,
                       cohort=COHORT_NAME, site_column_name="t1:site",
                       defaults=DEFAULTS['rois']):
    """ Fetcher wrapper for rois data

    Builds a ``fetch_rois`` function bound to the given output folder,
    input files and default arguments.

    Parameters
    ----------
    datasetdir: string, default SAVING_FOLDER
        path to the folder in which to save the data
    files: dict, default FILES
        contains the paths to the different files. The "stratification" and
        "rois_mapper" entries are the tables read by the fetcher
    cohort: string, default COHORT_NAME,
        name of the cohort. Kept for interface compatibility: it was only
        used to name the fetcher for the ``Fetchers.register`` decorator,
        which is commented out
    site_column_name: string, default "t1:site"
        name of the column containing the site of MRI acquisition
    defaults: dict, default DEFAULTS['rois']
        default values for the wrapped function

    Returns
    -------
    fetcher: function
        corresponding fetcher

    """

    # @Fetchers.register
    def fetch_rois(
            metrics=defaults["metrics"], roi_types=defaults["roi_types"],
            test_size=defaults["test_size"], seed=defaults["seed"],
            return_data=defaults["return_data"], z_score=defaults["z_score"],
            adjust_sites=defaults["adjust_sites"],
            residualize_by=defaults["residualize_by"], qc=defaults["qc"]):
        """ Fetches and preprocesses roi data

        The preprocessed arrays are cached in the wrapper's ``datasetdir``.
        The cache is keyed on file presence only, not on the arguments:
        delete the files to force a new preprocessing with different
        arguments.

        Parameters
        ----------
        metrics: list of strings, see default
            metrics to fetch
        roi_types: list of strings, default ["cortical"]
            type of rois to fetch. Must be one of "cortical", "subcortical"
            and "other"
        test_size: float, default 0.2
            proportion of the dataset to keep for testing. Preprocessing models
            will only be fitted on the training part and applied to the test
            set. You can specify not to use a testing set by setting it to 0
        seed: int, default 42
            random seed to split the data into train / test
        return_data: bool, default False
            If false, saves the data in the specified folder, and return the
            path. Otherwise, returns the preprocessed data and the
            corresponding subjects
        z_score: bool, default True
            wether or not to scale the data. The scaling is done with a
            sklearn RobustScaler fitted on the training set
        adjust_sites: bool, default True
            wether or not the correct site effects via the Combat algorithm.
            The covariates given to Combat are the ones of residualize_by,
            which must then hold both of its keys
        residualize_by: dict, see default
            variables to residualize the data. Two keys, "continuous" and
            "discrete", and the values are a list of the variable names.
            None or an empty dict disables the residualization
        qc: dict, see default
            keys are the name of the features the control on, values are the
            requirements on their values (see the function apply_qc)

        Returns
        -------
        item: namedtuple
            if return_data is False, a named tuple containing
            'train_input_path', 'train_metadata_path', 'test_input_path' and
            'test_metadata_path' (the test paths are None if test_size is 0)
        X_train, X_test, subj_train, subj_test: numpy arrays
            if return_data is True, the training data, test data, training
            subjects and test subjects (the test entries are None if
            test_size is 0)

        Raises
        ------
        ValueError
            if one of the roi types is not supported
        """
        clinical_prefix = "bloc-clinical_score-"
        roi_prefix = "bloc-t1w_roi-"
        subject_column_name = "participant_id"
        site_column = "{}{}".format(clinical_prefix, site_column_name)

        def clinical_columns(names):
            # Names of the clinical columns as found in the table
            return ["{}{}".format(clinical_prefix, name) for name in names]

        path = os.path.join(datasetdir, "rois_X_train.npy")
        meta_path = os.path.join(datasetdir, "rois_X_train.tsv")
        path_test = None
        meta_path_test = None
        required_paths = [path, meta_path]
        if test_size > 0:
            path_test = os.path.join(datasetdir, "rois_X_test.npy")
            meta_path_test = os.path.join(datasetdir, "rois_X_test.tsv")
            required_paths += [path_test, meta_path_test]

        # Recompute as soon as one of the files needed by this call is
        # missing, e.g. the test split after a previous run without it.
        if not all(os.path.isfile(_path) for _path in required_paths):
            os.makedirs(datasetdir, exist_ok=True)
            data = pd.read_csv(files["stratification"], sep="\t")
            roi_mapper = pd.read_csv(files["rois_mapper"], sep="\t")

            # ROI selection, based on the range of the labels
            roi_label_range = pd.Series(
                False, index=roi_mapper.index, dtype=bool)
            for roi_type in roi_types:
                if roi_type == "cortical":
                    roi_label_range = roi_label_range | (
                        (roi_mapper["labels"] > 11000) &
                        (roi_mapper["labels"] < 13000))
                elif roi_type == "subcortical":
                    roi_label_range = roi_label_range | (
                        roi_mapper["labels"] > 13000)
                elif roi_type == "other":
                    roi_label_range = roi_label_range | (
                        roi_mapper["labels"] < 11000)
                else:
                    raise ValueError(
                        "Roi types must be either 'cortical', "
                        "'subcortical' or 'other'")
            roi_labels = set(
                roi_mapper.loc[roi_label_range, "labels"].tolist())

            # Feature selection: the roi label and the metric are parsed
            # from the name of the column
            features_list = []
            for column in data.columns:
                if column.startswith(roi_prefix):
                    roi = int(column.split(":")[1].split("_")[0])
                    metric = column.split("-")[-1]
                    if roi in roi_labels and metric in metrics:
                        features_list.append(column.replace(roi_prefix, ""))
            data_train = apply_qc(data, clinical_prefix, qc).sort_values(
                subject_column_name)
            data_train.columns = [elem.replace(roi_prefix, "")
                                  for elem in data_train.columns]
            X_train = data_train[features_list].copy()

            # Splits in train and test and removes nans
            if test_size > 0:
                X_train, X_test, data_train, data_test = train_test_split(
                    X_train, data_train, test_size=test_size,
                    random_state=seed)
                na_idx_test = (X_test.isna().sum(1) == 0)
                X_test = X_test[na_idx_test].copy()
                data_test = data_test[na_idx_test]
                subj_test = data_test[subject_column_name].values
            na_idx_train = (X_train.isna().sum(1) == 0)
            X_train = X_train[na_idx_train].copy()
            data_train = data_train[na_idx_train]
            subj_train = data_train[subject_column_name].values
            cols = X_train.columns

            # Correction for site effects, one Combat model per metric
            if adjust_sites:
                discrete_cols = clinical_columns(residualize_by["discrete"])
                continuous_cols = clinical_columns(
                    residualize_by["continuous"])
                for metric in metrics:
                    adjuster = fortin_combat()
                    features = [feature for feature in features_list
                                if metric in feature]
                    X_train[features] = adjuster.fit_transform(
                        X_train[features],
                        data_train[[site_column]],
                        data_train[discrete_cols],
                        data_train[continuous_cols])

                    _path = os.path.join(
                        datasetdir, "rois_combat_{0}.pkl".format(metric))
                    with open(_path, "wb") as of:
                        pickle.dump(adjuster, of)

                    if test_size > 0:
                        X_test[features] = adjuster.transform(
                            X_test[features],
                            data_test[[site_column]],
                            data_test[discrete_cols],
                            data_test[continuous_cols])

            # Scales, fitting the scaler on the training set only
            if z_score:
                scaler = RobustScaler()
                X_train = scaler.fit_transform(X_train)
                _path = os.path.join(datasetdir, "rois_scaler.pkl")
                with open(_path, "wb") as f:
                    pickle.dump(scaler, f)
                if test_size > 0:
                    X_test = scaler.transform(X_test)
            else:
                X_train = X_train.values
                if test_size > 0:
                    X_test = X_test.values

            # Residualizes
            if residualize_by:
                discrete_cols = clinical_columns(residualize_by["discrete"])
                continuous_cols = clinical_columns(
                    residualize_by["continuous"])
                # The encoder is fitted on the training set and reused for
                # the test set, so both design matrices share their columns.
                # The default sparse output is densified rather than using
                # the `sparse` keyword, which newer sklearn releases renamed.
                encoder = OneHotEncoder()
                regressor = LinearRegression()
                y_train = np.concatenate([
                    data_train[continuous_cols].values,
                    encoder.fit_transform(
                        data_train[discrete_cols]).toarray()
                ], axis=1)
                regressor.fit(y_train, X_train)
                X_train = X_train - regressor.predict(y_train)
                _path = os.path.join(datasetdir, "rois_residualizer.pkl")
                with open(_path, "wb") as f:
                    pickle.dump(regressor, f)

                if test_size > 0:
                    y_test = np.concatenate([
                        data_test[continuous_cols].values,
                        encoder.transform(data_test[discrete_cols]).toarray()
                    ], axis=1)
                    X_test = X_test - regressor.predict(y_test)

            # Metadata tables: subjects followed by the feature columns
            X_train_df = pd.DataFrame(data=X_train, columns=cols)
            X_train_df.insert(0, subject_column_name, subj_train)
            X_test_df = None
            if test_size > 0:
                X_test_df = pd.DataFrame(data=X_test, columns=cols)
                X_test_df.insert(0, subject_column_name, subj_test)

            # Saving
            np.save(path, X_train)
            X_train_df.to_csv(meta_path, index=False, sep="\t")
            if test_size > 0:
                np.save(path_test, X_test)
                X_test_df.to_csv(meta_path_test, index=False, sep="\t")

        if return_data:
            X_train = np.load(path)
            subj_train = pd.read_csv(meta_path, sep="\t")[
                subject_column_name].values
            X_test, subj_test = (None, None)
            if test_size > 0:
                X_test = np.load(path_test)
                subj_test = pd.read_csv(meta_path_test, sep="\t")[
                    subject_column_name].values
            return X_train, X_test, subj_train, subj_test
        else:
            return Item(train_input_path=path, test_input_path=path_test,
                        train_metadata_path=meta_path,
                        test_metadata_path=meta_path_test)

    return fetch_rois


def fetch_surface_wrapper(hemisphere, datasetdir=SAVING_FOLDER,
                          files=FILES, cohort=COHORT_NAME,
                          site_column_name="t1:site",
                          defaults=DEFAULTS["surface"]):
    """ Fetcher wrapper for surface data

    Builds a ``fetch_surface`` function bound to the given hemisphere,
    output folder, input files and default arguments.

    Parameters
    ----------
    hemisphere: string
        name of the hemisphere data fetcher, one of "rh" or "lh"
    datasetdir: string, default SAVING_FOLDER
        path to the folder in which to save the data
    files: dict, default FILES
        contains the paths to the different files. The table listing the
        surfaces is read from the "clinical_surface" entry when present,
        and from "surf_stratification" otherwise
    cohort: string, default COHORT_NAME,
        name of the cohort. Kept for interface compatibility: it was only
        used to name the fetcher for the ``Fetchers.register`` decorator,
        which is commented out
    site_column_name: string, default "t1:site"
        name of the column containing the site of MRI acquisition
    defaults: dict, default DEFAULTS["surface"]
        default values for the wrapped function

    Returns
    -------
    fetcher: function
        corresponding fetcher

    """
    assert hemisphere in ["rh", "lh"], "hemisphere must be 'rh' or 'lh'"

    # @Fetchers.register
    def fetch_surface(
            metrics=defaults["metrics"],
            test_size=defaults["test_size"], seed=defaults["seed"],
            return_data=defaults["return_data"],
            z_score=defaults["z_score"], adjust_sites=defaults["adjust_sites"],
            residualize_by=defaults["residualize_by"], qc=defaults["qc"]):
        """ Fetches and preprocesses surface data

        Parameters
        ----------
        metrics: list of strings, see defaults
            metrics to fetch
        test_size: float, default 0.2
            proportion of the dataset to keep for testing. Preprocessing models
            will only be fitted on the training part and applied to the test
            set. You can specify not to use a testing set by setting it to 0
        seed: int, default 42
            random seed to split the data into train / test
        return_data: bool, default False
            If false, saves the data in the specified folder, and return the
            path. Otherwise, returns the preprocessed data and the
            corresponding subjects
        z_score: bool, default True
            wether or not to scale the data. The scaling is done with a
            sklearn RobustScaler fitted on the training set
        adjust_sites: bool, default True
            wether or not the correct site effects via the Combat algorithm.
            The covariates given to Combat are the ones of residualize_by,
            which must then hold both of its keys
        residualize_by: dict, see default
            variables to residualize the data. Two keys, "continuous" and
            "discrete", and the values are a list of the variable names.
            None or an empty dict disables the residualization
        qc: dict, see default
            keys are the name of the features the control on, values are the
            requirements on their values (see the function apply_qc)

        Returns
        -------
        path, path_test: strings
            if return_data is False, the paths to the saved training and test
            arrays (only the training path if test_size is 0)
        X_train, X_test, subj_train, subj_test: numpy arrays
            if return_data is True, the training data, test data, training
            subjects and test subjects (only X_train and subj_train if
            test_size is 0)

        Raises
        ------
        ValueError
            if no column matches the requested metrics, or if no surface
            file is available to infer the number of vertices
        """
        clinical_prefix = "bloc-clinical_score-"
        surf_prefix = "bloc-t1w_hemi-{}_metric".format(hemisphere)
        site_column = "{}{}".format(clinical_prefix, site_column_name)

        def clinical_columns(names):
            # Names of the clinical columns as found in the table
            return ["{}{}".format(clinical_prefix, name) for name in names]

        def load_surface(surface_path):
            # `dataobj` replaces nibabel's deprecated `get_data()`
            return np.asanyarray(surface_loader(surface_path).dataobj)

        # The default FILES has no "clinical_surface" entry.
        # NOTE(review): "surf_stratification" is presumably the same table
        # -- confirm against the data folder.
        if "clinical_surface" in files:
            table_key = "clinical_surface"
        else:
            table_key = "surf_stratification"
        data = pd.read_csv(files[table_key], sep="\t").drop(
            columns=["bloc-t1w_hemi-lh_metric-area",
                     "bloc-t1w_hemi-rh_metric-area"])

        # Feature selection
        features_list = []
        for metric in metrics:
            for column in data.columns:
                if column.startswith(surf_prefix):
                    m = column.split('-')[-1]
                    if m == metric:
                        features_list.append(column)
        if not features_list:
            raise ValueError(
                "No column matches the metrics {} for the hemisphere "
                "{}".format(metrics, hemisphere))

        data_train = apply_qc(data, clinical_prefix, qc).sort_values(
            "participant_id")

        # Loads surface data. The number of vertices is read from the first
        # available file; a missing file is encoded as NaN so that the
        # subject is removed by the NaN filtering below.
        available_paths = data_train[features_list[0]].dropna()
        if len(available_paths) == 0:
            raise ValueError(
                "No surface file available for the feature {}".format(
                    features_list[0]))
        n_vertices = len(load_surface(available_paths.iloc[0]))
        X_train = np.full(
            (len(data_train), n_vertices, len(features_list)), np.nan)
        for j, feature in enumerate(features_list):
            for i, surface_path in enumerate(data_train[feature].values):
                if not pd.isnull(surface_path):
                    X_train[i, :, j] = load_surface(surface_path).squeeze()

        # Splits in train and test and removes nans
        if test_size > 0:
            X_train, X_test, data_train, data_test = train_test_split(
                X_train, data_train, test_size=test_size, random_state=seed)

            na_idx_test = (np.isnan(X_test).sum((1, 2)) == 0)
            X_test = X_test[na_idx_test]
            data_test = data_test[na_idx_test]
            if return_data:
                subj_test = data_test["participant_id"].values

        na_idx_train = (np.isnan(X_train).sum((1, 2)) == 0)

        X_train = X_train[na_idx_train]
        data_train = data_train[na_idx_train]
        if return_data:
            subj_train = data_train["participant_id"].values

        # The fitted preprocessing models are pickled in datasetdir
        os.makedirs(datasetdir, exist_ok=True)

        if adjust_sites:
            discrete_cols = clinical_columns(residualize_by["discrete"])
            continuous_cols = clinical_columns(residualize_by["continuous"])

        # Design matrices of the residualization: they do not depend on the
        # feature, so they are built once. The encoder is fitted on the
        # training set and reused for the test set, so both matrices share
        # their columns. The default sparse output is densified rather than
        # using the `sparse` keyword, which newer sklearn releases renamed.
        if residualize_by:
            _discrete = clinical_columns(residualize_by["discrete"])
            _continuous = clinical_columns(residualize_by["continuous"])
            encoder = OneHotEncoder()
            y_train = np.concatenate([
                data_train[_continuous].values,
                encoder.fit_transform(data_train[_discrete]).toarray()
            ], axis=1)
            if test_size > 0:
                y_test = np.concatenate([
                    data_test[_continuous].values,
                    encoder.transform(data_test[_discrete]).toarray()
                ], axis=1)

        # Applies feature-wise preprocessing
        for i, feature in enumerate(features_list):
            # Correction for site effects
            if adjust_sites:
                # Only the vertices with at least one positive training
                # value are given to Combat
                non_zeros_idx = (X_train[:, :, i] > 0).sum(0) >= 1
                adjuster = fortin_combat()
                X_train[:, non_zeros_idx, i] = adjuster.fit_transform(
                    X_train[:, non_zeros_idx, i],
                    data_train[[site_column]],
                    data_train[discrete_cols],
                    data_train[continuous_cols])

                _path = os.path.join(
                    datasetdir,
                    "surface_{}_combat_feature{}.pkl".format(hemisphere, i))
                with open(_path, "wb") as f:
                    pickle.dump(adjuster, f)

                if test_size > 0:
                    X_test[:, non_zeros_idx, i] = adjuster.transform(
                        X_test[:, non_zeros_idx, i],
                        data_test[[site_column]],
                        data_test[discrete_cols],
                        data_test[continuous_cols])

            # Scales, fitting the scaler on the training set only
            if z_score:
                scaler = RobustScaler()
                X_train[:, :, i] = scaler.fit_transform(X_train[:, :, i])

                _path = os.path.join(
                    datasetdir,
                    "surface_{}_scaler_feature{}.pkl".format(hemisphere, i))
                with open(_path, "wb") as f:
                    pickle.dump(scaler, f)
                if test_size > 0:
                    X_test[:, :, i] = scaler.transform(X_test[:, :, i])

            # Residualizes
            if residualize_by:
                regressor = LinearRegression()
                regressor.fit(y_train, X_train[:, :, i])
                X_train[:, :, i] = X_train[:, :, i] - regressor.predict(
                    y_train)
                _path = os.path.join(
                    datasetdir,
                    "surface_{}_residualizer_feature{}.pkl".format(
                        hemisphere, i))
                with open(_path, "wb") as f:
                    pickle.dump(regressor, f)

                if test_size > 0:
                    X_test[:, :, i] = X_test[:, :, i] - regressor.predict(
                        y_test)

        # Returns data and subjects
        if return_data:
            if test_size > 0:
                return X_train, X_test, subj_train, subj_test
            return X_train, subj_train

        # Saving
        path = os.path.join(
            datasetdir, "surface_{}_X_train.npy".format(hemisphere))
        np.save(path, X_train)
        if test_size > 0:
            path_test = os.path.join(
                datasetdir, "surface_{}_X_test.npy".format(hemisphere))
            np.save(path_test, X_test)
            return path, path_test

        return path
    return fetch_surface


def fetch_genetic_wrapper(datasetdir=SAVING_FOLDER, files=FILES,
                          cohort=COHORT_NAME, defaults=DEFAULTS['genetic']):
    """ Fetcher wrapper for genetic data

    Parameters
    ----------
    datasetdir: string, default SAVING_FOLDER
        path to the folder in which to save the data
    files: dict, default FILES
        contains the paths to the different files. Only the 'stratification'
        entry is read by the genetic fetcher
    cohort: string, default COHORT_NAME,
        name of the cohort. Kept for interface compatibility, it is not
        used by the genetic fetcher itself
    defaults: dict, default DEFAULTS['genetic']
        default values for the wrapped function

    Returns
    -------
    fetcher: function
        corresponding fetcher

    """

    # @Fetchers.register
    def fetch_genetic(
            scores=defaults["scores"], test_size=defaults["test_size"],
            seed=defaults["seed"], return_data=defaults["return_data"],
            z_score=defaults["z_score"], qc=defaults["qc"]):
        """ Fetches and preprocesses genetic data

        The preprocessed arrays and their metadata are cached in
        'datasetdir': they are only recomputed when one of the expected
        files is missing.

        Parameters
        ----------
        scores: list of strings, see defaults
            scores to fetch, None mean it fetches all the available scores
        test_size: float, see defaults
            proportion of the dataset to keep for testing. Preprocessing models
            will only be fitted on the training part and applied to the test
            set. You can specify not to use a testing set by setting it to 0
        seed: int, see default
            random seed to split the data into train / test
        return_data: bool, default False
            If false, saves the data in the specified folder, and return the
            path. Otherwise, returns the preprocessed data and the
            corresponding subjects
        z_score: bool, see defaults
            wether or not to scale the data. The scaling is performed with a
            RobustScaler fitted on the training set only
        qc: dict, see defaults
            keys are the name of the features the control on, values are the
            requirements on their values (see the function apply_qc)

        Returns
        -------
        item: namedtuple
            a named tuple containing 'train_input_path', 'train_metadata_path',
            and 'test_input_path', 'test_metadata_path' (None if
            test_size is 0), if return_data is False
        X_train: numpy array
            Training data, if return_data is True
        X_test: numpy array
            Test data, if return_data is True (None if test_size is 0)
        subj_train: numpy array
            Training subjects, if return_data is True
        subj_test: numpy array
            Test subjects, if return_data is True (None if test_size is 0)
        """

        clinical_prefix = "bloc-clinical_score-"
        genetic_prefix = "bloc-genetic_score-"
        subject_column_name = "participant_id"
        path = os.path.join(datasetdir, "genetic_X_train.npy")
        meta_path = os.path.join(datasetdir, "genetic_X_train.tsv")
        path_test = None
        meta_path_test = None
        expected_files = [path, meta_path]
        if test_size > 0:
            path_test = os.path.join(datasetdir, "genetic_X_test.npy")
            meta_path_test = os.path.join(datasetdir, "genetic_X_test.tsv")
            expected_files += [path_test, meta_path_test]

        # Recompute as soon as one expected file is missing: checking only
        # the training array would return / load non-existent test files
        # when a previous run was made without a test set.
        if not all(os.path.isfile(_path) for _path in expected_files):

            data = pd.read_csv(files["stratification"], sep="\t")

            # Feature selection: keep the genetic columns whose score name
            # (text after the last '-') is requested, or all of them when
            # no score is specified
            features_list = [
                column.replace(genetic_prefix, "")
                for column in data.columns
                if column.startswith(genetic_prefix) and (
                    scores is None or column.split("-")[-1] in scores)]
            data_train = apply_qc(data, clinical_prefix, qc).sort_values(
                subject_column_name)
            data_train.columns = [elem.replace(genetic_prefix, "")
                                  for elem in data_train.columns]
            X_train = data_train[features_list].copy()

            # Splits in train and test and removes nans
            if test_size > 0:
                X_train, X_test, data_train, data_test = train_test_split(
                    X_train, data_train, test_size=test_size,
                    random_state=seed)
                na_idx_test = X_test.notna().all(axis=1)
                X_test = X_test[na_idx_test]
                data_test = data_test[na_idx_test]
                subj_test = data_test[subject_column_name].values
            na_idx_train = X_train.notna().all(axis=1)
            X_train = X_train[na_idx_train]
            data_train = data_train[na_idx_train]
            subj_train = data_train[subject_column_name].values
            cols = X_train.columns

            # Scales the data: the scaler is fitted on the training set
            # only, saved, and then applied to the test set
            if z_score:
                scaler = RobustScaler()
                X_train = scaler.fit_transform(X_train)
                _path = os.path.join(datasetdir, "genetic_scaler.pkl")
                with open(_path, "wb") as f:
                    pickle.dump(scaler, f)
                if test_size > 0:
                    X_test = scaler.transform(X_test)
            else:
                X_train = X_train.values
                if test_size > 0:
                    X_test = X_test.values

            # Builds the metadata tables: subject id followed by the features
            X_train_df = pd.DataFrame(data=X_train, columns=cols)
            X_train_df.insert(0, subject_column_name, subj_train)
            X_test_df = None
            if test_size > 0:
                X_test_df = pd.DataFrame(data=X_test, columns=cols)
                X_test_df.insert(0, subject_column_name, subj_test)

            # Saving
            np.save(path, X_train)
            X_train_df.to_csv(meta_path, index=False, sep="\t")
            if test_size > 0:
                np.save(path_test, X_test)
                X_test_df.to_csv(meta_path_test, index=False, sep="\t")

        if return_data:
            # Data are always reloaded from the saved files
            X_train = np.load(path)
            subj_train = pd.read_csv(meta_path, sep="\t")[
                subject_column_name].values
            X_test, subj_test = (None, None)
            if test_size > 0:
                X_test = np.load(path_test)
                subj_test = pd.read_csv(meta_path_test, sep="\t")[
                    subject_column_name].values
            return X_train, X_test, subj_train, subj_test
        else:
            return Item(train_input_path=path, test_input_path=path_test,
                        train_metadata_path=meta_path,
                        test_metadata_path=meta_path_test)
    return fetch_genetic


def make_fetchers(datasetdir=SAVING_FOLDER):
    """ Build the fetcher of each available block

    Parameters
    ----------
    datasetdir: string, default SAVING_FOLDER
        path to the folder in which the fetchers save the data

    Returns
    -------
    fetchers: dict
        keys are the name of the blocks ('clinical', 'rois', 'surface-rh',
        'surface-lh' and 'genetic'), values are the corresponding fetchers
        built from their wrappers

    """

    return {
        "clinical": fetch_clinical_wrapper(datasetdir=datasetdir),
        "rois": fetch_rois_wrapper(datasetdir=datasetdir),
        "surface-rh": fetch_surface_wrapper(hemisphere="rh",
                                            datasetdir=datasetdir),
        "surface-lh": fetch_surface_wrapper(hemisphere="lh",
                                            datasetdir=datasetdir),
        "genetic": fetch_genetic_wrapper(datasetdir=datasetdir),
    }


def fetch_multiblock_wrapper(datasetdir=SAVING_FOLDER, files=FILES,
                             cohort=COHORT_NAME,
                             subject_column_name="subjects",
                             defaults=DEFAULTS["multiblock"],
                             make_fetchers_func=make_fetchers):
    """ Fetcher wrapper for multiblock data

    Parameters
    ----------
    datasetdir: string, default SAVING_FOLDER
        path to the folder in which to save the data
    files: dict, default FILES
        contains the paths to the different files. Only the 'stratification'
        entry is read by the multiblock fetcher
    cohort: string, default COHORT_NAME,
        name of the cohort. Kept for interface compatibility, it is not
        used by the multiblock fetcher itself
    subject_column_name: string, default "subjects"
        name of the metadata column containing the subjects id. Only the
        'participant_id', 'labels' and 'subgroups' columns are kept in the
        metadata, so it must be one of them
    defaults: dict, default DEFAULTS
        default values for the wrapped function
    make_fetchers_func: function, default make_fetchers
        function to build the fetchers from their wrappers.
        Must return a dict containing as keys the name of the
        channels, and values the corresponding fetcher

    Returns
    -------
    fetcher: function
        corresponding fetcher

    """

    FETCHERS = make_fetchers_func(datasetdir)
    # Arguments imposed on every block fetcher: they are dropped from the
    # block specific arguments given by the user
    imposed_keys = ("qc", "test_size", "seed", "return_data")

    def _keep_common_subjects(arrays, subjects_by_block, blocks):
        """ Keep in each block the rows of the subjects found in all the
        blocks (in place), and return the set of these subjects.
        """
        common_subjects = set.intersection(
            *map(set, subjects_by_block.values()))
        for block in blocks:
            subjects = subjects_by_block[block]
            assert len(subjects) == len(arrays[block])
            idx_to_keep = [
                _idx for _idx, subject in enumerate(subjects)
                if subject in common_subjects]
            arrays[block] = arrays[block][idx_to_keep]
        return common_subjects

    # @Fetchers.register
    def fetch_multiblock(
            blocks=defaults["blocks"],
            test_size=defaults["test_size"], seed=defaults["seed"],
            qc=defaults["qc"],
            **kwargs):
        """ Fetches and preprocesses multi block data

        The result is cached in 'datasetdir': nothing is recomputed if the
        training file 'multiblock_X_train.npz' already exists.

        Parameters
        ----------
        blocks: list of strings, see default
            blocks of data to fetch, all must be in the key list of FETCHERS
        test_size: float, default 0.2
            proportion of the dataset to keep for testing. Preprocessing models
            will only be fitted on the training part and applied to the test
            set. You can specify not to use a testing set by setting it to 0
        seed: int, default 42
            random seed to split the data into train / test
        qc: dict, see default
            keys are the name of the features the control on, values are the
            requirements on their values (see the function apply_qc)
        kwargs: dict
            additional arguments to be passed to each fetcher individually.
            Keys are the name of the fetchers, and values are a dictionary
            containing arguments and the values for this fetcher. The
            'qc', 'test_size', 'seed' and 'return_data' arguments are
            ignored since they are imposed to all the blocks

        Returns
        -------
        item: namedtuple
            a named tuple containing 'train_input_path', 'train_metadata_path',
            and 'test_input_path', 'test_metadata_path' (None if
            test_size is 0)
        """

        path = os.path.join(datasetdir, "multiblock_X_train.npz")
        metadata_path = os.path.join(datasetdir, "metadata_train.tsv")
        path_test = None
        metadata_path_test = None
        if test_size > 0:
            path_test = os.path.join(datasetdir, "multiblock_X_test.npz")
            metadata_path_test = os.path.join(
                datasetdir, "metadata_test.tsv")

        if not os.path.isfile(path):
            X_train, subj_train = {}, {}
            X_test, subj_test = {}, {}
            for block in blocks:
                assert block in FETCHERS, (
                    "Unknown block '{}'".format(block))
                # Impose to have the same qc steps and splitting train/test
                # over all the blocks to have the same subjects. A new dict
                # is built: deleting keys while iterating over the user
                # dict raises a RuntimeError and modifies the user input.
                local_kwargs = {
                    key: value
                    for key, value in kwargs.get(block, {}).items()
                    if key not in imposed_keys}
                fetched = FETCHERS[block](
                    qc=qc, test_size=test_size, seed=seed,
                    return_data=True, **local_kwargs)
                if len(fetched) == 2:
                    # Some fetchers (e.g. the surface one) only return
                    # (X_train, subj_train) when there is no test set
                    new_X_train, new_subj_train = fetched
                    new_X_test, new_subj_test = None, None
                else:
                    (new_X_train, new_X_test,
                     new_subj_train, new_subj_test) = fetched
                if test_size > 0:
                    X_test[block] = new_X_test
                    subj_test[block] = new_subj_test
                X_train[block] = new_X_train
                subj_train[block] = new_subj_train

            # Remove subjects that arent in all the channels
            common_subjects_train = _keep_common_subjects(
                X_train, subj_train, blocks)
            if test_size > 0:
                common_subjects_test = _keep_common_subjects(
                    X_test, subj_test, blocks)

            # Loads metadata
            # NOTE(review): the metadata rows follow the order of the
            # stratification file while the rows of each block follow the
            # order returned by its fetcher - confirm that downstream code
            # does not assume that they are aligned.
            clinical_prefix = "bloc-clinical_score-"
            metadata_cols = ["participant_id", "labels", "subgroups"]
            metadata = pd.read_csv(files["stratification"], sep="\t")
            clinical_cols = ["participant_id"]
            clinical_cols += [col for col in metadata.columns
                              if col.startswith(clinical_prefix)]
            metadata = metadata[clinical_cols]
            metadata.columns = [elem.replace(clinical_prefix, "")
                                for elem in metadata.columns]
            metadata = metadata[metadata_cols]
            metadata_train = metadata[
                metadata[subject_column_name].isin(
                    list(common_subjects_train))]
            if test_size > 0:
                metadata_test = metadata[
                    metadata[subject_column_name].isin(
                        list(common_subjects_test))]

            # Saving: one array per block in the npz archives
            np.savez(path, **X_train)
            metadata_train.to_csv(metadata_path, index=False, sep="\t")
            if test_size > 0:
                np.savez(path_test, **X_test)
                metadata_test.to_csv(metadata_path_test, index=False, sep="\t")

        return Item(train_input_path=path, test_input_path=path_test,
                    train_metadata_path=metadata_path,
                    test_metadata_path=metadata_path_test)

    return fetch_multiblock


# Maps the name of each kind of data to the wrapper building its fetcher
WRAPPERS = dict(
    clinical=fetch_clinical_wrapper,
    rois=fetch_rois_wrapper,
    genetic=fetch_genetic_wrapper,
    surface=fetch_surface_wrapper,
    multiblock=fetch_multiblock_wrapper,
)


def fetch_multiblock_euaims(datasetdir=SAVING_FOLDER, fetchers=make_fetchers,
                            surface=False):
    """ Fetches the EUAIMS multiblock dataset

    Parameters
    ----------
    datasetdir: string, default SAVING_FOLDER
        path to the folder in which to save the data
    fetchers: function, default make_fetchers
        function to build the fetchers from their wrappers, called with
        the dataset directory. Must return a dict containing as keys the
        name of the blocks, and values the corresponding fetcher
    surface: bool, default False
        if True the blocks are 'clinical', 'surface-lh', 'surface-rh' and
        'genetic', otherwise 'clinical', 'rois' and 'genetic'

    Returns
    -------
    item: namedtuple
        the output of the multiblock fetcher called with its default
        arguments
    """
    if surface:
        blocks = ["clinical", "surface-lh", "surface-rh", "genetic"]
    else:
        blocks = ["clinical", "rois", "genetic"]
    # Works on a copy so that the module level defaults are not modified
    # by the choice of the blocks made here
    defaults = dict(DEFAULTS["multiblock"], blocks=blocks)
    # The user supplied fetchers builder is forwarded to the wrapper
    return WRAPPERS["multiblock"](
        datasetdir=datasetdir, files=FILES, cohort=COHORT_NAME,
        subject_column_name="participant_id", defaults=defaults,
        make_fetchers_func=fetchers)()


def inverse_normalization(data, scalers):
    """ De-normalize a dataset.

    Parameters
    ----------
    data: object
        the data to de-normalize, in a format accepted by the
        'inverse_transform' method of the scalers
    scalers: list of strings
        paths to the pickled scalers. Each one must expose an
        'inverse_transform' method. They are applied in the given order,
        each one on the output of the previous one: to undo a chain of
        normalizations, list them from the last applied to the first one

    Returns
    -------
    data: object
        the output of the last 'inverse_transform', or the input data
        unchanged if no scaler is given
    """
    for scaler_path in scalers:
        # NOTE: unpickling executes arbitrary code, only scaler files from
        # a trusted source must be given here
        with open(scaler_path, "rb") as of:
            scaler = pickle.load(of)
        data = scaler.inverse_transform(data)
    return data
Helper Module for Deep Learning.

Source code for pynet.datasets.euaims

Follow us