Menu

Helper Module for Deep Learning.

Source code for pynet.datasets.impac

# -*- coding: utf-8 -*-
##########################################################################
# NSAp - Copyright (C) CEA, 2019 - 2020
# Distributed under the terms of the CeCILL-B license, as published by
# the CEA-CNRS-INRIA. Refer to the LICENSE file or to
# http://www.cecill.info/licences/Licence_CeCILL-B_V1-en.html
# for details.
##########################################################################

"""
Module that provides functions to prepare the IMPAC dataset. IMPAC stands for
'IMaging-PsychiAtry Challenge: predicting autism', a data challenge on
Autism Spectrum Disorder detection:
https://paris-saclay-cds.github.io/autism_challenge.
"""

# Imports
import re
import os
import json
import glob
import shutil
import logging
import requests
import zipfile
import hashlib
import warnings
from collections import namedtuple
from collections import OrderedDict
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from pynet.datasets import Fetchers
try:
    from nilearn.connectome import ConnectivityMeasure
except:
    warnings.warn("You need to install nilearn.")


# Global parameters
# Names of the atlases for which an archive of time-series is listed below.
ATLAS = (
    "basc064",
    "basc122",
    "basc197",
    "craddock_scorr_mean",
    "harvard_oxford_cort_prob_2mm",
    "msdl",
    "power_2011",
)
# Download location of each atlas archive, keyed by atlas name.
ARCHIVE = dict(
    (name, "https://zenodo.org/record/3625740/files/{0}.zip".format(name))
    for name in ATLAS)
# Expected SHA-256 hex digest of each atlas archive, keyed by atlas name.
CHECKSUM = {
    "basc064": (
        "75eb5ee72344d11f056551310a470d00227fac3e87b7205196f77042fcd434d0"),
    "basc122": (
        "2d0d2c2338f9114877a0a1eb695e73f04fc664065d1fb75cff8d59f6516b0ec7"),
    "basc197": (
        "68135bb8e89b5b3653e843745d8e5d0e92876a5536654eaeb9729c9a52ab00e9"),
    "craddock_scorr_mean": (
        "634e0bb07beaae033a0f1615aa885ba4cb67788d4a6e472fd432a1226e01b49b"),
    "harvard_oxford_cort_prob_2mm": (
        "638559dc4c7de25575edc02e58404c3f2600556239888cbd2e5887316def0e74"),
    "msdl": (
        "fd241bd66183d5fc7bdf9a115d7aeb9a5fecff5801cd15a4e5aed72612916a97"),
    "power_2011": (
        "d1e3cd8eaa867079fe6b24dfaee08bd3b2d9e0ebbd806a2a982db5407328990a"),
}
# Base location and full URLs of the CSV tables of the challenge.
URL = "https://raw.githubusercontent.com/ramp-kits/autism/master/data/"
URLS = [
    URL + "{0}.csv".format(stem)
    for stem in ("anatomy", "anatomy_qc", "fmri_filename", "fmri_qc",
                 "fmri_repetition_time", "participants", "test", "train")]
# Container returned by the fetchers of this module.
Item = namedtuple(
    "Item", "input_path output_path metadata_path labels nb_features")
logger = logging.getLogger("pynet")


def _sha256(path):
    """ Compute the SHA-256 hexadecimal digest of a file.

    The file is consumed by blocks of 8192 bytes so that it never has to be
    held entirely in memory.

    Parameters
    ----------
    path: str
        the file to hash.

    Returns
    -------
    digest: str
        the hexadecimal representation of the hash.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        # 'iter' with a sentinel stops at the first empty read (end of file).
        for block in iter(lambda: stream.read(8192), b""):
            digest.update(block)
    return digest.hexdigest()


def _check_and_unzip(zip_file, atlas, atlas_directory):
    """ Check the integrity of a downloaded archive and extract it.

    Parameters
    ----------
    zip_file: str
        the path to the downloaded zip archive.
    atlas: str
        the atlas name, used to look up the expected digest in 'CHECKSUM'.
    atlas_directory: str
        the folder where the archive content is extracted.

    Raises
    ------
    IOError
        if the SHA-256 digest of the archive differs from the expected one.
        The archive is deleted before raising.
    """
    checksum_download = _sha256(zip_file)
    if checksum_download != CHECKSUM[atlas]:
        os.remove(zip_file)
        raise IOError("The file downloaded was corrupted. Try again "
                      "to execute this fetcher.")
    logger.info("Decompressing the archive...")
    # The context manager closes the archive even if the extraction fails.
    with zipfile.ZipFile(zip_file, "r") as zip_ref:
        zip_ref.extractall(atlas_directory)


def _download_fmri_data(atlas, outdir):
    """ Download and extract the time-series archive of one atlas.

    The archive is saved as '<outdir>/<atlas>.zip' and extracted in
    '<outdir>/data/fmri'. An archive already on disk is not downloaded again.

    Parameters
    ----------
    atlas: str
        the atlas name, a key of 'ARCHIVE' and 'CHECKSUM'.
    outdir: str
        the destination folder, created if missing.

    Raises
    ------
    requests.exceptions.HTTPError
        if the server answers with an error status.
    IOError
        if the archive checksum is not the expected one.
    """
    logger.info("Downloading the data from {0}...".format(ARCHIVE[atlas]))
    if not os.path.isdir(outdir):
        os.makedirs(outdir)
    zip_file = os.path.join(outdir, atlas + ".zip")
    if os.path.isfile(zip_file):
        logger.info("'{0}' already downloaded!".format(zip_file))
    else:
        response = requests.get(ARCHIVE[atlas])
        # Fail here on an HTTP error rather than saving an error page as the
        # archive.
        response.raise_for_status()
        with open(zip_file, "wb") as of:
            of.write(response.content)
    atlas_directory = os.path.join(outdir, "data", "fmri")
    # Every atlas is extracted in the same parent folder, so the parent
    # existing does not mean this atlas was extracted. Test the atlas
    # sub-folder instead.
    # NOTE(review): assumes each archive unpacks into a folder named after
    # the atlas - confirm; otherwise the archive is simply extracted again.
    if not os.path.isdir(os.path.join(atlas_directory, atlas)):
        _check_and_unzip(zip_file, atlas, atlas_directory)


def fetch_fmri_time_series(outdir, atlas="all"):
    """ Fetch the time-series extracted from the fMRI data using a specific
    atlas.

    Parameters
    ----------
    outdir: str
        the destination folder.
    atlas : string, default='all'
        The name of the atlas used during the extraction, or 'all' to fetch
        every atlas. The possibilities are:

        * `'basc064'`, `'basc122'`, `'basc197'`: BASC parcellations with 64,
          122, and 197 regions [1]_;
        * `'craddock_scorr_mean'`: Ncuts parcellations [2]_;
        * `'harvard_oxford_cort_prob_2mm'`: Harvard-Oxford anatomical
          parcellations;
        * `'msdl'`: MSDL functional atlas [3]_;
        * `'power_2011'`: Power atlas [4]_.

    Raises
    ------
    ValueError
        if 'atlas' is neither 'all' nor one of the names listed in 'ATLAS'.

    References
    ----------
    .. [1] Bellec, Pierre, et al. "Multi-level bootstrap analysis of stable
       clusters in resting-state fMRI." Neuroimage 51.3 (2010): 1126-1139.
    .. [2] Craddock, R. Cameron, et al. "A whole brain fMRI atlas generated
       via spatially constrained spectral clustering." Human brain mapping
       33.8 (2012): 1914-1928.
    .. [3] Varoquaux, Gaël, et al. "Multi-subject dictionary learning to
       segment an atlas of brain spontaneous activity." Biennial
       International Conference on Information Processing in Medical
       Imaging. Springer, Berlin, Heidelberg, 2011.
    .. [4] Power, Jonathan D., et al. "Functional network organization of the
       human brain." Neuron 72.4 (2011): 665-678.
    """
    if atlas == "all":
        for single_atlas in ATLAS:
            _download_fmri_data(single_atlas, outdir)
    elif atlas in ATLAS:
        _download_fmri_data(atlas, outdir)
    else:
        raise ValueError(
            "'atlas' should be one of {0}. Got {1} instead.".format(
                ATLAS, atlas))
    logger.info("Downloading completed...")
def _load_fmri(fmri_filenames):
    """ Load time-series extracted from the fMRI using a specific atlas.

    Parameters
    ----------
    fmri_filenames: list of str
        the CSV files to load; each file is read without a header row.

    Returns
    -------
    timeseries: numpy.ndarray
        a regular array stacking all the tables when they share the same
        shape, otherwise a 1-D object array holding one 2-D array per file.
    """
    timeseries = [pd.read_csv(subject_filename, header=None).values
                  for subject_filename in fmri_filenames]
    if len(set(series.shape for series in timeseries)) <= 1:
        return np.array(timeseries)
    # Tables of different shapes cannot be stacked. Recent NumPy releases
    # raise a ValueError when asked to build such a ragged array implicitly,
    # so the object array is filled explicitly, one table per slot.
    ragged = np.empty(len(timeseries), dtype=object)
    for index, series in enumerate(timeseries):
        ragged[index] = series
    return ragged
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """ Make a transformer which will load the time series and compute the
    connectome matrix.

    The time series are read from the files listed in the 'fmri_basc122'
    column of the input table.
    """

    def __init__(self):
        """ Build the pipeline: CSV loading followed by a vectorized
        'tangent' connectivity measure.
        """
        self.transformer_fmri = make_pipeline(
            FunctionTransformer(func=_load_fmri, validate=False),
            ConnectivityMeasure(kind="tangent", vectorize=True))

    @staticmethod
    def _get_fmri_filenames(X_df, datadir):
        """ Return the time-series files of 'X_df' located under 'datadir'.
        """
        # Only the first '.' of each path is replaced by 'datadir'.
        # NOTE(review): presumably the stored paths start with './' - confirm.
        return [path.replace(".", datadir, 1)
                for path in X_df["fmri_basc122"]]

    def fit(self, X_df, y, datadir):
        """ Fit the connectivity pipeline.

        Parameters
        ----------
        X_df: pandas.DataFrame
            a table with a 'fmri_basc122' column listing the time-series
            files.
        y: array-like
            the targets, forwarded to the pipeline.
        datadir: str
            the folder substituted to the first '.' of each listed path.

        Returns
        -------
        self: FeatureExtractor
            the fitted transformer.
        """
        fmri_filenames = self._get_fmri_filenames(X_df, datadir)
        self.transformer_fmri.fit(fmri_filenames, y)
        return self

    def transform(self, X_df, datadir):
        """ Compute the connectome features and append the anatomical ones.

        Parameters
        ----------
        X_df: pandas.DataFrame
            a table with a 'fmri_basc122' column, and 'anatomy*' columns
            including 'anatomy_select'.
        datadir: str
            the folder substituted to the first '.' of each listed path.

        Returns
        -------
        features: pandas.DataFrame
            the 'connectome_<i>' columns followed by the 'anatomy*' columns
            ('anatomy_select' excluded), indexed like 'X_df'.
        """
        fmri_filenames = self._get_fmri_filenames(X_df, datadir)
        X_connectome = self.transformer_fmri.transform(fmri_filenames)
        X_connectome = pd.DataFrame(X_connectome, index=X_df.index)
        X_connectome.columns = ["connectome_{0}".format(i)
                                for i in range(X_connectome.columns.size)]
        X_anatomy = X_df[[col for col in X_df.columns
                          if col.startswith("anatomy")]]
        # 'anatomy_select' is dropped so that it is not used as a feature.
        X_anatomy = X_anatomy.drop(columns="anatomy_select")
        logger.debug(" X connectome: {0}".format(X_connectome.shape))
        logger.debug(" X anatomy: {0}".format(X_anatomy.shape))
        return pd.concat([X_connectome, X_anatomy], axis=1)
@Fetchers.register
def fetch_impac(datasetdir, mode="train", dtype="all"):
    """ Fetch/prepare the IMPAC dataset for pynet.

    To compute the functional connectivity using the rfMRI data, we use the
    BASC atlas with 122 ROIs.

    The train and test arrays are generated once and cached in 'datasetdir';
    they are computed again only if 'pynet_impac_train.tsv' is missing.

    Parameters
    ----------
    datasetdir: str
        the dataset destination folder, created if missing.
    mode: str
        ask the 'train' or 'test' dataset. Any value other than 'train'
        selects the test dataset.
    dtype: str, default 'all'
        the features type: 'anatomy', 'fmri', or 'all'. Any other value
        behaves like 'all'.

    Returns
    -------
    item: namedtuple
        a named tuple containing 'input_path' (the selected features),
        'metadata_path' and 'nb_features'; 'output_path' and 'labels' are
        set to None.
    """
    logger.info("Loading impac dataset.")
    if not os.path.isdir(datasetdir):
        # 'makedirs' also creates the missing parent folders.
        os.makedirs(datasetdir)

    train_desc_path = os.path.join(datasetdir, "pynet_impac_train.tsv")
    selected_input_path = os.path.join(
        datasetdir, "pynet_impac_inputs_selection.npy")
    train_input_path = os.path.join(
        datasetdir, "pynet_impac_inputs_train.npy")
    train_output_path = os.path.join(
        datasetdir, "pynet_impac_outputs_train.npy")
    test_desc_path = os.path.join(datasetdir, "pynet_impac_test.tsv")
    test_input_path = os.path.join(
        datasetdir, "pynet_impac_inputs_test.npy")
    test_output_path = os.path.join(
        datasetdir, "pynet_impac_outputs_test.npy")

    if not os.path.isfile(train_desc_path):
        fetch_fmri_time_series(datasetdir, atlas="basc122")

        # Download the tables and the train/test subject lists.
        data = []
        sets = {}
        for url in URLS:
            basename = url.split("/")[-1]
            name = basename.split(".")[0]
            local_file = os.path.join(datasetdir, basename)
            if not os.path.isfile(local_file):
                response = requests.get(url)
                # Do not cache an HTTP error body as if it were the table.
                response.raise_for_status()
                with open(local_file, "wt") as out_file:
                    out_file.write(response.text)
            else:
                logger.info("'{0}' already downloaded!".format(basename))
            if name not in ("train", "test"):
                # Columns are prefixed with the table family ('anatomy',
                # 'fmri', 'participants') to stay unique once concatenated.
                prefix = name.split("_")[0]
                df = pd.read_csv(local_file, index_col=0)
                df.columns = [
                    "{0}_{1}".format(prefix, col) for col in df.columns]
                data.append(df)
            else:
                df = pd.read_csv(local_file, header=None)
                sets[name] = df[0].values.tolist()
        data = pd.concat(data, axis=1)
        logger.debug(" data: {0}".format(data.shape))

        # Keep the rows whose 'select' flags are 1 or 2 for both modalities.
        data = data[data["anatomy_select"].isin((1, 2))]
        data = data[data["fmri_select"].isin((1, 2))]
        logger.debug(" filter data: {0}".format(data.shape))
        data_train = data[data.index.isin(sets["train"])]
        data_test = data[data.index.isin(sets["test"])]
        y_train = data_train["participants_asd"]
        y_test = data_test["participants_asd"]
        logger.debug(" data train: {0}".format(data_train.shape))
        logger.debug(" data test: {0}".format(data_test.shape))
        logger.debug(" y train: {0}".format(y_train.shape))
        logger.debug(" y test: {0}".format(y_test.shape))

        # Compute the features.
        # NOTE(review): the extractor is fitted again on the test set before
        # transforming it, so train and test features do not share the same
        # fitted transformer. Kept as is - confirm this is intended.
        features = FeatureExtractor()
        features.fit(data_train, y_train, datasetdir)
        features_train = features.transform(data_train, datasetdir)
        features.fit(data_test, y_test, datasetdir)
        features_test = features.transform(data_test, datasetdir)
        logger.debug(" features train: {0}".format(features_train.shape))
        logger.debug(" features test: {0}".format(features_test.shape))

        # Cache the arrays and the associated metadata.
        np.save(train_input_path, features_train.values)
        np.save(train_output_path, y_train.values)
        np.save(test_input_path, features_test.values)
        np.save(test_output_path, y_test.values)
        data_train.to_csv(train_desc_path, sep="\t", index=False)
        data_test.to_csv(test_desc_path, sep="\t", index=False)

    if mode == "train":
        input_path, desc_path = train_input_path, train_desc_path
    else:
        input_path, desc_path = test_input_path, test_desc_path

    # The connectome columns come first, followed by the anatomical ones.
    # With 122 ROIs the vectorized connectivity has 122 * 123 / 2 = 7503
    # values (presumably one triangle of the matrix, diagonal included).
    nb_rois = 122
    nb_connectome = nb_rois * (nb_rois + 1) // 2
    features = np.load(input_path)
    if dtype == "anatomy":
        features = features[:, nb_connectome:]
    elif dtype == "fmri":
        features = features[:, :nb_connectome]
    nb_features = features.shape[1]
    # NOTE(review): every call writes the same selection file, whatever the
    # 'mode' and 'dtype': a later call overwrites the array referenced by a
    # previously returned item.
    np.save(selected_input_path, features)

    return Item(input_path=selected_input_path, output_path=None,
                metadata_path=desc_path, labels=None,
                nb_features=nb_features)

Follow us

© 2019, pynet developers.
Inspired by AZMIND template.