Menu

Helper Module for Deep Learning.

Source code for pynet.datasets.tcga_lgg_tif

# -*- coding: utf-8 -*-
##########################################################################
# NSAp - Copyright (C) CEA, 2020
# Distributed under the terms of the CeCILL-B license, as published by
# the CEA-CNRS-INRIA. Refer to the LICENSE file or to
# http://www.cecill.info/licences/Licence_CeCILL-B_V1-en.html
# for details.
##########################################################################

"""
Module that provides functions to prepare the TCGA-LGG-tif dataset.
"""

# Imports
from collections import OrderedDict
import os
import logging
from collections import namedtuple
import numpy as np
import skimage.io as skio
import pandas as pd
from pynet.datasets import Fetchers
import progressbar
import csv
import glob

# Global parameters
Item = namedtuple("Item", ["input_path", "output_path",
                           "metadata_path", "height", "width"])
URL = "https://www.kaggle.com/mateuszbuda/lgg-mri-segmentation/download"
logger = logging.getLogger("pynet")
height, width = (256, 256)


[docs]def read_metadata(metadata_file): with open(metadata_file) as f: metadata_dict = csv.DictReader(f) metadata_dict = { row["Patient"].split("_")[-1]: row for row in metadata_dict} return metadata_dict
[docs]def get_slice_id(fp): return int( fp.replace("_mask.tif", "").replace(".tif", "").split("_")[-1])
[docs]def get_subjects_files(datadir): sdata = {} for fp in glob.glob( os.path.join(datadir, "*", "*", "*_mask.tif")): dirname = fp.split(os.sep)[-2] _, center, subject, serie = dirname.split("_") if subject not in sdata: sdata[subject] = {"center": center, "serie": serie, "masks": [], "images": []} sdata[subject]["masks"].append(fp) sdata[subject]["images"].append(fp.replace("_mask", "")) return sdata
[docs]@Fetchers.register def fetch_tcga_lgg_tif(datasetdir): """ Fetch/prepare the TCA-LGG-tif dataset for pynet. The patient average age was 47 with an almost even split between women and men (56 vs. 53, 1 unknown) in our dataset. Histologically, the tumors were divided between oligodendroglioma (47), astrocytoma (33), and oligoastrocytoma (29). Histology of one tumor was unknown. The data included grade II (51) and grade III (58) tumors with grade of one tumor unknown. Parameters ---------- datasetdir: str the dataset destination folder. Returns ------- item: namedtuple a named tuple containing 'input_path', 'output_path', 'metadata_path', 'height' and 'width'. """ logger.info("Loading TCA-LGG-tif dataset.") if not os.path.isdir(datasetdir): raise ValueError( "You must first download the kaggle dataset at {} and unzip it " "to {}.".format(URL, datasetdir)) metadata_path = os.path.join(datasetdir, "kaggle_3m", "data.csv") desc_path = os.path.join(datasetdir, "pynet_tgca-lgg-tif.tsv") input_path = os.path.join(datasetdir, "pynet_tgca-lgg-tif_inputs.npy") output_path = os.path.join(datasetdir, "pynet_tgca-lgg-tif_outputs.npy") if not os.path.isfile(desc_path): # parse datasetdir sdata = get_subjects_files(datasetdir) # parse genetics csv file smetadata = read_metadata(metadata_path) input_dataset = [] output_dataset = [] metadata = OrderedDict((key, []) for key in ( "participant_id", "slice_id", "center", "serie")) with progressbar.ProgressBar(max_value=len(sdata), redirect_stdout=True) as bar: for cnt, (subject, subject_data) in enumerate(sdata.items()): logger.debug("Processing {0}...".format(subject)) for impath in subject_data["images"]: # (height, width, (precontrast, flair, postcontrast)) # -> ((precontrast, flair, postcontrast), height, width) im = skio.imread(impath).transpose(2, 0, 1) input_dataset.append(im) # Get subject genetics metadata metadata["participant_id"].append(subject) metadata["slice_id"].append( get_slice_id(impath)) metadata["center"].append(subject_data["center"]) metadata["serie"].append(subject_data["serie"]) for meta_name, meta_value in smetadata[subject].items(): metadata.setdefault(meta_name, []).append(meta_value) for impath in subject_data["masks"]: im = skio.imread(impath)[np.newaxis, ...] im[im == 255] = 1 assert set(im.ravel().tolist()).issubset({ 0, 1}) output_dataset.append(im) bar.update(cnt) input_dataset = np.asarray(input_dataset) np.save(input_path, input_dataset) output_dataset = np.asarray(output_dataset) np.save(output_path, output_dataset) df = pd.DataFrame.from_dict(metadata) df.to_csv(desc_path, sep="\t", index=False) return Item(input_path=input_path, output_path=output_path, metadata_path=desc_path, height=height, width=width)

Follow us

© 2019, pynet developers .
Inspired by AZMIND template.