Source code for torchio.datasets.rsna_miccai
import csv
import warnings
from collections.abc import Sequence
from pathlib import Path
from ..data import ScalarImage
from ..data import Subject
from ..data import SubjectsDataset
from ..types import TypePath
class RSNAMICCAI(SubjectsDataset):
"""RSNA-MICCAI Brain Tumor Radiogenomic Classification challenge dataset.
This is a helper class for the dataset used in the
`RSNA-MICCAI Brain Tumor Radiogenomic Classification challenge`_ hosted on
`kaggle <https://www.kaggle.com/>`_. The dataset must be downloaded before
instantiating this class (as opposed to, e.g., :class:`torchio.datasets.IXI`).
This `kaggle kernel <https://www.kaggle.com/fepegar/preprocessing-mri-with-torchio/>`_
includes a usage example including preprocessing of all the scans.
If you reference or use the dataset in any form, include the following
citation:
U.Baid, et al., "The RSNA-ASNR-MICCAI BraTS 2021 Benchmark on Brain Tumor
Segmentation and Radiogenomic Classification", arXiv:2107.02314, 2021.
Args:
root_dir: Directory containing the dataset (``train`` directory,
``test`` directory, etc.).
train: If ``True``, the ``train`` set will be used. Otherwise the
``test`` set will be used.
ignore_empty: If ``True``, the three subjects flagged as "presenting
issues" (empty images) by the challenge organizers will be ignored.
The subject IDs are ``00109``, ``00123`` and ``00709``.
Example:
>>> import torchio as tio
>>> from subprocess import call
>>> call('kaggle competitions download -c rsna-miccai-brain-tumor-radiogenomic-classification'.split())
>>> root_dir = 'rsna-miccai-brain-tumor-radiogenomic-classification'
>>> train_set = tio.datasets.RSNAMICCAI(root_dir, train=True)
>>> test_set = tio.datasets.RSNAMICCAI(root_dir, train=False)
>>> len(train_set), len(test_set)
(582, 87)
.. _RSNA-MICCAI Brain Tumor Radiogenomic Classification challenge: https://www.kaggle.com/c/rsna-miccai-brain-tumor-radiogenomic-classification
"""

    id_key = 'BraTS21ID'
    label_key = 'MGMT_value'
    bad_subjects = '00109', '00123', '00709'

    def __init__(
        self,
        root_dir: TypePath,
        train: bool = True,
        ignore_empty: bool = True,
        modalities: Sequence[str] = ('T1w', 'T1wCE', 'T2w', 'FLAIR'),
        **kwargs,
    ):
        self.root_dir = Path(root_dir).expanduser().resolve()
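        # A single modality passed as a string is wrapped in a list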
        if isinstance(modalities, str):
            modalities = [modalities]
        self.modalities = modalities
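        # Build one Subject per case and let SubjectsDataset handle indexing,
        # loading and any keyword arguments (e.g. a transform)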
        subjects = self._get_subjects(self.root_dir, train, ignore_empty)
        super().__init__(subjects, **kwargs)
        self.train = train

    def _get_subjects(
        self,
        root_dir: Path,
        train: bool,
        ignore_empty: bool,
    ) -> list[Subject]:
        subjects = []
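        # For the training split, read the MGMT methylation labels from
        # train_labels.csv; fall back to an empty mapping if the file is missing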
        if train:
            csv_path = root_dir / 'train_labels.csv'
            try:
                with open(csv_path) as csvfile:
                    reader = csv.DictReader(csvfile)
                    labels_dict = {
                        row[self.id_key]: int(row[self.label_key]) for row in reader
                    }
            except FileNotFoundError:
                warnings.warn(
                    'Labels CSV not found. Ignoring MGMT labels',
                    stacklevel=2,
                )
                labels_dict = {}
            subjects_dir = root_dir / 'train'
        else:
            subjects_dir = root_dir / 'test'
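        # Each numeric subdirectory corresponds to one subject; known-empty
        # subjects and entries whose names are not numeric IDs are skipped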
        for subject_dir in sorted(subjects_dir.iterdir()):
            subject_id = subject_dir.name
            if ignore_empty and subject_id in self.bad_subjects:
                continue
            try:
                int(subject_id)
            except ValueError:
                continue
            images_dict: dict[str, str | int | ScalarImage]
            images_dict = {self.id_key: subject_dir.name}
            if train and labels_dict:
                images_dict[self.label_key] = labels_dict[subject_id]
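            # Each modality folder holds a DICOM series; a single file is passed
            # directly, otherwise the whole directory is read as one image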
            for modality in self.modalities:
                image_dir = subject_dir / modality
                filepaths = list(image_dir.iterdir())
                num_files = len(filepaths)
                path = filepaths[0] if num_files == 1 else image_dir
                images_dict[modality] = ScalarImage(path)
            subject = Subject(images_dict)
            subjects.append(subject)
        return subjects
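

# A minimal usage sketch, illustration only. It assumes the Kaggle competition
# data has already been downloaded and extracted into ``root_dir``, as in the
# class docstring example:
#
#     import torchio as tio
#     dataset = tio.datasets.RSNAMICCAI(root_dir, train=True)
#     subject = dataset[0]              # a torchio.Subject
#     print(subject['BraTS21ID'])       # subject ID string
#     print(subject.get('MGMT_value'))  # MGMT label (train set only)
#     print(subject['FLAIR'])           # ScalarImage; data is loaded lazily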