Source code for qualia_core.dataset.EuroSAT

"""##  EuroSAT.py: A Qualia dataset for land use and land cover classification using Sentinel-2 satellite images.

## Author
# - **Jonathan Courtois**
#   [jonathan.courtois@univ-cotedazur.fr](mailto:jonathan.courtois@univ-cotedazur.fr)
## Dataset Reference
# - **EuroSat Dataset:**
#   https://github.com/phelber/EuroSAT
# - **Installation Instructions:**
#   The EuroSat dataset .zip files must be uncompressed in your 'dataset' folder of the Qualia repository with this structure:
#   .dataset/
#   ├── EuroSAT/
#   ├── ├── MS/
#   ├── ├── ├── [Class_folder]/
#   ├── ├── ├── ├── [*.tif]
#   ├── ├── RGB/
#   ├── ├── ├── [Class_folder]/
#   ├── ├── ├── ├── [*.jpg]
# - **Citation:**
# @article{helber2019eurosat,
#     title={Eurosat: A novel dataset and deep learning benchmark for land use and land cover classification},
#     author={Helber, Patrick and Bischke, Benjamin and Dengel, Andreas and Borth, Damian},
#     journal={IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing},
#     year={2019},
#     publisher={IEEE}
"""

from __future__ import annotations  # annotations: Enables using class names in type hints before they're defined

import json
import logging  # logging: For keeping track of what our dataset is doing
import sys
import time
from pathlib import Path  # Path: Makes file handling consistent across operating systems
from typing import Any

import numpy as np  # numpy: For efficient array operations on our data

from qualia_core import random  # Generator: For generating random numbers, useful for splitting data into training and test sets
from qualia_core.datamodel.RawDataModel import RawData, RawDataModel, RawDataSets
from qualia_core.dataset.RawDataset import RawDataset

if sys.version_info >= (3, 12):
    from typing import override
else:
    from typing_extensions import override

logger: logging.Logger = logging.getLogger(__name__)


[docs]
class EuroSAT(RawDataset):
    """EuroSAT Land Use and Land Cover Classification with Sentinel-2,.

    Challenge of land use and land cover classification using Sentinel-2 satellite images.
    The Sentinel-2 satellite images are openly and freely accessible provided in the Earth observation program Copernicus.
    Sentinel-2 satellite images covering 13 spectral bands.
    10 classes with in total 27,000 labeled and geo-referenced images.
    The paper proposed network that achieved an overall classification accuracy of 98.57%.
    The geo-referenced dataset EuroSAT is made publicly available here [https://github.com/phelber/EuroSAT].
    ---
    10 Classes - 27000 images
    1. Annual Crop              - 3000 images
    2. Forest                   - 3000 images
    3. Herbaceous Vegetation    - 3000 images
    4. Highway                  - 2500 images
    5. Industrial Buildings     - 2500 images
    6. Pasture                  - 2000 images
    7. Permanent Crop           - 2500 images
    8. Residential Buildings    - 3000 images
    9. River                    - 2500 images
    10. Sea and Lake            - 3000 images
    """

    def __init__(self,
                 path: str = '',
                 variant: str = 'MS',
                 dtype: str = 'float32',
                 train_test_ratio: float = 0.8) -> None:
        """Instantiate the EuroSAT dataset loader.

        :param path: Dataset source path
        :param variant: ``'MS'`` (Multi Spectral) or ``'RGB'``, Only MS inmplemented so far.
        :param dtype: Data type for the input vectors
        """
        super().__init__()  # Set up the basic RawDataset structure
        self.__path = Path(path)  # Convert string path to a proper Path object

        if variant == 'MS':
            self.__suffix = 'tif'
            self.__channels = 13
        elif variant == 'RGB':
            self.__suffix = 'jpg'
            self.__channels = 3
        else:
            logger.error("Unsupported variant '%s'. Use 'MS' or 'RGB'.", variant)
            raise ValueError

        self.__variant = variant    # Store which variant we want to use
        self.__dtype = dtype
        self.__train_test_ratio = train_test_ratio
        self.sets.remove('valid')   # Tell Qualia we won't use a validation set

    def _dataset_info(self) -> tuple[dict[str, int], dict[str, int]]:
        """Provide information about the dataset.

        This is like giving a brief overview of what our dataset contains:
        - How many classes (types of things) are there?
        - What are the names of these classes?
        - How many images are in each class?

        This helps us understand what we have before we start using it.
        """
        start = time.time()

        images_path = self.__path / self.__variant
        # get the number of folders, which is the number of classes and the name the names of the classes
        class_names: list[str] = sorted([d.name for d in images_path.iterdir() if d.is_dir()])
        class_idx = {name: idx for idx, name in enumerate(class_names)}

        # for each class, get the number of elements
        class_counts = dict.fromkeys(class_names, 0)
        for class_name in class_names:
            class_path = images_path / class_name
            if not class_path.is_dir():
                logger.warning('Skipping %s, not a directory', class_path)
                continue
            class_counts[class_name] = len(list(class_path.glob(f'*.{self.__suffix}')))
        logger.info('_dataset_info() Elapsed: %s s', time.time() - start)

        return class_counts, class_idx

    def _generate_test_train_split(self, class_counts: dict[str, int]) -> tuple[dict[str, np.ndarray[Any, np.dtype[np.int64]]],
                                                                                dict[str, np.ndarray[Any, np.dtype[np.int64]]]]:
        start = time.time()

        train_idx = {name: np.array([], dtype=np.int64) for name in class_counts}
        test_idx = {name: np.array([], dtype=np.int64) for name in class_counts}

        for class_name, count in class_counts.items():
            test_idx[class_name] = random.shared.generator.choice(
                np.arange(count) + 1,
                size=int(count * (1 - self.__train_test_ratio)),
                replace=False,
            ).tolist()
            train_idx[class_name] = np.setdiff1d(
                np.arange(count)+1,
                test_idx[class_name],
            ).tolist()
        logger.info('Generated test/train split: %s', class_counts)

        # Save the indices for later use
        with Path.open(self.__path / 'test_idx.json', 'w') as f:
            json.dump(test_idx, f, indent='  ')
        with Path.open(self.__path / 'train_idx.json', 'w') as f:
            json.dump(train_idx, f, indent='  ')

        logger.info('_generate_test_train_split() Elapsed: %s s', time.time() - start)
        return train_idx, test_idx

    def __load_data(self, *, class_idx: dict[str, int], set_idx: dict[str, np.ndarray[Any, np.dtype[np.int64]]]) -> RawData:
        """Load and preprocess data files.

        This is where we:
        1. Read our raw data files
        2. Format them how Qualia expects
        3. Make sure values are in the right range

        It's like taking ingredients and preparing them for cooking:
        - Reading the files is like getting ingredients from containers
        - Reshaping is like cutting them to the right size
        - Normalizing is like measuring out the right amounts
        """
        import imageio

        start = time.time()

        train_x_list: list[np.ndarray[Any, np.dtype[np.uint16]]] = []
        train_y_list: list[int] = []

        for class_name, indices in set_idx.items():
            class_path = self.__path / self.__variant / class_name
            if not class_path.is_dir():
                logger.warning('Skipping %s, not a directory', class_path)
                continue
            for idx in indices:
                filepath = class_path / f'{class_name}_{idx:d}.{self.__suffix}'
                if not filepath.is_file():
                    logger.warning('Skipping %s, not a file', filepath)
                    continue

                data = imageio.v3.imread(filepath)

                train_x_list.append(data)
                train_y_list.append(class_idx[class_name])  # Use the class index for labels

        # Convert lists to numpy arrays
        train_x_uint16 = np.array(train_x_list, dtype=np.uint16)

        train_x_uint16 = train_x_uint16.reshape((train_x_uint16.shape[0], 64, 64, self.__channels))

        train_x = train_x_uint16.astype(self.__dtype) # N, H, W, C
        train_y = np.array(train_y_list, dtype=np.int64)  # Convert labels to numpy array
        logger.info('__load_train() Elapsed: %s s', time.time() - start)
        return RawData(train_x, train_y)

    @override
    def __call__(self) -> RawDataModel:
        """Load and prepare the complete dataset.

        This is our main kitchen where we:
        1. Load all our data
        2. Organize it into training and test sets
        3. Package it in Qualia's preferred containers
        4. Add helpful information for debugging
        """
        logger.info('Loading EuroSAT dataset from %s', self.__path)

        class_counts, class_idx = self._dataset_info()

        if (self.__path/'test_idx.json').exists() and (self.__path/'train_idx.json').exists():
            logger.info('Test/train split already exists, loading from files.')
            with (self.__path/'train_idx.json').open() as f:
                train_idx = json.load(f)
            with (self.__path/'test_idx.json').open() as f:
                test_idx = json.load(f)
        else:
            train_idx, test_idx = self._generate_test_train_split(class_counts=class_counts)

        # Package everything in Qualia's containers
        return RawDataModel(
            sets=RawDataSets(
                train=self.__load_data(class_idx=class_idx, set_idx=train_idx),
                test=self.__load_data(class_idx=class_idx, set_idx=test_idx),
            ),
            name=self.name,
        )