Skip to content

data_loading

Module for dataset loading utilities.

Functions:

load_dataset_from_path

load_dataset_from_path(data_path: str) -> DatasetDict

Load a dataset from a CSV file, a Parquet file, or a HuggingFace dataset directory.

Parameters:

  • data_path (str) –

    Path to the dataset: a CSV file, a Parquet file, or a HuggingFace dataset directory.

Returns:

  • DatasetDict

    A DatasetDict containing the loaded dataset

Source code in src/stimulus/data/interface/data_loading.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
def load_dataset_from_path(data_path: str) -> datasets.DatasetDict:
    """Read a dataset from disk, picking the loader based on what the path holds.

    A directory is handed to ``datasets.load_from_disk``. Anything else is
    first read with the ``"parquet"`` builder; if that raises
    ``pa.ArrowInvalid`` the same path is read again with the ``"csv"`` builder.

    Args:
        data_path: Location of the dataset (a CSV file, a parquet file, or a
            HuggingFace dataset directory).

    Returns:
        The dataset produced by whichever loader handled the path.
    """
    if not os.path.isdir(data_path):
        # Single file: the format is not inspected up front, so parquet is
        # attempted first and CSV is the fallback.
        try:
            logger.info(f"Attempting to load as parquet: {data_path}")
            return datasets.load_dataset("parquet", data_files=data_path)
        except pa.ArrowInvalid:
            logger.info("Data is not in parquet format, trying CSV")
            # Kept inside the handler so a CSV failure still chains to the
            # parquet error in the traceback.
            return datasets.load_dataset("csv", data_files=data_path)

    # Directory: treated as a HuggingFace dataset directory.
    logger.info(f"Loading dataset from directory: {data_path}")
    return datasets.load_from_disk(data_path)