Skip to content

shuffle_csv

CLI module for shuffling CSV data files.

Functions:

load_data_config_from_path

load_data_config_from_path(
    data_path: str, data_config_path: str
) -> DatasetProcessor

Load the data config from a path.

Parameters:

  • data_path (str) –

    Path to the input CSV file passed to the DatasetProcessor.

  • data_config_path (str) –

    Path to the data config file.

Returns:

  • A tuple of the DatasetProcessor built from the config and the list of label column names.

Source code in src/stimulus/cli/shuffle_csv.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
def load_data_config_from_path(
    data_path: str,
    data_config_path: str,
) -> tuple[data_handlers.DatasetProcessor, list[str]]:
    """Build a dataset processor and collect the label column names from a config.

    Args:
        data_path: Path to the CSV file handed to the ``DatasetProcessor``.
        data_config_path: Path to the YAML data config file.

    Returns:
        A ``(processor, label_columns)`` tuple: the ``DatasetProcessor``
        configured with the transforms, split columns and splitter described
        by the config, and the names of the config columns whose
        ``column_type`` is ``"label"``.
    """
    # Decode the config explicitly as UTF-8 so parsing does not depend on the
    # platform's locale encoding.
    with open(data_config_path, encoding="utf-8") as file:
        data_config_dict = yaml.safe_load(file)

    # The file handle is no longer needed once the YAML has been parsed.
    data_config_obj = data_config_parser.SplitConfigDict(**data_config_dict)

    splitters = data_config_parser.create_splitter(data_config_obj.split)
    transforms = data_config_parser.create_transforms(data_config_obj.transforms)
    split_columns = data_config_obj.split.split_input_columns
    label_columns = [column.column_name for column in data_config_obj.columns if column.column_type == "label"]

    processor = data_handlers.DatasetProcessor(
        csv_path=data_path,
        transforms=transforms,
        split_columns=split_columns,
        splitter=splitters,
    )
    return processor, label_columns

shuffle_csv

shuffle_csv(
    data_csv: str, config_yaml: str, out_path: str
) -> None

Shuffle the label columns of the input CSV with a fixed default seed and save the result.

Parameters:

  • data_csv (str) –

    Path to input CSV file.

  • config_yaml (str) –

    Path to config YAML file.

  • out_path (str) –

    Path to output shuffled CSV.

Source code in src/stimulus/cli/shuffle_csv.py
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def shuffle_csv(data_csv: str, config_yaml: str, out_path: str) -> None:
    """Shuffle the label columns of a CSV file and write the result to disk.

    Args:
        data_csv: Path to input CSV file.
        config_yaml: Path to config YAML file.
        out_path: Path to output shuffled CSV.
    """
    # Fixed seed used for the label shuffle.
    # TODO: get the seed from the config if and when that is going to be set there
    default_seed = 42

    # Build the dataset processor and the list of label columns from the inputs.
    dataset, labels = load_data_config_from_path(data_csv, config_yaml)
    logger.info("Dataset processor initialized successfully.")

    # Shuffle only the label columns.
    dataset.shuffle_labels(labels, seed=default_seed)
    logger.info("Data shuffled successfully.")

    # Persist the modified data to the requested output path.
    dataset.save(out_path)
    logger.info("Shuffled data saved successfully.")