Skip to content

data_config_parser

Module for parsing data configs.

Functions:

create_encoders

create_encoders(
    column_config: list[Columns],
) -> dict[str, AbstractEncoder]

Factory for creating encoders from config.

Source code in src/stimulus/data/interface/data_config_parser.py
37
38
39
40
41
42
43
44
45
46
def create_encoders(column_config: list[Columns]) -> dict[str, encoders_module.AbstractEncoder]:
    """Build one encoder per configured column.

    Only the first entry of each column's ``encoder`` sequence is used.

    Args:
        column_config: Column definitions. Each must expose ``column_name`` and a
            non-empty ``encoder`` sequence whose items carry ``name`` and ``params``.

    Returns:
        Mapping from column name to the object ``_instantiate_component`` returns
        for that column's first encoder entry.
    """
    encoders = {}
    for column in column_config:
        encoders[column.column_name] = _instantiate_component(
            module=encoders_module,
            name=column.encoder[0].name,
            params=column.encoder[0].params,
        )
    return encoders

create_splitter

create_splitter(split_config: Split) -> AbstractSplitter

Factory for creating splitters from config.

Source code in src/stimulus/data/interface/data_config_parser.py
72
73
74
75
76
77
78
def create_splitter(split_config: Split) -> splitters_module.AbstractSplitter:
    """Build the splitter described by a split configuration.

    Args:
        split_config: Split definition providing ``split_method`` (passed on as the
            component name) and ``params`` (passed on as the component parameters).

    Returns:
        Whatever ``_instantiate_component`` produces for ``splitters_module`` with
        that name and those parameters.
    """
    method_name = split_config.split_method
    method_params = split_config.params
    splitter = _instantiate_component(
        module=splitters_module,
        name=method_name,
        params=method_params,
    )
    return splitter

create_transforms

create_transforms(
    transform_config: list[Transform],
) -> dict[str, list[Any]]

Factory for creating transforms from config.

Parameters:

  • transform_config (list[Transform]) –

    List of Transform objects from the YAML config

Returns:

  • dict[str, list[Any]]

    Dictionary mapping column names to lists of instantiated transform objects

Source code in src/stimulus/data/interface/data_config_parser.py
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def create_transforms(transform_config: list[Transform]) -> dict[str, list[Any]]:
    """Instantiate the transformations configured for every column.

    Walks each transform, then each of its columns, and builds the list of
    objects returned by ``_instantiate_component`` for that column's
    ``transformations`` (in their configured order).

    Note:
        If the same column name occurs more than once across the given
        transforms, the list built last replaces any earlier one.

    Args:
        transform_config: List of Transform objects from the YAML config.

    Returns:
        Dictionary mapping column names to lists of instantiated transform objects.
    """
    return {
        column.column_name: [
            _instantiate_component(
                module=transforms_module,
                name=step.name,
                params=step.params,
            )
            for step in column.transformations
        ]
        for transform in transform_config
        for column in transform.columns
    }

dump_yaml_list_into_files

dump_yaml_list_into_files(
    yaml_list: list[SplitConfigDict],
    directory_path: str,
    base_name: str,
    len_simple_numeric: int = 5,
) -> None

Dumps YAML configurations to files with consistent, readable formatting.

Source code in src/stimulus/data/interface/data_config_parser.py
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
def dump_yaml_list_into_files(
    yaml_list: list[SplitConfigDict],
    directory_path: str,
    base_name: str,
    len_simple_numeric: int = 5,
) -> None:
    """Write each configuration to its own YAML file with readable formatting.

    Entry ``i`` of ``yaml_list`` is dumped with ``model_dump(exclude_none=True)``,
    passed through ``_clean_params`` and written to
    ``{directory_path}/{base_name}_{i}.yaml``.

    Args:
        yaml_list: Configurations to write, one file per entry.
        directory_path: Directory the files are written into.
        base_name: File name prefix placed before the ``_{i}.yaml`` suffix.
        len_simple_numeric: Longest list (in items) of ints/floats that is still
            written inline in flow style; every other list uses block style.
    """

    class ReadableDumper(yaml.SafeDumper):
        """SafeDumper variant that never emits anchors or aliases."""

        def ignore_aliases(self, _data: Any) -> bool:
            return True

    def _block_mapping(dumper: yaml.SafeDumper, mapping: dict) -> Any:
        """Always represent dictionaries in block style."""
        return dumper.represent_mapping("tag:yaml.org,2002:map", mapping.items(), flow_style=False)

    def _sequence(dumper: yaml.SafeDumper, items: list) -> Any:
        """Represent short all-numeric lists (e.g. split ratios) inline, others in block style."""
        inline = len(items) <= len_simple_numeric and all(isinstance(item, (int, float)) for item in items)
        return dumper.represent_sequence("tag:yaml.org,2002:seq", items, flow_style=inline)

    def _empty_null(dumper: yaml.SafeDumper, _value: None) -> Any:
        """Represent None as an empty scalar."""
        return dumper.represent_scalar("tag:yaml.org,2002:null", "")

    # Attach the custom representers to the local dumper class only.
    for python_type, representer in ((dict, _block_mapping), (list, _sequence), (type(None), _empty_null)):
        ReadableDumper.add_representer(python_type, representer)

    dump_options = {
        "Dumper": ReadableDumper,
        "default_flow_style": False,  # block style unless a representer says otherwise
        "sort_keys": False,
        "indent": 2,
        "width": 80,
        "explicit_start": False,
        "explicit_end": False,
    }

    for i, yaml_dict in enumerate(yaml_list):
        payload = _clean_params(yaml_dict.model_dump(exclude_none=True))
        with open(f"{directory_path}/{base_name}_{i}.yaml", "w") as handle:
            yaml.dump(payload, handle, **dump_options)

expand_transform_list_combinations

expand_transform_list_combinations(
    transform_list: list[Transform],
) -> list[Transform]

Expands a list of transforms into all possible parameter combinations.

Takes a list of transforms where each transform may contain parameter lists, and expands them into separate transforms with single parameter values. For example, if a transform has parameters [0.1, 0.2] and [1, 2], this will create two transforms: one with 0.1/1 and another with 0.2/2.

Parameters:

  • transform_list (list[Transform]) –

    A list of Transform objects containing parameter lists that need to be expanded into individual transforms.

Returns:

  • list[Transform]

    A flattened list of transforms where each transform has single parameter values instead of parameter lists. The length of the returned list will be the sum of the number of parameter combinations for each input transform.

Source code in src/stimulus/data/interface/data_config_parser.py
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
def expand_transform_list_combinations(
    transform_list: list[Transform],
) -> list[Transform]:
    """Expand every transform in a list and flatten the results.

    Each transform is handed to ``expand_transform_parameter_combinations`` and
    the resulting lists are concatenated in input order. Expansion is
    index-wise: a transform with parameters [0.1, 0.2] and [1, 2] yields two
    transforms, one with 0.1/1 and another with 0.2/2.

    Args:
        transform_list: Transforms whose parameters may be given as lists.

    Returns:
        A flat list containing, for each input transform in turn, all of its
        expanded variants. Its length is the sum of the variant counts of the
        input transforms.
    """
    return [
        expanded
        for transform in transform_list
        for expanded in expand_transform_parameter_combinations(transform)
    ]

expand_transform_parameter_combinations

expand_transform_parameter_combinations(
    transform: Transform,
) -> list[Transform]

Get all possible transforms by extracting parameters at each valid index.

For a transform with parameter lists, creates multiple new transforms, each containing single parameter values from the corresponding indices of the parameter lists.

Parameters:

  • transform (Transform) –

    The original transform containing parameter lists

Returns:

  • list[Transform]

    A list of transforms, each with single parameter values from sequential indices

Source code in src/stimulus/data/interface/data_config_parser.py
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
def expand_transform_parameter_combinations(
    transform: Transform,
) -> list[Transform]:
    """Get all possible transforms by extracting parameters at each valid index.

    For a transform with parameter lists, creates multiple new transforms, each
    containing single parameter values from the corresponding indices of the
    parameter lists. A transform without any parameter list longer than one
    element yields exactly one transform (index 0).

    Args:
        transform: The original transform containing parameter lists.

    Returns:
        A list of transforms, each with single parameter values from sequential
        indices.
    """
    # Only the first parameter list with more than one element is needed: all
    # such lists are expected to share one length (enforced by a pydantic
    # validator). ``next`` stops the whole scan at that first hit, across
    # columns as well as transformations.
    n_variants = next(
        (
            len(value)
            for column in transform.columns
            for transformation in column.transformations
            if transformation.params
            for value in transformation.params.values()
            if isinstance(value, list) and len(value) > 1
        ),
        1,
    )

    # One transform per index.
    return [extract_transform_parameters_at_index(transform, i) for i in range(n_variants)]

extract_transform_parameters_at_index

extract_transform_parameters_at_index(
    transform: Transform, index: int = 0
) -> Transform

Get a transform with parameters at the specified index.

Parameters:

  • transform (Transform) –

    The original transform containing parameter lists

  • index (int, default: 0 ) –

    Index to extract parameters from (default 0)

Returns:

  • Transform

    A new transform with single parameter values at the specified index

Source code in src/stimulus/data/interface/data_config_parser.py
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
def _parameter_at_index(value: object, index: int) -> object:
    """Return the value a parameter takes for variant ``index``."""
    if not isinstance(value, list):
        return value
    if len(value) == 1:
        # A single-element list applies to every variant; indexing it with
        # ``index`` would fail for any index above 0.
        return value[0]
    return value[index]


def extract_transform_parameters_at_index(
    transform: Transform,
    index: int = 0,
) -> Transform:
    """Get a transform with parameters at the specified index.

    List-valued parameters are replaced by their element at ``index``;
    single-element lists contribute their only element regardless of ``index``;
    non-list parameters are kept unchanged. Transformations whose ``params`` is
    empty or unset are left as they are.

    Args:
        transform: The original transform containing parameter lists.
        index: Index to extract parameters from (default 0).

    Returns:
        A new transform with single parameter values at the specified index.

    Raises:
        IndexError: If a parameter list is empty, or has more than one element
            and ``index`` is outside its range.
    """
    # Work on a copy rebuilt from the dumped data rather than on the input object.
    new_transform = Transform(**transform.model_dump())

    for column in new_transform.columns:
        for transformation in column.transformations:
            if not transformation.params:
                continue
            transformation.params = {
                param_name: _parameter_at_index(param_value, index)
                for param_name, param_value in transformation.params.items()
            }

    return new_transform

generate_split_configs

generate_split_configs(
    config: ConfigDict,
) -> list[SplitConfigDict]

Generates all possible split configurations from a YAML config.

Takes a YAML configuration that may contain parameter lists and splits, and generates all unique splits into separate data configurations.

For example, if the config has two transforms with parameters [0.1, 0.2] and [0.3, 0.4], and two splits [0.7, 0.3] and [0.8, 0.2], this will generate 2 configs, one for each split:

config_1:
    transform: [[0.1, 0.2], [0.3, 0.4]]
    split: [0.7, 0.3]

config_2:
    transform: [[0.1, 0.2], [0.3, 0.4]]
    split: [0.8, 0.2]

Parameters:

  • config (ConfigDict) –

    The source YAML configuration containing transforms with parameter lists and multiple splits.

Returns:

  • list[SplitConfigDict]

    A list of data configurations, one per split. Each config keeps the full, unexpanded list of transforms together with a single split configuration, so the length equals the number of splits.

Source code in src/stimulus/data/interface/data_config_parser.py
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
def generate_split_configs(config: ConfigDict) -> list[SplitConfigDict]:
    """Fan a multi-split configuration out into one configuration per split.

    Every returned configuration shares the source's ``global_params``,
    ``columns`` and (unexpanded) ``transforms`` and carries exactly one of the
    source's splits.

    For example, if the config has:
    - Two transforms with parameters [0.1, 0.2], [0.3, 0.4]
    - Two splits [0.7, 0.3] and [0.8, 0.2]
    This will generate 2 configs, one for each split.
        config_1:
            transform: [[0.1, 0.2], [0.3, 0.4]]
            split: [0.7, 0.3]

        config_2:
            transform: [[0.1, 0.2], [0.3, 0.4]]
            split: [0.8, 0.2]

    Args:
        config: The source configuration containing transforms and multiple splits.

    Returns:
        One SplitConfigDict per entry of ``config.split``, in the same order.

    Raises:
        TypeError: If ``config`` is a ``dict`` that is not a ConfigDict. Objects
            that are not dicts at all are not rejected by this check.
    """
    is_foreign_dict = isinstance(config, dict) and not isinstance(config, ConfigDict)
    if is_foreign_dict:
        raise TypeError("Input must be a ConfigDict object")

    return [
        SplitConfigDict(
            global_params=config.global_params,
            columns=config.columns,
            transforms=config.transforms,
            split=split,
        )
        for split in config.split
    ]

generate_split_transform_configs

generate_split_transform_configs(
    config: SplitConfigDict,
) -> list[SplitTransformDict]

Generates all the transform configuration for a given split.

Takes a YAML configuration that may contain a transform or a list of transforms, and generates a separate data configuration for each unique transform of the split.

For example, if the config has two transforms with parameters [0.1, 0.2] and [0.3, 0.4], and a split [0.7, 0.3], this will generate 2 configs, one for each transform:

transform_config_1:
    transform: [0.1, 0.2]
    split: [0.7, 0.3]

transform_config_2:
    transform: [0.3, 0.4]
    split: [0.7, 0.3]

Parameters:

  • config (SplitConfigDict) –

    The source YAML configuration containing a single split and transforms with parameter lists.

Returns:

  • list[SplitTransformDict]

    A list of data configurations, where each config has a single expanded transform and the split configuration of the input. The length equals the total number of expanded transforms.

Source code in src/stimulus/data/interface/data_config_parser.py
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
def generate_split_transform_configs(
    config: SplitConfigDict,
) -> list[SplitTransformDict]:
    """Generate one configuration per expanded transform of a single-split config.

    The config's transforms are expanded with
    ``expand_transform_list_combinations``; each resulting transform is paired
    with the unchanged ``global_params``, ``columns`` and ``split`` of the input.

    For example, if the config has:
    - Two transforms with parameters [0.1, 0.2], [0.3, 0.4]
    - A split [0.7, 0.3]
    This will generate 2 configs, one for each transform.
        transform_config_1:
            transform: [0.1, 0.2]
            split: [0.7, 0.3]

        transform_config_2:
            transform: [0.3, 0.4]
            split: [0.7, 0.3]

    Args:
        config: A configuration holding one split and transforms whose
            parameters may be given as lists.

    Returns:
        One SplitTransformDict per expanded transform, in expansion order.

    Raises:
        TypeError: If ``config`` is a ``dict`` that is not a SplitConfigDict.
            Objects that are not dicts at all are not rejected by this check.
    """
    is_foreign_dict = isinstance(config, dict) and not isinstance(config, SplitConfigDict)
    if is_foreign_dict:
        # NOTE(review): the message says "a list of", but the parameter is a
        # single SplitConfigDict; wording kept as-is for compatibility.
        raise TypeError("Input must be a list of SplitConfigDict")

    return [
        SplitTransformDict(
            global_params=config.global_params,
            columns=config.columns,
            transforms=transform,
            split=config.split,
        )
        for transform in expand_transform_list_combinations(config.transforms)
    ]

parse_split_transform_config

parse_split_transform_config(
    config: SplitTransformDict,
) -> tuple[
    dict[str, AbstractEncoder],
    list[str],
    list[str],
    list[str],
]

Parse the configuration and return a tuple holding the encoders and the names of the input, label and meta columns.

Parameters:

Returns:

Source code in src/stimulus/data/interface/data_config_parser.py
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
def parse_split_transform_config(
    config: SplitTransformDict,
) -> tuple[
    dict[str, encoders_module.AbstractEncoder],
    list[str],
    list[str],
    list[str],
]:
    """Extract the encoders and the column names grouped by column type.

    Args:
        config: The configuration to parse; its ``columns`` are read.

    Returns:
        A 4-tuple ``(encoders, input_columns, label_columns, meta_columns)``:
        the mapping produced by ``create_encoders`` followed by the names of the
        columns whose ``column_type`` is ``"input"``, ``"label"`` and ``"meta"``
        respectively, each in configuration order.
    """

    def names_of(kind: str) -> list[str]:
        """Names of the configured columns whose type equals ``kind``."""
        return [column.column_name for column in config.columns if column.column_type == kind]

    encoders = create_encoders(config.columns)
    return encoders, names_of("input"), names_of("label"), names_of("meta")