Skip to content

data_config_parser

Module for parsing data configs.

Functions:

create_encoders

create_encoders(
    column_config: list[Columns],
) -> dict[str, AbstractEncoder]

Factory for creating encoders from config.

Source code in src/stimulus/data/interface/data_config_parser.py
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
def create_encoders(column_config: list[Columns]) -> dict[str, encoders_module.AbstractEncoder]:
    """Factory for creating encoders from config.

    For every column, the first entry of ``column.encoder`` is handed to
    ``_instantiate_component`` together with the encoders module. The ``dtype``
    entry of the encoder params is a string in the config and is replaced by
    the attribute of that name on ``numpy`` before instantiation.

    Args:
        column_config: Column definitions. Each column must carry at least one
            encoder, and that encoder's params must contain a ``dtype`` string.

    Returns:
        Dictionary mapping each column name to the component built for it.

    Raises:
        ValueError: If the ``dtype`` string does not name an attribute of numpy.
        KeyError: If the encoder params contain no ``dtype`` entry.
    """

    def get_params(params: dict) -> dict:
        """Get the params with the dtype string converted to numpy dtype."""
        # Work on a deep copy so the config object is never mutated.
        params_new = copy.deepcopy(params)
        dtype_str = params["dtype"]
        # Keep the try body to the numpy lookup only: the handler formats
        # dtype_str, so it must already be bound when the handler runs, and
        # unrelated AttributeErrors must not be reported as a bad dtype.
        try:
            params_new["dtype"] = getattr(np, dtype_str)
        except AttributeError as e:
            raise ValueError(f"Invalid dtype {dtype_str} in encoder params") from e
        return params_new

    return {
        column.column_name: _instantiate_component(
            module=encoders_module,
            name=column.encoder[0].name,
            params=get_params(column.encoder[0].params),
        )
        for column in column_config
    }

create_splitter

create_splitter(split_config: Split) -> AbstractSplitter

Factory for creating splitters from config.

Source code in src/stimulus/data/interface/data_config_parser.py
180
181
182
183
184
185
186
def create_splitter(split_config: Split) -> splitters_module.AbstractSplitter:
    """Build the splitter described by a split configuration.

    Args:
        split_config: Split definition providing ``split_method`` and ``params``.

    Returns:
        The object returned by ``_instantiate_component`` for the requested
        method name, resolved against the splitters module.
    """
    method_name = split_config.split_method
    method_params = split_config.params
    splitter = _instantiate_component(
        module=splitters_module,
        name=method_name,
        params=method_params,
    )
    return splitter

create_transforms

create_transforms(
    transform_config: list[Transform],
) -> dict[str, list[Any]]

Factory for creating transforms from config.

Parameters:

  • transform_config (list[Transform]) –

    List of Transform objects from the YAML config

Returns:

  • dict[str, list[Any]]

    Dictionary mapping column names to lists of instantiated transform objects

Source code in src/stimulus/data/interface/data_config_parser.py
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
def create_transforms(transform_config: list[Transform]) -> dict[str, list[Any]]:
    """Instantiate the transformations configured for each column.

    Args:
        transform_config: Transform objects whose columns list the
            transformations to build.

    Returns:
        Mapping from column name to the list of components built for that
        column, in the order the transformations are listed. When a column name
        occurs more than once, the entry built last replaces earlier ones.
    """
    instantiated: dict[str, list[Any]] = {}
    # Flatten "every column of every transform" into one lazy stream; the
    # visiting order is the same as a nested for-loop.
    every_column = (column for transform in transform_config for column in transform.columns)
    for column in every_column:
        pipeline = []
        for step in column.transformations:
            component = _instantiate_component(
                module=transforms_module,
                name=step.name,
                params=step.params,
            )
            pipeline.append(component)
        instantiated[column.column_name] = pipeline
    return instantiated

expand_transform_list_combinations

expand_transform_list_combinations(
    transform_list: list[Transform],
) -> list[Transform]

Expands a list of transforms into all possible parameter combinations.

Takes a list of transforms where each transform may contain parameter lists, and expands them into separate transforms with single parameter values. Parameter lists are paired by index rather than crossed with each other: for example, if a transform has parameters [0.1, 0.2] and [1, 2], this will create two transforms: one with 0.1/1 and another with 0.2/2.

Parameters:

  • transform_list (list[Transform]) –

    A list of YamlTransform objects containing parameter lists that need to be expanded into individual transforms.

Returns:

  • list[Transform]

    list[YamlTransform]: A flattened list of transforms where each transform has single parameter values instead of parameter lists. The length of the returned list will be the sum of the number of parameter combinations for each input transform.

Source code in src/stimulus/data/interface/data_config_parser.py
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
def expand_transform_list_combinations(
    transform_list: list[Transform],
) -> list[Transform]:
    """Expand every transform in a list and concatenate the results.

    Each transform is passed to ``expand_transform_parameter_combinations``,
    which turns a transform holding parameter lists into several transforms
    holding single values. Values are paired by index: parameters
    [0.1, 0.2] and [1, 2] yield two transforms, 0.1/1 and 0.2/2.

    Args:
        transform_list: Transforms that may contain parameter lists.

    Returns:
        One flat list holding the expansions of all input transforms, in input
        order. Its length is the sum of the expansion counts of the inputs.
    """
    return [
        expanded
        for transform in transform_list
        for expanded in expand_transform_parameter_combinations(transform)
    ]

expand_transform_parameter_combinations

expand_transform_parameter_combinations(
    transform: Transform,
) -> list[Transform]

Get all possible transforms by extracting parameters at each valid index.

For a transform with parameter lists, creates multiple new transforms, each containing single parameter values from the corresponding indices of the parameter lists.

Parameters:

  • transform (Transform) –

    The original transform containing parameter lists

Returns:

  • list[Transform]

    A list of transforms, each with single parameter values from sequential indices

Source code in src/stimulus/data/interface/data_config_parser.py
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
def expand_transform_parameter_combinations(
    transform: Transform,
) -> list[Transform]:
    """Get all possible transforms by extracting parameters at each valid index.

    For a transform with parameter lists, creates multiple new transforms, each
    containing single parameter values taken from the same index of every
    parameter list. Lists with fewer than two elements do not count towards the
    number of transforms produced.

    Args:
        transform: The original transform containing parameter lists

    Returns:
        A list of transforms, one per index of the first multi-element parameter
        list found. A single transform (index 0) when no such list exists.
    """
    # Lengths of all multi-element parameter lists, produced lazily in
    # column -> transformation -> parameter order.
    multi_value_lengths = (
        len(value)
        for column in transform.columns
        for transformation in column.transformations
        if transformation.params
        for value in transformation.params.values()
        if isinstance(value, list) and len(value) > 1
    )
    # Only the first length is needed: the lists are expected to share one
    # length (enforced by a pydantic validator on the model, not shown here).
    # next() ends the search across all columns at once, where a `break` inside
    # the nested loops would only leave the innermost loop.
    variant_count = next(multi_value_lengths, 1)

    # Generate a transform for each index
    return [extract_transform_parameters_at_index(transform, i) for i in range(variant_count)]

extract_transform_parameters_at_index

extract_transform_parameters_at_index(
    transform: Transform, index: int = 0
) -> Transform

Get a transform with parameters at the specified index.

Parameters:

  • transform (Transform) –

    The original transform containing parameter lists

  • index (int, default: 0 ) –

    Index to extract parameters from (default 0)

Returns:

  • Transform

    A new transform with single parameter values at the specified index

Source code in src/stimulus/data/interface/data_config_parser.py
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
def extract_transform_parameters_at_index(
    transform: Transform,
    index: int = 0,
) -> Transform:
    """Get a transform with parameters at the specified index.

    The input transform is not modified: a new ``Transform`` is rebuilt from
    ``transform.model_dump()`` and only the copy's params are rewritten.

    Args:
        transform: The original transform containing parameter lists
        index: Index to extract parameters from (default 0)

    Returns:
        A new transform where every list-valued parameter is replaced by a
        single value. Lists with more than one element contribute the element
        at ``index``. Single-element lists contribute their only element at
        every index. Non-list values are kept as they are.

    Raises:
        IndexError: If ``index`` is out of range for a multi-element list, or
            if a parameter list is empty.
    """
    # Create a copy of the transform
    new_transform = Transform(**transform.model_dump())

    # Process each column and transformation
    for column in new_transform.columns:
        for transformation in column.transformations:
            if not transformation.params:
                continue
            # Convert each parameter list to single value at index
            new_params = {}
            for param_name, param_value in transformation.params.items():
                if not isinstance(param_value, list):
                    new_params[param_name] = param_value
                elif len(param_value) == 1:
                    # Single-element lists are ignored when the number of
                    # variants is counted, so they can sit next to longer
                    # lists; reuse their only element instead of indexing
                    # past the end.
                    new_params[param_name] = param_value[0]
                else:
                    new_params[param_name] = param_value[index]
            transformation.params = new_params

    return new_transform

generate_encoding_config

generate_encoding_config(
    config: ConfigDict,
) -> EncodingConfigDict

Generate encoding-only configuration from a master config.

Parameters:

  • config (ConfigDict) –

    The master configuration containing all components.

Returns:

  • EncodingConfigDict

    EncodingConfigDict containing only global_params and columns.

Source code in src/stimulus/data/interface/data_config_parser.py
371
372
373
374
375
376
377
378
379
380
381
382
383
def generate_encoding_config(config: ConfigDict) -> EncodingConfigDict:
    """Extract the encoding-related part of a master config.

    Args:
        config: Master configuration holding all components.

    Returns:
        An EncodingConfigDict built from the master config's ``global_params``
        and ``columns``; nothing else is carried over.
    """
    shared_params = config.global_params
    column_definitions = config.columns
    return EncodingConfigDict(global_params=shared_params, columns=column_definitions)

generate_individual_split_configs

generate_individual_split_configs(
    config: ConfigDict,
) -> list[IndividualSplitConfigDict]

Generate individual split configurations from a master config.

Parameters:

  • config (ConfigDict) –

    The master configuration containing multiple splits.

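Returns:

  • list[IndividualSplitConfigDict]

    List of IndividualSplitConfigDict, one for each split in the master config.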

Source code in src/stimulus/data/interface/data_config_parser.py
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
def generate_individual_split_configs(config: ConfigDict) -> list[IndividualSplitConfigDict]:
    """Produce one single-split config per split of a master config.

    Args:
        config: Master configuration holding one or more splits.

    Returns:
        IndividualSplitConfigDict objects in the order of ``config.split``, each
        pairing the master ``global_params`` with exactly one split.
    """
    per_split_configs = []
    for single_split in config.split:
        per_split_configs.append(
            IndividualSplitConfigDict(
                global_params=config.global_params,
                split=single_split,
            ),
        )
    return per_split_configs

generate_individual_transform_configs

generate_individual_transform_configs(
    config: ConfigDict,
) -> list[IndividualTransformConfigDict]

Generate individual transform configurations from a master config.

Expands parameter lists within transforms to create separate configs for each parameter combination.

Parameters:

  • config (ConfigDict) –

    The master configuration containing transforms with parameter lists.

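Returns:

  • list[IndividualTransformConfigDict]

    List of IndividualTransformConfigDict, one for each expanded transform.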

Source code in src/stimulus/data/interface/data_config_parser.py
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
def generate_individual_transform_configs(config: ConfigDict) -> list[IndividualTransformConfigDict]:
    """Produce one single-transform config per expanded transform.

    The master config's transforms are first expanded with
    ``expand_transform_list_combinations`` so that parameter lists become
    separate transforms; each expanded transform then gets its own config.

    Args:
        config: Master configuration whose transforms may hold parameter lists.

    Returns:
        IndividualTransformConfigDict objects, one per expanded transform, each
        carrying the master ``global_params``.
    """
    individual_configs = []
    for single_transform in expand_transform_list_combinations(config.transforms):
        individual_configs.append(
            IndividualTransformConfigDict(
                global_params=config.global_params,
                transforms=single_transform,
            ),
        )
    return individual_configs

parse_encoding_config

parse_encoding_config(
    config: EncodingConfigDict,
) -> tuple[
    dict[str, AbstractEncoder],
    list[str],
    list[str],
    list[str],
]

Parse encoding-only configuration.

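Parameters:

  • config (EncodingConfigDict) –

    The encoding configuration to parse.

Returns:

  • tuple[dict[str, AbstractEncoder], list[str], list[str], list[str]]

    A tuple of encoders and column lists.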

Source code in src/stimulus/data/interface/data_config_parser.py
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
def parse_encoding_config(
    config: EncodingConfigDict,
) -> tuple[
    dict[str, encoders_module.AbstractEncoder],
    list[str],
    list[str],
    list[str],
]:
    """Build encoders and group column names by column type.

    Args:
        config: Encoding configuration providing ``columns``.

    Returns:
        A 4-tuple: the encoders created from the columns, followed by the names
        of the columns whose ``column_type`` is "input", "label" and "meta",
        in that order.
    """
    encoders = create_encoders(config.columns)

    def names_with_type(wanted: str) -> list[str]:
        """Collect the names of columns whose type equals ``wanted``."""
        return [column.column_name for column in config.columns if column.column_type == wanted]

    return encoders, names_with_type("input"), names_with_type("label"), names_with_type("meta")

parse_individual_split_config

parse_individual_split_config(
    config: IndividualSplitConfigDict,
) -> tuple[AbstractSplitter, list[str]]

Parse individual split configuration.

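Parameters:

  • config (IndividualSplitConfigDict) –

    The individual split configuration to parse.

Returns:

  • tuple[AbstractSplitter, list[str]]

    A tuple containing the splitter instance and split input columns.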

Source code in src/stimulus/data/interface/data_config_parser.py
249
250
251
252
253
254
255
256
257
258
259
260
def parse_individual_split_config(
    config: IndividualSplitConfigDict,
) -> tuple[splitters_module.AbstractSplitter, list[str]]:
    """Turn a single-split config into a splitter and its input columns.

    Args:
        config: Individual split configuration providing ``split``.

    Returns:
        A pair: the splitter built by ``create_splitter`` from ``config.split``,
        and that split's ``split_input_columns``.
    """
    splitter = create_splitter(config.split)
    input_columns = config.split.split_input_columns
    return splitter, input_columns

parse_individual_transform_config

parse_individual_transform_config(
    config: IndividualTransformConfigDict,
) -> dict[str, list[Any]]

Parse individual transform configuration.

Parameters:

Returns:

  • dict[str, list[Any]]

    Dictionary mapping column names to lists of transform objects.

Source code in src/stimulus/data/interface/data_config_parser.py
237
238
239
240
241
242
243
244
245
246
def parse_individual_transform_config(config: IndividualTransformConfigDict) -> dict[str, list[Any]]:
    """Instantiate the transforms of a single-transform config.

    Args:
        config: Individual transform configuration providing ``transforms``.

    Returns:
        The mapping from column name to transform objects produced by
        ``create_transforms`` for this one transform.
    """
    # create_transforms expects a list, so wrap the single transform.
    single_transform_list = [config.transforms]
    return create_transforms(single_transform_list)

parse_split_transform_config

parse_split_transform_config(
    config: SplitTransformDict,
) -> tuple[
    dict[str, AbstractEncoder],
    list[str],
    list[str],
    list[str],
]

Parse the configuration and return a tuple of the encoders and the input, label and meta column names.

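Parameters:

  • config (SplitTransformDict) –

    The configuration to parse.

Returns:

  • tuple[dict[str, AbstractEncoder], list[str], list[str], list[str]]

    A tuple of the encoders and the input, label and meta column names.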

Source code in src/stimulus/data/interface/data_config_parser.py
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
def parse_split_transform_config(
    config: SplitTransformDict,
) -> tuple[
    dict[str, encoders_module.AbstractEncoder],
    list[str],
    list[str],
    list[str],
]:
    """Parse a split/transform configuration into encoders and column groups.

    Args:
        config: Configuration providing ``columns``.

    Returns:
        A 4-tuple (not a dictionary): the encoders created from the columns,
        then the names of the "input", "label" and "meta" columns, in that
        order.
    """
    encoders = create_encoders(config.columns)
    # One name list per column type, produced in the order the tuple expects.
    input_columns, label_columns, meta_columns = (
        [column.column_name for column in config.columns if column.column_type == wanted_type]
        for wanted_type in ("input", "label", "meta")
    )
    return encoders, input_columns, label_columns, meta_columns

split_config_into_components

split_config_into_components(
    config: ConfigDict,
    output_dir: str,
    base_name: str = "config",
) -> None

Split a master config into separate component configs and save them.

Parameters:

  • config (ConfigDict) –

    The master configuration to split.

  • output_dir (str) –

    Directory to save the component configs.

  • base_name (str, default: 'config' ) –

    Base name for generated files (default: "config").

Source code in src/stimulus/data/interface/data_config_parser.py
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
def split_config_into_components(config: ConfigDict, output_dir: str, base_name: str = "config") -> None:
    """Write a master config out as separate encoding, split and transform files.

    All component configs are generated before the first file is written. Each
    one is dumped with ``model_dump(exclude_none=True)``, passed through
    ``_clean_params`` and saved with ``_save_single_yaml``.

    Args:
        config: The master configuration to split.
        output_dir: Directory the files are written to; joined to each file
            name with "/".
        base_name: Prefix for generated file names (default: "config").
    """

    def write_component(payload, filename):
        """Save one cleaned config payload under the output directory."""
        _save_single_yaml(payload, f"{output_dir}/{filename}")

    # Build every component up front.
    encoding_payload = _clean_params(generate_encoding_config(config).model_dump(exclude_none=True))
    per_split = generate_individual_split_configs(config)
    per_transform = generate_individual_transform_configs(config)

    # Encoding config: a single file with a fixed suffix.
    write_component(encoding_payload, f"{base_name}_encode.yaml")

    # Split configs: the file name is derived from the split itself.
    for single_split_config in per_split:
        payload = _clean_params(single_split_config.model_dump(exclude_none=True))
        write_component(payload, _generate_split_filename(base_name, single_split_config.split))

    # Transform configs: the file name is derived from the transform itself.
    for single_transform_config in per_transform:
        payload = _clean_params(single_transform_config.model_dump(exclude_none=True))
        write_component(payload, _generate_transform_filename(base_name, single_transform_config.transforms))