Skip to content

split_yaml

CLI module for splitting YAML configuration files into component configs.

This module provides functionality to split a single YAML configuration file into separate component files: encoding config, individual split configs, and individual transform configs. The resulting YAML files can be used independently.

Functions:

  • split_yaml

    Split a YAML config file into separate component configs.

split_yaml

split_yaml(config_yaml: str, out_dir_path: str) -> None

Split a YAML config file into separate component configs.

Takes a master YAML configuration and splits it into:

  • encode.yaml: Contains encoding configuration (global_params + columns)

  • split1.yaml, split2.yaml, etc.: Individual split configurations

  • transform1.yaml, transform2.yaml, etc.: Individual transform configurations with parameter expansion

Parameters:

  • config_yaml (str) –

    Path to the master YAML configuration file.

  • out_dir_path (str) –

    Output directory to save the component config files.

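Raises:

  • FileNotFoundError –

    If the config file doesn't exist.

  • ValueError –

    If the YAML config is invalid or malformed.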

Source code in src/stimulus/cli/split_yaml.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
def _count_transform_configs(transforms: Any) -> int:
    """Estimate how many transform configs the given transforms expand into.

    Only used to build the summary log lines emitted by ``split_yaml``; it has
    no influence on which files are written.

    NOTE(review): assumes parameter expansion emits one config per index of
    the list-valued parameters (all such lists sharing one length) and a
    single config when a transform has no multi-valued parameter -- confirm
    against ``split_config_into_components``.

    Args:
        transforms: Iterable of transform objects exposing
            ``columns[*].transformations[*].params``; may be empty or None.

    Returns:
        The estimated number of transform config files.
    """
    total = 0
    for transform in transforms or []:
        # A transform with nothing to expand still yields one config.
        expansion = 1
        for column in transform.columns or []:
            for transformation in column.transformations or []:
                params = transformation.params
                if not params:
                    continue
                list_lengths = [
                    len(value) for value in params.values() if isinstance(value, list) and len(value) > 1
                ]
                if list_lengths:
                    expansion = max(expansion, *list_lengths)
        total += expansion
    return total


def split_yaml(config_yaml: str, out_dir_path: str) -> None:
    """Split a YAML config file into separate component configs.

    Loads the master YAML configuration, validates it by building a
    ``ConfigDict`` from it, and hands it to ``split_config_into_components``,
    which writes the component files (an encoding config, the split configs
    and the transform configs) into ``out_dir_path``. The stem of
    ``config_yaml`` is passed along as the base name; the summary log refers
    to the outputs as ``<stem>_encode.yaml``, ``<stem>_*_split.yaml`` and
    ``<stem>_*_transform.yaml``.

    Args:
        config_yaml: Path to the master YAML configuration file.
        out_dir_path: Output directory to save the component config files.
            Created (including parents) if it does not exist.

    Raises:
        FileNotFoundError: If ``config_yaml`` does not point to an existing file.
        ValueError: If the file is not valid UTF-8 YAML, its top level is not
            a mapping, or it does not match the expected config structure.
    """
    config_path = Path(config_yaml)
    # is_file() rather than exists(): a directory path would otherwise pass
    # this guard and fail later inside open() with a platform-dependent error,
    # after the output directory had already been created.
    if not config_path.is_file():
        raise FileNotFoundError(f"Configuration file not found: {config_yaml}")

    # Create output directory if it doesn't exist
    output_dir = Path(out_dir_path)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load the YAML config; decode explicitly as UTF-8 so the result does not
    # depend on the machine's locale.
    try:
        with config_path.open(encoding="utf-8") as conf_file:
            yaml_config: Any = yaml.safe_load(conf_file)
    except (yaml.YAMLError, UnicodeDecodeError) as e:
        raise ValueError(f"Invalid YAML format in {config_yaml}: {e}") from e

    # An empty document loads as None and a bare list/scalar is not a config
    # either; report that directly instead of via a "**" unpacking TypeError.
    if not isinstance(yaml_config, dict):
        raise ValueError(
            f"Invalid config structure in {config_yaml}: "
            f"expected a top-level mapping, got {type(yaml_config).__name__}",
        )

    # Validate config structure. The broad except is deliberate: whatever the
    # model raises on bad input is surfaced as the documented ValueError.
    try:
        config_dict = ConfigDict(**yaml_config)
    except Exception as e:
        raise ValueError(f"Invalid config structure in {config_yaml}: {e}") from e

    logger.info("YAML config loaded and validated successfully.")

    # Extract base name from config file path
    base_name = config_path.stem

    # Split the config into components
    split_config_into_components(config_dict, str(output_dir), base_name)

    # Count generated files for logging
    encoding_files = 1  # single encoding config
    split_files = len(config_dict.split)
    transform_files = _count_transform_configs(config_dict.transforms)
    total_files = encoding_files + split_files + transform_files

    logger.info(f"Successfully generated {total_files} component configs:")
    logger.info(f"  - 1 encoding config ({base_name}_encode.yaml)")
    logger.info(f"  - {split_files} split configs ({base_name}_*_split.yaml)")
    logger.info(f"  - {transform_files} transform configs ({base_name}_*_transform.yaml)")
    logger.info(f"All files saved to: {out_dir_path}")