Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
{
"image": "mcr.microsoft.com/devcontainers/python:3.11",
"features": {
"ghcr.io/devcontainers/features/java:1": {
"version": "8"
},
"ghcr.io/devcontainers/features/git-lfs:1": {}
},
"postCreateCommand": "pip install -r requirements.txt"
}
"postCreateCommand": "sudo apt update && sudo apt install -y pipx && pipx ensurepath && pipx install poetry && unset VIRTUAL_ENV && poetry install --with dev"
}
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,17 @@
#####################################################################
# Avoid committing any data. If you need to make an exception for one file you can do that separately

.claude/
test.py
*.csv
# allow csv files in the backtests folder
!tests/backtests/ground_truth/*.csv

logs/*
!logs/.gitkeep



*.ipynb
*.ipynb_checkpoints
*.xlsx
Expand Down
415 changes: 242 additions & 173 deletions README.md

Large diffs are not rendered by default.

8 changes: 0 additions & 8 deletions config.toml

This file was deleted.

102 changes: 102 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# Configuration for the maternity_to_cml conversion pipeline (consumed by
# create_cml_tables.py via file_paths.get_config).
project_name: "maternity_to_cml"
# NOTE(review): dates below look like DD/MM/YYYY — confirm this matches the
# format expected by cast_date_col_to_timestamp.
# The &name anchors let the same value be re-used later via *name aliases.
publication_date: &publication_date "01/12/2026"
last_ingest_timestamp: &last_ingest_timestamp "15/12/2026"
path_to_source_data: "data_in/mbrrace_test.csv"
output_dir: "data_out"
log_dir: "logs/"
# Columns used to build the dimensions table.
# NOTE(review): the &dimensions anchor is not aliased anywhere in this file —
# confirm it is still needed.
dimensions: &dimensions
  - mbrrace_grouping
  - AgeAtBookingMotherAvg
  - AgeAtBookingMotherGroup
  - ApgarScore5TermGroup7
  - BabyFirstFeedBreastMilkStatus
  - BirthweightTermGroup
  - BirthweightTermGroup2500
  - CCP_Antenatal
  - CCP_Any_Pathways
  - CCP_Birth
  - CCP_Postpartum
  - CO_Concentration_Delivery
  - ComplexSocialFactorsInd
  - DeliveryMethodBabyGroup
  - DeprivationDecileAtBooking
  - EthnicCategoryMotherGroup
  - FolicAcidSupplement
  - GestAgeFormalAntenatalBookingGroup
  - GestationLengthBirth
  - GestationLengthBirthGroup37
  - PCP_Antenatal
  - PCP_Any_Pathways
  - PCP_Birth
  - PCP_Postpartum
  - PlaceTypeActualDeliveryMidwifery
  - PreviousCaesareanSectionsGroup
  - PreviousLiveBirthsGroup
  - SkinToSkinContact1HourTerm
  - SmokingStatusGroupBooking
  - TotalBabies
  - TotalBookings
  - TotalDeliveries
# Dimensions excluded when dimension cohorts are created.
dimension_creation_exclusions:
  - mbrrace_grouping
# Ordered processing pipeline: each entry names a function in
# processing.PROCESSING_FUNC_REGISTRY and supplies its keyword params.
processing_funcs:
  - name: move_attributes_to_new_dimension
    params:
      source_col_name: "Org_Code"
      source_col_fill_value: "england"
      new_col_name: "mbrrace_grouping"
      new_col_fill_value: "no_mbrrace_grouping_filter"
      attributes_to_move:
        - "Group 1. Level 3 NICU & NS"
        - "Group 2. Level 3 NICU"
        - "Group 3. 4,000 or more"
        - "Group 4. 2,000 - 3,999"
        - "Group 5. Under 2,000"
  - name: replace_col_values
    params:
      col_name: "Org_Code"
      value_mappings:
        ALL: "england"
  - name: rename_cols
    params:
      col_name_mappings:
        Org_Code: "location_id"
        Org_Level: "location_type"
        Final_value: "metric_value"
        ReportingPeriodStartDate: "reporting_period_start_datetime"
        ReportingPeriodEndDate: "last_record_timestamp"
  - name: cast_date_col_to_timestamp
    params:
      col_name: reporting_period_start_datetime
  - name: cast_date_col_to_timestamp
    params:
      col_name: last_record_timestamp
  - name: create_uuid_col
    params:
      col_name: "datapoint_id"
      length: 32
  - name: concat_cols
    params:
      new_col_name: "metric_id"
      cols_to_concat: ["Dimension", "Count_Of"]
      prefix: ""
      sep: "_"
  - name: add_lit_col
    params:
      col_name: "publication_date"
      col_value: *publication_date
  - name: cast_date_col_to_timestamp
    params:
      col_name: publication_date
  - name: add_lit_col
    params:
      col_name: "last_ingest_timestamp"
      col_value: *last_ingest_timestamp
  - name: cast_date_col_to_timestamp
    params:
      col_name: last_ingest_timestamp
  - name: add_lit_col
    params:
      col_name: "additional_metric_values"
      col_value: null

87 changes: 87 additions & 0 deletions create_cml_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import logging
import timeit
from datetime import datetime

from pyspark.sql import functions as F
from cml_conversion_helpers.processing import processing
from cml_conversion_helpers.processing import dimension_cohorts
from cml_schemas import spark_schemas

from msds_monthly_to_cml.utils import file_paths
from msds_monthly_to_cml.utils import logging_config
from msds_monthly_to_cml.utils import spark as spark_utils
from msds_monthly_to_cml.data_ingestion import reading_data
from msds_monthly_to_cml.data_exports import write_csv


logger = logging.getLogger(__name__)

def main():
    """Build the CML metric and dimensions tables from the source maternity CSV.

    Loads settings from ``config.yaml``, runs the configured processing
    functions over the source data, derives the dimension-cohort columns,
    splits the result into the metric and dimensions tables, and writes
    each table out as a CSV.
    """
    # load config - here we load our project's parameters from the config file.
    config = file_paths.get_config("config.yaml")

    # configure logging - we can save information to log files which can be
    # useful for debugging with logger.info(). Lazy %-style args are used so
    # messages are only formatted when the log level is enabled.
    logging_config.configure_logging(config['log_dir'])
    logger.info("Configured logging with log folder: %s.", config['log_dir'])
    logger.info("Logging the config settings:\n\n\t%s\n", config)
    logger.info("Starting run at:\t%s", datetime.now().time())

    # create spark session
    spark = spark_utils.create_spark_session(config['project_name'])
    logger.info("Created SparkSession with name: %s.", config['project_name'])

    # Loading data from CSV as spark data frame
    df_maternity = reading_data.load_csv_into_spark_data_frame(
        spark, config['path_to_source_data']
    )
    logger.info("Loaded source data from: %s.", config['path_to_source_data'])

    # loop through the processing functions defined in the config, applying
    # each registered function to the dataframe in order.
    logger.info("running functions defined in config...")
    for processing_func_config in config["processing_funcs"]:
        logger.info("  running %s", processing_func_config['name'])
        processing_func = processing.PROCESSING_FUNC_REGISTRY[
            processing_func_config["name"]
        ]
        df_maternity = processing_func(
            df_maternity, **processing_func_config["params"]
        )
    logger.info("done!")

    # create the columns needed for the dimensions table
    df_maternity = dimension_cohorts.create_dimension_table(
        df_maternity,
        config["dimensions"],
        config["dimension_creation_exclusions"],
    )
    df_maternity = processing.concat_cols(
        df_maternity,
        "metric_dimension_id",
        ["metric_id", "dimension_cohort_id"],
        sep="_",
    )
    logger.info("created the columns needed for the dimensions table.")

    # now df_maternity has all the columns needed for the dimensions and
    # metric tables. the spark_schemas module from the cml_schemas package
    # contains the schemas for each table. we can use the
    # select_from_schema() function to select the columns that belong to
    # each schema, which leaves us with two new dataframes, one for each
    # table.
    dimensions_schema = spark_schemas.create_dimensions_schema(config["dimensions"])
    df_dimensions = spark_schemas.select_from_schema(df_maternity, dimensions_schema)
    df_metric = spark_schemas.select_from_schema(df_maternity, spark_schemas.METRIC_SCHEMA)
    logger.info("created df_metric and df_dimensions")

    # Then we can save these to CSV
    logger.info("writing data to csv...")
    write_csv.save_df_as_named_csv(df_metric, "metric")
    write_csv.save_df_as_named_csv(df_dimensions, "dimensions")
    logger.info("  done!")

    # stop the spark session to release its resources
    logger.info("stopping the SparkSession.")
    spark.stop()


if __name__ == "__main__":
    # Entry point: time the whole run and log the elapsed wall-clock time.
    print("Running create_cml_tables script")
    start_time = timeit.default_timer()
    main()
    total_time = timeit.default_timer() - start_time
    logger.info(
        "Running time of create_cml_tables script: %d minutes and %d seconds.\n",
        int(total_time / 60),
        round(total_time % 60),
    )
85 changes: 0 additions & 85 deletions create_publication.py

This file was deleted.

12 changes: 0 additions & 12 deletions environment.yml

This file was deleted.

File renamed without changes.
Loading