Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
{
"image": "mcr.microsoft.com/devcontainers/python:3.11",
"features": {
"ghcr.io/devcontainers/features/java:1": {
"version": "8"
},
"ghcr.io/devcontainers/features/git-lfs:1": {}
},
"postCreateCommand": "pip install -r requirements.txt"
}
"postCreateCommand": "sudo apt update && sudo apt install -y pipx && pipx ensurepath && pipx install poetry && unset VIRTUAL_ENV && poetry install --with dev"
}
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,17 @@
#####################################################################
# Avoid committing any data. If you need to make an exception for one file you can do that separately

.claude/
test.py
*.csv
# allow csv files in the backtests folder
!tests/backtests/ground_truth/*.csv

logs/*
!logs/.gitkeep



*.ipynb
*.ipynb_checkpoints
*.xlsx
Expand Down
415 changes: 242 additions & 173 deletions README.md

Large diffs are not rendered by default.

8 changes: 0 additions & 8 deletions config.toml

This file was deleted.

102 changes: 102 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# Configuration for the maternity_to_cml conversion pipeline (consumed by
# create_cml_tables.py via file_paths.get_config).
project_name: "maternity_to_cml"
# NOTE(review): dates below look like DD/MM/YYYY — confirm this matches the
# format expected by cast_date_col_to_timestamp.
# The &name anchors let the same value be re-used later via *name aliases.
publication_date: &publication_date "01/12/2026"
last_ingest_timestamp: &last_ingest_timestamp "15/12/2026"
path_to_source_data: "data_in/mbrrace_test.csv"
output_dir: "data_out"
log_dir: "logs/"
# Columns used to build the dimensions table.
# NOTE(review): the &dimensions anchor is not aliased anywhere in this file —
# confirm it is still needed.
dimensions: &dimensions
  - mbrrace_grouping
  - AgeAtBookingMotherAvg
  - AgeAtBookingMotherGroup
  - ApgarScore5TermGroup7
  - BabyFirstFeedBreastMilkStatus
  - BirthweightTermGroup
  - BirthweightTermGroup2500
  - CCP_Antenatal
  - CCP_Any_Pathways
  - CCP_Birth
  - CCP_Postpartum
  - CO_Concentration_Delivery
  - ComplexSocialFactorsInd
  - DeliveryMethodBabyGroup
  - DeprivationDecileAtBooking
  - EthnicCategoryMotherGroup
  - FolicAcidSupplement
  - GestAgeFormalAntenatalBookingGroup
  - GestationLengthBirth
  - GestationLengthBirthGroup37
  - PCP_Antenatal
  - PCP_Any_Pathways
  - PCP_Birth
  - PCP_Postpartum
  - PlaceTypeActualDeliveryMidwifery
  - PreviousCaesareanSectionsGroup
  - PreviousLiveBirthsGroup
  - SkinToSkinContact1HourTerm
  - SmokingStatusGroupBooking
  - TotalBabies
  - TotalBookings
  - TotalDeliveries
# Dimensions excluded when dimension cohorts are created.
dimension_creation_exclusions:
  - mbrrace_grouping
# Ordered processing pipeline: each entry names a function in
# processing.PROCESSING_FUNC_REGISTRY and supplies its keyword params.
processing_funcs:
  - name: move_attributes_to_new_dimension
    params:
      source_col_name: "Org_Code"
      source_col_fill_value: "england"
      new_col_name: "mbrrace_grouping"
      new_col_fill_value: "no_mbrrace_grouping_filter"
      attributes_to_move:
        - "Group 1. Level 3 NICU & NS"
        - "Group 2. Level 3 NICU"
        - "Group 3. 4,000 or more"
        - "Group 4. 2,000 - 3,999"
        - "Group 5. Under 2,000"
  - name: replace_col_values
    params:
      col_name: "Org_Code"
      value_mappings:
        ALL: "england"
  - name: rename_cols
    params:
      col_name_mappings:
        Org_Code: "location_id"
        Org_Level: "location_type"
        Final_value: "metric_value"
        ReportingPeriodStartDate: "reporting_period_start_datetime"
        ReportingPeriodEndDate: "last_record_timestamp"
  - name: cast_date_col_to_timestamp
    params:
      col_name: reporting_period_start_datetime
  - name: cast_date_col_to_timestamp
    params:
      col_name: last_record_timestamp
  - name: create_uuid_col
    params:
      col_name: "datapoint_id"
      length: 32
  - name: concat_cols
    params:
      new_col_name: "metric_id"
      cols_to_concat: ["Dimension", "Count_Of"]
      prefix: ""
      sep: "_"
  - name: add_lit_col
    params:
      col_name: "publication_date"
      col_value: *publication_date
  - name: cast_date_col_to_timestamp
    params:
      col_name: publication_date
  - name: add_lit_col
    params:
      col_name: "last_ingest_timestamp"
      col_value: *last_ingest_timestamp
  - name: cast_date_col_to_timestamp
    params:
      col_name: last_ingest_timestamp
  - name: add_lit_col
    params:
      col_name: "additional_metric_values"
      col_value: null

87 changes: 87 additions & 0 deletions create_cml_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import logging
import timeit
from datetime import datetime

from pyspark.sql import functions as F
from cml_conversion_helpers.processing import processing
from cml_conversion_helpers.processing import dimension_cohorts
from cml_schemas import spark_schemas

from msds_monthly_to_cml.utils import file_paths
from msds_monthly_to_cml.utils import logging_config
from msds_monthly_to_cml.utils import spark as spark_utils
from msds_monthly_to_cml.data_ingestion import reading_data
from msds_monthly_to_cml.data_exports import write_csv


logger = logging.getLogger(__name__)

def main():
    """Build the CML metric and dimensions tables from the source maternity CSV.

    Loads settings from ``config.yaml``, runs the configured processing
    functions over the source data, derives the dimension-cohort columns,
    splits the result into the metric and dimensions tables, and writes
    each table out as a CSV.
    """
    # load config - here we load our project's parameters from the config file.
    config = file_paths.get_config("config.yaml")

    # configure logging - we can save information to log files which can be
    # useful for debugging with logger.info(). Lazy %-style args are used so
    # messages are only formatted when the log level is enabled.
    logging_config.configure_logging(config['log_dir'])
    logger.info("Configured logging with log folder: %s.", config['log_dir'])
    logger.info("Logging the config settings:\n\n\t%s\n", config)
    logger.info("Starting run at:\t%s", datetime.now().time())

    # create spark session
    spark = spark_utils.create_spark_session(config['project_name'])
    logger.info("Created SparkSession with name: %s.", config['project_name'])

    # Loading data from CSV as spark data frame
    df_maternity = reading_data.load_csv_into_spark_data_frame(
        spark, config['path_to_source_data']
    )
    logger.info("Loaded source data from: %s.", config['path_to_source_data'])

    # loop through the processing functions defined in the config, applying
    # each registered function to the dataframe in order.
    logger.info("running functions defined in config...")
    for processing_func_config in config["processing_funcs"]:
        logger.info("  running %s", processing_func_config['name'])
        processing_func = processing.PROCESSING_FUNC_REGISTRY[
            processing_func_config["name"]
        ]
        df_maternity = processing_func(
            df_maternity, **processing_func_config["params"]
        )
    logger.info("done!")

    # create the columns needed for the dimensions table
    df_maternity = dimension_cohorts.create_dimension_table(
        df_maternity,
        config["dimensions"],
        config["dimension_creation_exclusions"],
    )
    df_maternity = processing.concat_cols(
        df_maternity,
        "metric_dimension_id",
        ["metric_id", "dimension_cohort_id"],
        sep="_",
    )
    logger.info("created the columns needed for the dimensions table.")

    # now df_maternity has all the columns needed for the dimensions and
    # metric tables. the spark_schemas module from the cml_schemas package
    # contains the schemas for each table. we can use the
    # select_from_schema() function to select the columns that belong to
    # each schema, which leaves us with two new dataframes, one for each
    # table.
    dimensions_schema = spark_schemas.create_dimensions_schema(config["dimensions"])
    df_dimensions = spark_schemas.select_from_schema(df_maternity, dimensions_schema)
    df_metric = spark_schemas.select_from_schema(df_maternity, spark_schemas.METRIC_SCHEMA)
    logger.info("created df_metric and df_dimensions")

    # Then we can save these to CSV
    logger.info("writing data to csv...")
    write_csv.save_df_as_named_csv(df_metric, "metric")
    write_csv.save_df_as_named_csv(df_dimensions, "dimensions")
    logger.info("  done!")

    # stop the spark session to release its resources
    logger.info("stopping the SparkSession.")
    spark.stop()


if __name__ == "__main__":
    # Entry point: time the whole run and log the elapsed wall-clock time.
    print("Running create_cml_tables script")
    start_time = timeit.default_timer()
    main()
    total_time = timeit.default_timer() - start_time
    logger.info(
        "Running time of create_cml_tables script: %d minutes and %d seconds.\n",
        int(total_time / 60),
        round(total_time % 60),
    )
85 changes: 0 additions & 85 deletions create_publication.py

This file was deleted.

12 changes: 0 additions & 12 deletions environment.yml

This file was deleted.

File renamed without changes.
Loading