diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 02af652..4146e9a 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,8 +1,10 @@ { + "image": "mcr.microsoft.com/devcontainers/python:3.11", "features": { "ghcr.io/devcontainers/features/java:1": { "version": "8" }, + "ghcr.io/devcontainers/features/git-lfs:1": {} }, - "postCreateCommand": "pip install -r requirements.txt" -} + "postCreateCommand": "sudo apt update && sudo apt install -y pipx && pipx ensurepath && pipx install poetry && unset VIRTUAL_ENV && poetry install --with dev" +} \ No newline at end of file diff --git a/.gitignore b/.gitignore index 292d171..8863792 100644 --- a/.gitignore +++ b/.gitignore @@ -6,10 +6,17 @@ ##################################################################### # Avoid committing any data. If you need to make an exception for one file you can do that separately +.claude/ +test.py *.csv # allow csv files in the backtests folder !tests/backtests/ground_truth/*.csv +logs/* +!logs/.gitkeep + + + *.ipynb *.ipynb_checkpoints *.xlsx diff --git a/README.md b/README.md index fef8e6a..1a723ed 100644 --- a/README.md +++ b/README.md @@ -1,237 +1,306 @@ -# RAP Example Pipeline - Python using PySpark +# Maternity Services Statistics to CML Schema Conversion -:exclamation: Warning: this repository may contain references internal to NHS England that cannot be accessed publicly +## Overview -This repo contains a simple example pipeline to act as an example of RAP good-practice with Python. +This pipeline converts the **Maternity Services Monthly Statistics** into the format required by the **NHS England Central Metrics Library (CML)**. -## Processes Flow-chart +The source data is the Official Statistics about NHS-funded maternity services in England, drawn from the Maternity Services Data Set (MSDS). It covers activity at the booking appointment, during pregnancy, during and after birth, and information on pregnancy outcomes. -```mermaid -flowchart TB - %% Input - A_ts(Artificial HES data) - B_ts(Script to pull down to CSV) - B_one(Script to pull down to API) +The pipeline takes the MSDS data in a tidy (long) format, applies a series of configurable transformations, and produces two output tables matching the CML schema: a **metric table** and a **dimensions table**. - %% Processes - C_ts(All of England) - C_one(Regional distribution) - C_two(ICB distribution) +--- - D_ts(Count the number of episodes) - D_one(Number of unique individuals) +## Prerequisites +- Python >= 3.10 +- Java 8 or 11 (required by PySpark — ensure `JAVA_HOME` is set) +- [Poetry](https://python-poetry.org/) for dependency management - %% Output - E_ts(CSV output) - E_one(Graph output) +--- - %% Generating flowchart - subgraph Input - A_ts:::thin_slice==>B_ts - A_ts-->B_one - end +## Getting Started - subgraph Processes - B_ts:::thin_slice==>C_ts - B_ts-->C_one - B_ts-->C_two +### Installation - - C_ts:::thin_slice==>D_ts - C_ts-->D_one +If you load in Codespaces, the relevant packages should install automatically - it may take a few minutes! - +```bash +poetry install +``` - end +### Running the pipeline - subgraph Output - D_ts:::thin_slice==>E_ts:::thin_slice - D_ts-->E_one - end +Create the virtual environment and run the script: -%% Colour formatting -classDef thin_slice fill:#CCFFCC,stroke:#333,stroke-width:4px +```bash +eval $(poetry env activate) +python create_cml_tables.py ``` +### Running the tests + +```bash +pytest +``` -## Contact -**This repository is maintained by the NHS England Data Science Team**. -> _To contact us raise an issue on Github or via [email](mailto:england.rapchampions@nhs.net)._ -> -> See our (and our colleagues') other work here: [NHS England Analytical Services](https://github.com/NHSDigital/data-analytics-services) +--- -## Description +## Configuration -[Reproducible Analytical Pipelines](https://nhsdigital.github.io/rap-community-of-practice/) can seem quite abstract - so this repo is meant to serve as a real example, that anyone can run, to see RAP in action. +All pipeline parameters are defined in [`config.yaml`](config.yaml). You should update this file for each run. -The pipeline uses artificial HES data, which was chosen as it is "like" real data used in our industry, but also freely available. +Key fields: -This example pipeline uses Apache Spark, which will be installed locally in your environment when you go through the "Getting Started" steps below. +| Field | Description | +|-------|-------------| +| `publication_date` | Publication date for the output (used as a literal column value) | +| `last_ingest_timestamp` | Timestamp of the last data ingest | +| `path_to_source_data` | Path to the input CSV file | +| `output_dir` | Directory where output CSVs are written | +| `log_dir` | Directory where log files are written | +| `dimensions` | List of all dimension names present in the source data | +| `dimension_creation_exclusions` | Dimensions to exclude from the output dimensions table (e.g. `mbrrace_grouping`, which is handled as a separate column) | +| `processing_funcs` | Ordered list of transformation functions to apply, each with a `name` and `params` | -The pipeline follows three steps which are common to almost all analytical pipelines: +### YAML anchors -1. Getting the data - in this case we download the artificial HES data as a CSV which is saved into folder called 'data_in' on your machine (see the code in src/data_ingestion) -2. Processing the data - the data is aggregated using Spark's python API, PySpark (the code for this is in src/processing) -3. Saving the processed data - the processed data is saved as a csv in a folder called 'data_out' (see the code in src/data_exports) +`config.yaml` uses YAML anchors (`&`) and aliases (`*`) to avoid repeating values: -## Prerequisites +```yaml +publication_date: &publication_date "01/12/2026" +last_ingest_timestamp: &last_ingest_timestamp "15/12/2026" +``` -This code requires: +These can then be referenced elsewhere in the file with `*publication_date` and `*last_ingest_timestamp`. -- Python (> 3.6), the official Python website has [instructions for downloading and installing python](https://wiki.python.org/moin/BeginnersGuide/Download). -- Java - you will also need to have [Java 8/11](https://www.java.com/en/download/) installed, and your [JAVA_HOME variable correctly set](https://www.baeldung.com/java-home-on-windows-mac-os-x-linux). +--- -## Getting Started +## Input Format + +The pipeline expects a **tidy (long) format** CSV, where each row represents a single metric value for a given dimension/attribute combination: + +| Org_Code | Org_Level | Dimension | Attribute | Final_value | ReportingPeriodStartDate | ReportingPeriodEndDate | +|----------|-----------|-----------|-----------|-------------|--------------------------|------------------------| +| RXX | Trust | EthnicCategoryMotherGroup | EthnicWhite | 82 | 01/04/2026 | 30/06/2026 | +| RXX | Trust | AgeAtBookingMotherGroup | Age25to29 | 54 | 01/04/2026 | 30/06/2026 | +| ALL | England | EthnicCategoryMotherGroup | EthnicWhite | 79 | 01/04/2026 | 30/06/2026 | + +The `Dimension` column identifies which dimension the row belongs to, and the attribute value (e.g. `EthnicWhite`) sits in the `Attribute` column. + +--- + +## Output Format + +Two CSVs are written to `data_out/`: + +### Metric table (`data_out/metric/metric.csv`) + +One row per data point, containing the numeric value and metadata: + +| datapoint_id | metric_id | metric_dimension_id | location_id | location_type | metric_value | reporting_period_start_datetime | last_record_timestamp | publication_date | last_ingest_timestamp | additional_metric_values | +|---|---|---|---|---|---|---|---|---|---|---| + +### Dimensions table (`data_out/dimensions/dimensions.csv`) + +One row per data point, one column per dimension. Each dimension column defaults to `all_` unless the data point belongs to that dimension: + +| datapoint_id | metric_dimension_id | dimension_cohort_id | EthnicCategoryMotherGroup | AgeAtBookingMotherGroup | ... | +|---|---|---|---|---|---| + +The `dimension_cohort_id` is a `|`-separated concatenation of all dimension column values and links the metric and dimensions tables together. + +--- + +## Pipeline Steps + +The transformation logic is defined as an ordered sequence in `config.yaml` under `processing_funcs`. Each entry maps to a registered function in the `cml_conversion_helpers` library via `PROCESSING_FUNC_REGISTRY`. + +The steps applied in this pipeline are: + +1. **`move_attributes_to_new_dimension`** — moves MBRRACE grouping values (e.g. `"Group 1. Level 3 NICU & NS"`) out of `Org_Code` into a new `mbrrace_grouping` column +2. **`replace_col_values`** — replaces `"ALL"` in `Org_Code` with `"england"` +3. **`rename_cols`** — renames source columns to CML schema names (`Org_Code` → `location_id`, etc.) +4. **`cast_date_col_to_timestamp`** — casts date string columns to timestamps +5. **`create_uuid_col`** — generates a unique `datapoint_id` per row +6. **`concat_cols`** — builds `metric_id` by concatenating `Dimension` and `Count_Of` +7. **`add_lit_col`** + **`cast_date_col_to_timestamp`** — adds `publication_date` and `last_ingest_timestamp` as typed columns +8. **`add_lit_col`** — adds `additional_metric_values` as a null column + +You don't have to use this config-driven approach if you don't want to. You can simply add your PySpark code into the create_cml_tables.py file. + +After these steps, `create_dimension_table` builds the per-dimension columns and `dimension_cohort_id`, and a final `concat_cols` call builds `metric_dimension_id`. + +See the [`cml_conversion_helpers` API reference](#api-reference) below for full details on each function. + +--- + +## Project Structure -1. Clone the repository. To learn about what this means, and how to use Git, see the [Git guide](https://nhsdigital.github.io/rap-community-of-practice/training_resources/git/using-git-collaboratively/). +``` +├── create_cml_tables.py <- Entry point — runs the full pipeline +├── config.yaml <- Pipeline parameters and processing steps +│ +├── src/ +│ └── msds_monthly_to_cml/ +│ ├── data_ingestion/ +│ │ ├── get_data.py <- Utilities for fetching source data +│ │ └── reading_data.py <- Loads CSV into a Spark DataFrame +│ ├── data_exports/ +│ │ └── write_csv.py <- Saves Spark DataFrames as named CSVs +│ └── utils/ +│ ├── file_paths.py <- Loads config.yaml +│ ├── logging_config.py <- Configures file and console logging +│ └── spark.py <- Creates and configures a SparkSession +│ +├── tests/ +│ ├── conftest.py <- Shared pytest fixtures (SparkSession) +│ └── unittests/ +│ └── test_spark.py +│ +├── data_in/ <- Place source CSV here (not committed) +├── data_out/ <- Output CSVs written here (not committed) +└── logs/ <- Log files written here (not committed) +``` + +--- + +## API Reference + +The transformation functions used in this pipeline are provided by the [`cml_conversion_helpers`](https://pypi.org/project/cml-conversion-helpers/) package. The key functions are documented below. + +### Processing functions (`cml_conversion_helpers.processing.processing`) + +All functions are available via `PROCESSING_FUNC_REGISTRY` for config-driven use. + +--- +#### `move_attributes_to_new_dimension` + +Moves specified values from one column into a new dimension column. Rows whose `source_col_name` value is in `attributes_to_move` have that value placed into `new_col_name`, and `source_col_name` is replaced with `source_col_fill_value`. All other rows get `new_col_fill_value` in `new_col_name`. + +```python +df = processing.move_attributes_to_new_dimension( + df, + source_col_name="Org_Code", + source_col_fill_value="england", + new_col_name="mbrrace_grouping", + new_col_fill_value="no_mbrrace_grouping_filter", + attributes_to_move=["Group 1. Level 3 NICU & NS", "Group 2. Level 3 NICU"] +) ``` -git clone https://github.com/NHSDigital/RAP_example_pipeline_python + +--- + +#### `rename_cols` + +Renames columns according to a mapping. Unmapped columns are left unchanged. + +```python +df = processing.rename_cols(df, {"Org_Code": "location_id", "Final_value": "metric_value"}) ``` -2. Set up your environment, _either_ using [pip](https://pypi.org/project/pip/) or [conda](https://www.anaconda.com/). For more information on how to use virtual environments and why they are important,. see the [virtual environments guide](https://nhsdigital.github.io/rap-community-of-practice/training_resources/python/virtual-environments/why-use-virtual-environments/). +--- -### Using pip +#### `replace_col_values` -If you're using Windows, enter the following commands into the Command Line or Powershell: +Replaces values in a column using a mapping dictionary. +```python +df = processing.replace_col_values(df, {"ALL": "england"}, "Org_Code") ``` -python -m venv .venv -.\.venv\Scripts\Activate.ps1 -python -m pip install -r requirements.txt + +--- + +#### `concat_cols` + +Concatenates multiple columns into a new column. + +```python +df = processing.concat_cols(df, "metric_id", ["Dimension", "Count_Of"], sep="_") ``` -If you're using Linux or MacOS, enter the following commands into the Terminal: +--- + +#### `create_uuid_col` + +Adds a column containing a truncated UUID string (hyphens removed). +```python +df = processing.create_uuid_col(df, "datapoint_id", length=32) ``` -python -m venv .venv -source venv/bin/activate -python -m pip install -r requirements.txt + +--- + +#### `cast_date_col_to_timestamp` + +Casts a string date column to a timestamp (default format: `dd/MM/yyyy`). + +```python +df = processing.cast_date_col_to_timestamp(df, "reporting_period_start_datetime") ``` -For Visual Studio Code it is necessary that you change your default interpreter to the virtual environment you just created .venv. To do this use the shortcut Ctrl-Shift-P, search for Python: Select interpreter and select .venv from the list. +--- -## Using GitHub codespaces +#### `add_lit_col` -If you are using GitHub Codespaces, the above installation steps will be completed automatically, so you don't need to do anything! +Adds a new column populated with a constant value. Use `None` (Python) or `null` (YAML) for null. -Click the "Code" button above, click the "Codespaces" tab, and then click the "+" button to create a new codespace. The environment may take a minute or two to build when you load it for the first time. +```python +df = processing.add_lit_col(df, "publication_date", "01/12/2026") +df = processing.add_lit_col(df, "additional_metric_values", None) +``` -## Running the pipeline -Before running the pipeline, make sure you are in the same folder as the `create-publication.py` file by entering the following command into the terminal: +--- -`cd RAP_example_pipeline_python` +#### `drop_cols` -To run the pipeline, enter the following command into the terminal: +Drops specified columns from a DataFrame. -`python create_publication.py` +```python +df = processing.drop_cols(df, ["unwanted_col_a", "unwanted_col_b"]) +``` -## Running the tests -There are two sets of tests in this structure (and you can see guidance on them by following the hyperlinks): +--- -* **[Unit tests](https://nhsdigital.github.io/rap-community-of-practice/training_resources/python/unit-testing/)**: these test functions in isolation to ensure they do what you expect them to. -* **[Back tests](https://nhsdigital.github.io/rap-community-of-practice/training_resources/python/backtesting/)**: when you refactor a pipeline or re-create it entirely, it's a good idea to compare the results of the old process (often referred to as the "ground truth") to the results of the new pipeline. This is what the back tests do. Here, the back tests will first check if the output files exist in the data_out folder, and if not, it will run the pipeline and create these files so that it can compare them to the ground truth files (stored in the `tests/backtests/ground_truth/` folder). Note that you don't need to commit your ground truth files to your repo (for example if they are very large or contain sensitive data). +### Dimension functions (`cml_conversion_helpers.processing.dimension_cohorts`) -To run all tests, enter the following terminal command: +--- -`python -m pytest tests/` +#### `create_dimension_table` -If you just want to run the back tests, you can use: +Main entry point for building the dimensions table. Creates one column per dimension (populated with the attribute value for matching rows, `all_` otherwise) and a `dimension_cohort_id` column. -`python -m pytest tests/backtests` +```python +df = dimension_cohorts.create_dimension_table( + df, + dimension_cols=config["dimensions"], + dimensions_to_exclude=config["dimension_creation_exclusions"] +) +``` -And if you just want to run the unit tests, use: +--- -`python -m pytest tests/unittests` +#### `get_dimension_list_from_col` -## Project structure +Extracts the list of distinct values from a dimension column — useful when you want to derive the dimension list from the data rather than hard-coding it in config. -```text -| .gitignore <- Files (& file types) automatically removed from version control for security purposes -| config.toml <- Configuration file with parameters we want to be able to change (e.g. date) -| environment.yml <- Conda equivalent of requirements file -| requirements.txt <- Requirements for reproducing the analysis environment -| pyproject.toml <- Configuration file containing package build information -| LICENCE <- License info for public distribution -| README.md <- Quick start guide / explanation of the project -| -| create_publication.py <- Runs the overall pipeline to produce the publication -| -+---data_in <- Data downloaded from external sources can be saved here. Files in here will not be committed -| | .gitkeep <- This is a placeholder file that enables the otherwise empty directory to be committed -| | -+---data_out <- Any data saved as files will be stored here. Files in here will not be committed -| | .gitkeep <- This is a placeholder file that enables the otherwise empty directory to be committed -| | -+---src <- Scripts with functions for use in 'create_publication.py'. Contains the project's codebase. -| | __init__.py <- Makes the functions folder an importable Python module -| | -| +---data_exports -| | __init__.py <- Makes the folder an importable Python module -| | write_excel.py <- Populates an excel .xlsx template with values from your CSV output if needed -| | write_csv.py <- Creates CSV outputs from the data manipulated in python -| | -| +---data_ingestion <- Scripts with modules containing functions to import and preprocess read data i.e. perform validation/data quality checks, other preprocessing etc. -| | __init__.py <- Makes the folder an importable Python module -| | get_data.py <- Gets data from external sources -| | preprocessing.py <- Perform preprocessing, for example preparing your data for metadata or data quality checks -| | reading_data.py <- Read data from CSVs and other sources into formats that can be manipulated in python -| | validation_checks.py <- Perform validation checks e.g. a field has acceptable values -| | -| +---processing <- Scripts with modules containing functions to process data i.e. clean and derive new fields -| | __init__.py <- Makes the folder an importable Python module -| | aggregate_counts.py <- Functions that create the aggregate counts needed in the outputs -| | -| +---utils <- Scripts relating to configuration and handling data connections e.g. importing data, writing to a database etc. -| | __init__.py <- Makes the folder an importable Python module -| | file_paths.py <- Configures file paths for the package -| | logging_config.py <- Configures logging -| | spark.py <- Functions that set up and configure Spark -| | -+---tests -| | __init__.py <- Makes the functions folder an importable Python module -| | -| +---backtests <- Comparison tests for the old and new pipeline's outputs -| | | __init__.py <- Makes the folder an importable Python module -| | | backtesting_params.py <- parameters for back tests, such as the location of ground truth files -| | | test_compare_outputs.py <- runs the back tests -| | | -| | +---ground_truth <- ground truth outputs from the old process to compare against the new one -| | | -| +---unittests <- Tests for the functional outputs of Python code -| | __init__.py <- Makes the folder an importable Python module -| | test_aggregate_counts.py <- Test functions that process/manipulate the data -| | test_spark <- Test functions related to setting up and configuring spark +```python +dimensions = dimension_cohorts.get_dimension_list_from_col(df, "Dimension") ``` -### `root` +--- + +### Extending the registry -In the highest level of this repository (known as the 'root'), there is one Python file: `create_publication.py`. This top level file should be the main place where users interact with the code, where you store the steps to create your publication. +You can add your own functions to `PROCESSING_FUNC_REGISTRY` using the `@register` decorator: + +```python +from cml_conversion_helpers.processing.processing import register + +@register +def my_custom_transform(df, some_param): + # your logic here + return df +``` -This file currently runs a set of example steps using example data. - -### `src` - -This directory contains the meaty parts of the code. By organising the code into logical sections, we make it easier to understand, maintain and test. Moreover, tucking the complex code out of the way means that users don't need to understand everything about the code all at once. - -### `tests` - -This folder contains the tests for the code base. It's good practice to have unit tests for your functions at the very least, ideally in addition to tests of the pipeline as a whole such as back tests. - ------------ - -## Licence - -This codebase is released under the MIT License. This covers both the codebase and any sample code in the documentation. - -Any HTML or Markdown documentation is [© Crown copyright](https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/) and available under the terms of the [Open Government 3.0 licence](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/). - -## Acknowledgements -- [Connor Quinn](https://github.com/connor1q) -- [Sam Hollings](https://github.com/SamHollings) -- [Maakhe Ndhlela](https://github.com/maakhe) -- [Harriet Sands](https://github.com/harrietrs) -- [Xiyao Zhuang](https://github.com/xiyaozhuang) -- [Helen Richardson](https://github.com/helrich) -- [The RAP team](https://github.com/NHSDigital/rap-community-of-practice)! +--- \ No newline at end of file diff --git a/config.toml b/config.toml deleted file mode 100644 index 21c7a8f..0000000 --- a/config.toml +++ /dev/null @@ -1,8 +0,0 @@ -project_name = "example_pipeline_pyspark_version" - -data_url = "https://files.digital.nhs.uk/assets/Services/Artificial%20data/Artificial%20HES%20final/artificial_hes_ae_202302_v1_sample.zip" -path_to_downloaded_data = "data_in/artificial_hes_ae_202302_v1_sample.zip/artificial_hes_ae_202302_v1_sample/artificial_hes_ae_2122.csv" - -# Here we describe where the output and logs are saved, change as necessary -output_dir = '' -log_dir = '' diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000..bca931f --- /dev/null +++ b/config.yaml @@ -0,0 +1,102 @@ +project_name: "maternity_to_cml" +publication_date: &publication_date "01/12/2026" +last_ingest_timestamp: &last_ingest_timestamp "15/12/2026" +path_to_source_data: "data_in/mbrrace_test.csv" +output_dir: "data_out" +log_dir: "logs/" +dimensions: &dimensions + - mbrrace_grouping + - AgeAtBookingMotherAvg + - AgeAtBookingMotherGroup + - ApgarScore5TermGroup7 + - BabyFirstFeedBreastMilkStatus + - BirthweightTermGroup + - BirthweightTermGroup2500 + - CCP_Antenatal + - CCP_Any_Pathways + - CCP_Birth + - CCP_Postpartum + - CO_Concentration_Delivery + - ComplexSocialFactorsInd + - DeliveryMethodBabyGroup + - DeprivationDecileAtBooking + - EthnicCategoryMotherGroup + - FolicAcidSupplement + - GestAgeFormalAntenatalBookingGroup + - GestationLengthBirth + - GestationLengthBirthGroup37 + - PCP_Antenatal + - PCP_Any_Pathways + - PCP_Birth + - PCP_Postpartum + - PlaceTypeActualDeliveryMidwifery + - PreviousCaesareanSectionsGroup + - PreviousLiveBirthsGroup + - SkinToSkinContact1HourTerm + - SmokingStatusGroupBooking + - TotalBabies + - TotalBookings + - TotalDeliveries +dimension_creation_exclusions: + - mbrrace_grouping +processing_funcs: + - name: move_attributes_to_new_dimension + params: + source_col_name: "Org_Code" + source_col_fill_value: "england" + new_col_name: "mbrrace_grouping" + new_col_fill_value: "no_mbrrace_grouping_filter" + attributes_to_move: + - "Group 1. Level 3 NICU & NS" + - "Group 2. Level 3 NICU" + - "Group 3. 4,000 or more" + - "Group 4. 2,000 - 3,999" + - "Group 5. Under 2,000" + - name: replace_col_values + params: + col_name: "Org_Code" + value_mappings: + ALL: "england" + - name: rename_cols + params: + col_name_mappings: + Org_Code: "location_id" + Org_Level: "location_type" + Final_value: "metric_value" + ReportingPeriodStartDate: "reporting_period_start_datetime" + ReportingPeriodEndDate: "last_record_timestamp" + - name: cast_date_col_to_timestamp + params: + col_name: reporting_period_start_datetime + - name: cast_date_col_to_timestamp + params: + col_name: last_record_timestamp + - name: create_uuid_col + params: + col_name: "datapoint_id" + length: 32 + - name: concat_cols + params: + new_col_name: "metric_id" + cols_to_concat: ["Dimension", "Count_Of"] + prefix: "" + sep: "_" + - name: add_lit_col + params: + col_name: "publication_date" + col_value: *publication_date + - name: cast_date_col_to_timestamp + params: + col_name: publication_date + - name: add_lit_col + params: + col_name: "last_ingest_timestamp" + col_value: *last_ingest_timestamp + - name: cast_date_col_to_timestamp + params: + col_name: last_ingest_timestamp + - name: add_lit_col + params: + col_name: "additional_metric_values" + col_value: null + diff --git a/create_cml_tables.py b/create_cml_tables.py new file mode 100644 index 0000000..325aad2 --- /dev/null +++ b/create_cml_tables.py @@ -0,0 +1,87 @@ +import logging +import timeit +from datetime import datetime + +from pyspark.sql import functions as F +from cml_conversion_helpers.processing import processing +from cml_conversion_helpers.processing import dimension_cohorts +from cml_schemas import spark_schemas + +from msds_monthly_to_cml.utils import file_paths +from msds_monthly_to_cml.utils import logging_config +from msds_monthly_to_cml.utils import spark as spark_utils +from msds_monthly_to_cml.data_ingestion import reading_data +from msds_monthly_to_cml.data_exports import write_csv + + +logger = logging.getLogger(__name__) + +def main(): + + # load config - here we load our project's parameters from the config file. + config = file_paths.get_config("config.yaml") + + + # configure logging - we can save information to log files which can be useful for debugging with logger.info() + logging_config.configure_logging(config['log_dir']) + logger.info(f"Configured logging with log folder: {config['log_dir']}.") + logger.info(f"Logging the config settings:\n\n\t{config}\n") + logger.info(f"Starting run at:\t{datetime.now().time()}") + + + # create spark session + spark = spark_utils.create_spark_session(config['project_name']) + logger.info(f"Created SparkSession with name: {config['project_name']}.") + + + # Loading data from CSV as spark data frame + df_maternity = reading_data.load_csv_into_spark_data_frame(spark, config['path_to_source_data']) + logger.info(f"Loaded source data from: {config['path_to_source_data']}.") + + + # loop through the processing functions defined in the config + logger.info(f"running functions defined in config...") + for processing_func_config in config["processing_funcs"]: + logger.info(f" running {processing_func_config['name']}") + processing_func = processing.PROCESSING_FUNC_REGISTRY[processing_func_config["name"]] + df_maternity = processing_func(df_maternity, **processing_func_config["params"]) + logger.info(f"done!") + + + # create the columns needed for the dimensions table + df_maternity = dimension_cohorts.create_dimension_table( + df_maternity, + config["dimensions"], + config["dimension_creation_exclusions"] + ) + df_maternity = processing.concat_cols(df_maternity, "metric_dimension_id", ["metric_id", "dimension_cohort_id"], sep="_") + logger.info(f"created the columns needed for the dimensions table.") + + + # now df_maternity has all the columns needed for the dimensions and metric tables. the spark_schemas module from the cml_schemas + # package contains the schemas for each table. we can use the select_from_schema() function to select the columns + # that belong to each schema, which leaves us with two new dataframes, one for each table. + dimensions_schema = spark_schemas.create_dimensions_schema(config["dimensions"]) + df_dimensions = spark_schemas.select_from_schema(df_maternity, dimensions_schema) + df_metric = spark_schemas.select_from_schema(df_maternity, spark_schemas.METRIC_SCHEMA) + logger.info(f"created df_metric and df_dimensions") + + + # Then we can save these to CSV + logger.info(f"writing data to csv...") + write_csv.save_df_as_named_csv(df_metric, "metric") + write_csv.save_df_as_named_csv(df_dimensions, "dimensions") + logger.info(f" done!") + + + # stop the spark session + logger.info(f"stopping the SparkSession.") + spark.stop() + + +if __name__ == "__main__": + print(f"Running create_cml_tables script") + start_time = timeit.default_timer() + main() + total_time = timeit.default_timer() - start_time + logger.info(f"Running time of create_cml_tables script: {int(total_time / 60)} minutes and {round(total_time%60)} seconds.\n") diff --git a/create_publication.py b/create_publication.py deleted file mode 100644 index 2eb5ce3..0000000 --- a/create_publication.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -Purpose of the script: to provide an example of good practices when structuring a pipeline using PySpark - -The script loads Python packages but also internal modules (e.g. modules.helpers, helpers script from the modules folder). -It then loads various configuration variables and a logger, for more info on see the RAP Community of Practice website: -https://nhsdigital.github.io/rap-community-of-practice/ - -Most of the code to carry out this configuration and setup is found in the utils folder. - -Then, the main pipeline itself begins, which has three phases: - -data_ingestion: - we download the artificial hes data, load it into a spark dataframe. Any other cleaning or preprocessing should - happen at this stage -processing: - we process the data as needed, in this case we create some aggregate counts based on the hes data -data_exports: - finally we write our outputs to an appropriate file type (CSV) - -Note that in the src folder, each of these phases has its own folder, to neatly organise the code used for each one. - -""" - -# this part imports our Python packages, pyspark functions, and our project's own modules -import logging -import timeit -from datetime import datetime - -from pyspark.sql import functions as F - -from src.utils import file_paths -from src.utils import logging_config -from src.utils import spark as spark_utils -from src.data_ingestion import get_data -from src.data_ingestion import reading_data -from src.processing import aggregate_counts -from src.data_exports import write_csv - -logger = logging.getLogger(__name__) - -def main(): - - # load config, here we load our project's parameters from the config.toml file - config = file_paths.get_config() - - # configure logging - logging_config.configure_logging(config['log_dir']) - logger.info(f"Configured logging with log folder: {config['log_dir']}.") - logger.info(f"Logging the config settings:\n\n\t{config}\n") - logger.info(f"Starting run at:\t{datetime.now().time()}") - - # get artificial HES data as CSV - get_data.download_zip_from_url(config['data_url'], overwrite=True) - logger.info(f"Downloaded artificial hes as zip.") - - # create spark session - spark = spark_utils.create_spark_session(config['project_name']) - logger.info(f"created spark session with app name: {config['project_name']}") - - # Loading data from CSV as spark data frame - df_hes_data = reading_data.load_csv_into_spark_data_frame(spark, config['path_to_downloaded_data']) - - # Creating dictionary to hold outputs - outputs = {} - - # Count number of episodes in England - place this in the outputs dictionary - outputs["df_hes_england_count"] = aggregate_counts.get_distinct_count(df_hes_data, 'epikey', 'number_of_episodes') - - # Rename and save spark dataframes as CSVs: - for output_name, output in outputs.items(): - write_csv.save_spark_dataframe_as_csv(output, output_name) - logger.info(f"saved output df {output_name} as csv") - write_csv.rename_csv_output(output_name) - logger.info(f"renamed {output_name} file") - - # stop the spark session - spark.stop() - - -if __name__ == "__main__": - print(f"Running create_publication script") - start_time = timeit.default_timer() - main() - total_time = timeit.default_timer() - start_time - logger.info(f"Running time of create_publication script: {int(total_time / 60)} minutes and {round(total_time%60)} seconds.\n") diff --git a/environment.yml b/environment.yml deleted file mode 100644 index 3e815a2..0000000 --- a/environment.yml +++ /dev/null @@ -1,12 +0,0 @@ -# The libraries used by your code should be listed here -name: rap_template # your project name (no spaces!) -channels: - - default - - conda-forge -dependencies: - - python=3.10.5 - - pip - - pandas=1.4.4 - - pyodbc=4.0.35 - - sqlalchemy=1.4.46 - - toml=0.10.2 diff --git a/src/data_exports/__init__.py b/logs/.gitkeep similarity index 100% rename from src/data_exports/__init__.py rename to logs/.gitkeep diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000..0ed87f0 --- /dev/null +++ b/poetry.lock @@ -0,0 +1,365 @@ +# This file is automatically @generated by Poetry 2.3.2 and should not be changed by hand. + +[[package]] +name = "cml-conversion-helpers" +version = "0.2.0" +description = "Helper functions for converting data to the NHS CML schema format." +optional = false +python-versions = "<4,>=3.10" +groups = ["main"] +files = [ + {file = "cml_conversion_helpers-0.2.0-py3-none-any.whl", hash = "sha256:d7fbc2401e7cb8c226fd55d7b225535f6f917aa3fa9e113c778fbf3002f25677"}, + {file = "cml_conversion_helpers-0.2.0.tar.gz", hash = "sha256:9629c3e5c6507afed7d246579b5c857d1cc25744e613c98d43c84e75f768bddd"}, +] + +[package.dependencies] +pyspark = ">=3.5,<4.0" + +[[package]] +name = "cml-schemas" +version = "0.1.2" +description = "CML Spark schemas" +optional = false +python-versions = "<4,>=3.10" +groups = ["main"] +files = [ + {file = "cml_schemas-0.1.2-py3-none-any.whl", hash = "sha256:8bd10233f3cb2e5a29b12e96caf0d1f5c7794dca56f7386e0767ab6f3132b44a"}, + {file = "cml_schemas-0.1.2.tar.gz", hash = "sha256:3ff1eb107e51291c3c2a4105aa2043f025e17cc2c678836f7912b8464f4118a8"}, +] + +[package.dependencies] +pyspark = ">=3.5" + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["dev"] +markers = "sys_platform == \"win32\"" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "exceptiongroup" +version = "1.3.1" +description = "Backport of PEP 654 (exception groups)" +optional = false +python-versions = ">=3.7" +groups = ["dev"] +markers = "python_version == \"3.10\"" +files = [ + {file = "exceptiongroup-1.3.1-py3-none-any.whl", hash = "sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598"}, + {file = "exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219"}, +] + +[package.dependencies] +typing-extensions = {version = ">=4.6.0", markers = "python_version < \"3.13\""} + +[package.extras] +test = ["pytest (>=6)"] + +[[package]] +name = "iniconfig" +version = "2.3.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.10" +groups = ["dev"] +files = [ + {file = "iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12"}, + {file = "iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730"}, +] + +[[package]] +name = "packaging" +version = "26.0" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529"}, + {file = "packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4"}, +] + +[[package]] +name = "pluggy" +version = "1.6.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746"}, + {file = "pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["coverage", "pytest", "pytest-benchmark"] + +[[package]] +name = "py4j" +version = "0.10.9.7" +description = "Enables Python programs to dynamically access arbitrary Java objects" +optional = false +python-versions = "*" +groups = ["main"] +files = [ + {file = "py4j-0.10.9.7-py2.py3-none-any.whl", hash = "sha256:85defdfd2b2376eb3abf5ca6474b51ab7e0de341c75a02f46dc9b5976f5a5c1b"}, + {file = "py4j-0.10.9.7.tar.gz", hash = "sha256:0b6e5315bb3ada5cf62ac651d107bb2ebc02def3dee9d9548e3baac644ea8dbb"}, +] + +[[package]] +name = "pygments" +version = "2.19.2" +description = "Pygments is a syntax highlighting package written in Python." +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b"}, + {file = "pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887"}, +] + +[package.extras] +windows-terminal = ["colorama (>=0.4.6)"] + +[[package]] +name = "pyspark" +version = "3.5.8" +description = "Apache Spark Python API" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "pyspark-3.5.8.tar.gz", hash = "sha256:54cca0767b21b40e3953ad1d30f8601c53abf9cbda763653289cdcfcac52313c"}, +] + +[package.dependencies] +py4j = ">=0.10.9.7,<0.10.9.10" + +[package.extras] +connect = ["googleapis-common-protos (>=1.56.4)", "grpcio (>=1.56.0)", "grpcio-status (>=1.56.0)", "numpy (>=1.15,<2)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"] +ml = ["numpy (>=1.15,<2)"] +mllib = ["numpy (>=1.15,<2)"] +pandas-on-spark = ["numpy (>=1.15,<2)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"] +sql = ["numpy (>=1.15,<2)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"] + +[[package]] +name = "pytest" +version = "9.0.2" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.10" +groups = ["dev"] +files = [ + {file = "pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b"}, + {file = "pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11"}, +] + +[package.dependencies] +colorama = {version = ">=0.4", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1", markers = "python_version < \"3.11\""} +iniconfig = ">=1.0.1" +packaging = ">=22" +pluggy = ">=1.5,<2" +pygments = ">=2.7.2" +tomli = {version = ">=1", markers = "python_version < \"3.11\""} + +[package.extras] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "requests", "setuptools", "xmlschema"] + +[[package]] +name = "pytest-html" +version = "3.1.1" +description = "pytest plugin for generating HTML reports" +optional = false +python-versions = ">=3.6" +groups = ["dev"] +files = [ + {file = "pytest-html-3.1.1.tar.gz", hash = "sha256:3ee1cf319c913d19fe53aeb0bc400e7b0bc2dbeb477553733db1dad12eb75ee3"}, + {file = "pytest_html-3.1.1-py3-none-any.whl", hash = "sha256:b7f82f123936a3f4d2950bc993c2c1ca09ce262c9ae12f9ac763a2401380b455"}, +] + +[package.dependencies] +pytest = ">=5.0,<6.0.0 || >6.0.0" +pytest-metadata = "*" + +[[package]] +name = "pytest-metadata" +version = "1.11.0" +description = "pytest plugin for test session metadata" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +groups = ["dev"] +files = [ + {file = "pytest-metadata-1.11.0.tar.gz", hash = "sha256:71b506d49d34e539cc3cfdb7ce2c5f072bea5c953320002c95968e0238f8ecf1"}, + {file = "pytest_metadata-1.11.0-py2.py3-none-any.whl", hash = "sha256:576055b8336dd4a9006dd2a47615f76f2f8c30ab12b1b1c039d99e834583523f"}, +] + +[package.dependencies] +pytest = ">=2.9.0" + +[[package]] +name = "pyyaml" +version = "6.0.3" +description = "YAML parser and emitter for Python" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "PyYAML-6.0.3-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:c2514fceb77bc5e7a2f7adfaa1feb2fb311607c9cb518dbc378688ec73d8292f"}, + {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c57bb8c96f6d1808c030b1687b9b5fb476abaa47f0db9c0101f5e9f394e97f4"}, + {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:efd7b85f94a6f21e4932043973a7ba2613b059c4a000551892ac9f1d11f5baf3"}, + {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:22ba7cfcad58ef3ecddc7ed1db3409af68d023b7f940da23c6c2a1890976eda6"}, + {file = "PyYAML-6.0.3-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:6344df0d5755a2c9a276d4473ae6b90647e216ab4757f8426893b5dd2ac3f369"}, + {file = "PyYAML-6.0.3-cp38-cp38-win32.whl", hash = "sha256:3ff07ec89bae51176c0549bc4c63aa6202991da2d9a6129d7aef7f1407d3f295"}, + {file = "PyYAML-6.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:5cf4e27da7e3fbed4d6c3d8e797387aaad68102272f8f9752883bc32d61cb87b"}, + {file = "pyyaml-6.0.3-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:214ed4befebe12df36bcc8bc2b64b396ca31be9304b8f59e25c11cf94a4c033b"}, + {file = "pyyaml-6.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02ea2dfa234451bbb8772601d7b8e426c2bfa197136796224e50e35a78777956"}, + {file = "pyyaml-6.0.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b30236e45cf30d2b8e7b3e85881719e98507abed1011bf463a8fa23e9c3e98a8"}, + {file = "pyyaml-6.0.3-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:66291b10affd76d76f54fad28e22e51719ef9ba22b29e1d7d03d6777a9174198"}, + {file = "pyyaml-6.0.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9c7708761fccb9397fe64bbc0395abcae8c4bf7b0eac081e12b809bf47700d0b"}, + {file = "pyyaml-6.0.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:418cf3f2111bc80e0933b2cd8cd04f286338bb88bdc7bc8e6dd775ebde60b5e0"}, + {file = "pyyaml-6.0.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:5e0b74767e5f8c593e8c9b5912019159ed0533c70051e9cce3e8b6aa699fcd69"}, + {file = "pyyaml-6.0.3-cp310-cp310-win32.whl", hash = "sha256:28c8d926f98f432f88adc23edf2e6d4921ac26fb084b028c733d01868d19007e"}, + {file = "pyyaml-6.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:bdb2c67c6c1390b63c6ff89f210c8fd09d9a1217a465701eac7316313c915e4c"}, + {file = "pyyaml-6.0.3-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:44edc647873928551a01e7a563d7452ccdebee747728c1080d881d68af7b997e"}, + {file = "pyyaml-6.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:652cb6edd41e718550aad172851962662ff2681490a8a711af6a4d288dd96824"}, + {file = "pyyaml-6.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:10892704fc220243f5305762e276552a0395f7beb4dbf9b14ec8fd43b57f126c"}, + {file = "pyyaml-6.0.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:850774a7879607d3a6f50d36d04f00ee69e7fc816450e5f7e58d7f17f1ae5c00"}, + {file = "pyyaml-6.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b8bb0864c5a28024fac8a632c443c87c5aa6f215c0b126c449ae1a150412f31d"}, + {file = "pyyaml-6.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37d57ad971609cf3c53ba6a7e365e40660e3be0e5175fa9f2365a379d6095a"}, + {file = "pyyaml-6.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:37503bfbfc9d2c40b344d06b2199cf0e96e97957ab1c1b546fd4f87e53e5d3e4"}, + {file = "pyyaml-6.0.3-cp311-cp311-win32.whl", hash = "sha256:8098f252adfa6c80ab48096053f512f2321f0b998f98150cea9bd23d83e1467b"}, + {file = "pyyaml-6.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:9f3bfb4965eb874431221a3ff3fdcddc7e74e3b07799e0e84ca4a0f867d449bf"}, + {file = "pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196"}, + {file = "pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0"}, + {file = "pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28"}, + {file = "pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c"}, + {file = "pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc"}, + {file = "pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e"}, + {file = "pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea"}, + {file = "pyyaml-6.0.3-cp312-cp312-win32.whl", hash = "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5"}, + {file = "pyyaml-6.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b"}, + {file = "pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd"}, + {file = "pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8"}, + {file = "pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1"}, + {file = "pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c"}, + {file = "pyyaml-6.0.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5"}, + {file = "pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6"}, + {file = "pyyaml-6.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6"}, + {file = "pyyaml-6.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be"}, + {file = "pyyaml-6.0.3-cp313-cp313-win32.whl", hash = "sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26"}, + {file = "pyyaml-6.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c"}, + {file = "pyyaml-6.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb"}, + {file = "pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac"}, + {file = "pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310"}, + {file = "pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7"}, + {file = "pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788"}, + {file = "pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5"}, + {file = "pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764"}, + {file = "pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35"}, + {file = "pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac"}, + {file = "pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3"}, + {file = "pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3"}, + {file = "pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba"}, + {file = "pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c"}, + {file = "pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702"}, + {file = "pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c"}, + {file = "pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065"}, + {file = "pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65"}, + {file = "pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9"}, + {file = "pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b"}, + {file = "pyyaml-6.0.3-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:b865addae83924361678b652338317d1bd7e79b1f4596f96b96c77a5a34b34da"}, + {file = "pyyaml-6.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c3355370a2c156cffb25e876646f149d5d68f5e0a3ce86a5084dd0b64a994917"}, + {file = "pyyaml-6.0.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3c5677e12444c15717b902a5798264fa7909e41153cdf9ef7ad571b704a63dd9"}, + {file = "pyyaml-6.0.3-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5ed875a24292240029e4483f9d4a4b8a1ae08843b9c54f43fcc11e404532a8a5"}, + {file = "pyyaml-6.0.3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0150219816b6a1fa26fb4699fb7daa9caf09eb1999f3b70fb6e786805e80375a"}, + {file = "pyyaml-6.0.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:fa160448684b4e94d80416c0fa4aac48967a969efe22931448d853ada8baf926"}, + {file = "pyyaml-6.0.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:27c0abcb4a5dac13684a37f76e701e054692a9b2d3064b70f5e4eb54810553d7"}, + {file = "pyyaml-6.0.3-cp39-cp39-win32.whl", hash = "sha256:1ebe39cb5fc479422b83de611d14e2c0d3bb2a18bbcb01f229ab3cfbd8fee7a0"}, + {file = "pyyaml-6.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:2e71d11abed7344e42a8849600193d15b6def118602c4c176f748e4583246007"}, + {file = "pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f"}, +] + +[[package]] +name = "tomli" +version = "2.4.0" +description = "A lil' TOML parser" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version == \"3.10\"" +files = [ + {file = "tomli-2.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b5ef256a3fd497d4973c11bf142e9ed78b150d36f5773f1ca6088c230ffc5867"}, + {file = "tomli-2.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5572e41282d5268eb09a697c89a7bee84fae66511f87533a6f88bd2f7b652da9"}, + {file = "tomli-2.4.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:551e321c6ba03b55676970b47cb1b73f14a0a4dce6a3e1a9458fd6d921d72e95"}, + {file = "tomli-2.4.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5e3f639a7a8f10069d0e15408c0b96a2a828cfdec6fca05296ebcdcc28ca7c76"}, + {file = "tomli-2.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1b168f2731796b045128c45982d3a4874057626da0e2ef1fdd722848b741361d"}, + {file = "tomli-2.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:133e93646ec4300d651839d382d63edff11d8978be23da4cc106f5a18b7d0576"}, + {file = "tomli-2.4.0-cp311-cp311-win32.whl", hash = "sha256:b6c78bdf37764092d369722d9946cb65b8767bfa4110f902a1b2542d8d173c8a"}, + {file = "tomli-2.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:d3d1654e11d724760cdb37a3d7691f0be9db5fbdaef59c9f532aabf87006dbaa"}, + {file = "tomli-2.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:cae9c19ed12d4e8f3ebf46d1a75090e4c0dc16271c5bce1c833ac168f08fb614"}, + {file = "tomli-2.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:920b1de295e72887bafa3ad9f7a792f811847d57ea6b1215154030cf131f16b1"}, + {file = "tomli-2.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7d6d9a4aee98fac3eab4952ad1d73aee87359452d1c086b5ceb43ed02ddb16b8"}, + {file = "tomli-2.4.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:36b9d05b51e65b254ea6c2585b59d2c4cb91c8a3d91d0ed0f17591a29aaea54a"}, + {file = "tomli-2.4.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1c8a885b370751837c029ef9bc014f27d80840e48bac415f3412e6593bbc18c1"}, + {file = "tomli-2.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8768715ffc41f0008abe25d808c20c3d990f42b6e2e58305d5da280ae7d1fa3b"}, + {file = "tomli-2.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7b438885858efd5be02a9a133caf5812b8776ee0c969fea02c45e8e3f296ba51"}, + {file = "tomli-2.4.0-cp312-cp312-win32.whl", hash = "sha256:0408e3de5ec77cc7f81960c362543cbbd91ef883e3138e81b729fc3eea5b9729"}, + {file = "tomli-2.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:685306e2cc7da35be4ee914fd34ab801a6acacb061b6a7abca922aaf9ad368da"}, + {file = "tomli-2.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:5aa48d7c2356055feef06a43611fc401a07337d5b006be13a30f6c58f869e3c3"}, + {file = "tomli-2.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:84d081fbc252d1b6a982e1870660e7330fb8f90f676f6e78b052ad4e64714bf0"}, + {file = "tomli-2.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9a08144fa4cba33db5255f9b74f0b89888622109bd2776148f2597447f92a94e"}, + {file = "tomli-2.4.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c73add4bb52a206fd0c0723432db123c0c75c280cbd67174dd9d2db228ebb1b4"}, + {file = "tomli-2.4.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1fb2945cbe303b1419e2706e711b7113da57b7db31ee378d08712d678a34e51e"}, + {file = "tomli-2.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bbb1b10aa643d973366dc2cb1ad94f99c1726a02343d43cbc011edbfac579e7c"}, + {file = "tomli-2.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4cbcb367d44a1f0c2be408758b43e1ffb5308abe0ea222897d6bfc8e8281ef2f"}, + {file = "tomli-2.4.0-cp313-cp313-win32.whl", hash = "sha256:7d49c66a7d5e56ac959cb6fc583aff0651094ec071ba9ad43df785abc2320d86"}, + {file = "tomli-2.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:3cf226acb51d8f1c394c1b310e0e0e61fecdd7adcb78d01e294ac297dd2e7f87"}, + {file = "tomli-2.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:d20b797a5c1ad80c516e41bc1fb0443ddb5006e9aaa7bda2d71978346aeb9132"}, + {file = "tomli-2.4.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:26ab906a1eb794cd4e103691daa23d95c6919cc2fa9160000ac02370cc9dd3f6"}, + {file = "tomli-2.4.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:20cedb4ee43278bc4f2fee6cb50daec836959aadaf948db5172e776dd3d993fc"}, + {file = "tomli-2.4.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:39b0b5d1b6dd03684b3fb276407ebed7090bbec989fa55838c98560c01113b66"}, + {file = "tomli-2.4.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a26d7ff68dfdb9f87a016ecfd1e1c2bacbe3108f4e0f8bcd2228ef9a766c787d"}, + {file = "tomli-2.4.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:20ffd184fb1df76a66e34bd1b36b4a4641bd2b82954befa32fe8163e79f1a702"}, + {file = "tomli-2.4.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:75c2f8bbddf170e8effc98f5e9084a8751f8174ea6ccf4fca5398436e0320bc8"}, + {file = "tomli-2.4.0-cp314-cp314-win32.whl", hash = "sha256:31d556d079d72db7c584c0627ff3a24c5d3fb4f730221d3444f3efb1b2514776"}, + {file = "tomli-2.4.0-cp314-cp314-win_amd64.whl", hash = "sha256:43e685b9b2341681907759cf3a04e14d7104b3580f808cfde1dfdb60ada85475"}, + {file = "tomli-2.4.0-cp314-cp314-win_arm64.whl", hash = "sha256:3d895d56bd3f82ddd6faaff993c275efc2ff38e52322ea264122d72729dca2b2"}, + {file = "tomli-2.4.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:5b5807f3999fb66776dbce568cc9a828544244a8eb84b84b9bafc080c99597b9"}, + {file = "tomli-2.4.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c084ad935abe686bd9c898e62a02a19abfc9760b5a79bc29644463eaf2840cb0"}, + {file = "tomli-2.4.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0f2e3955efea4d1cfbcb87bc321e00dc08d2bcb737fd1d5e398af111d86db5df"}, + {file = "tomli-2.4.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0e0fe8a0b8312acf3a88077a0802565cb09ee34107813bba1c7cd591fa6cfc8d"}, + {file = "tomli-2.4.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:413540dce94673591859c4c6f794dfeaa845e98bf35d72ed59636f869ef9f86f"}, + {file = "tomli-2.4.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0dc56fef0e2c1c470aeac5b6ca8cc7b640bb93e92d9803ddaf9ea03e198f5b0b"}, + {file = "tomli-2.4.0-cp314-cp314t-win32.whl", hash = "sha256:d878f2a6707cc9d53a1be1414bbb419e629c3d6e67f69230217bb663e76b5087"}, + {file = "tomli-2.4.0-cp314-cp314t-win_amd64.whl", hash = "sha256:2add28aacc7425117ff6364fe9e06a183bb0251b03f986df0e78e974047571fd"}, + {file = "tomli-2.4.0-cp314-cp314t-win_arm64.whl", hash = "sha256:2b1e3b80e1d5e52e40e9b924ec43d81570f0e7d09d11081b797bc4692765a3d4"}, + {file = "tomli-2.4.0-py3-none-any.whl", hash = "sha256:1f776e7d669ebceb01dee46484485f43a4048746235e683bcdffacdf1fb4785a"}, + {file = "tomli-2.4.0.tar.gz", hash = "sha256:aa89c3f6c277dd275d8e243ad24f3b5e701491a860d5121f2cdd399fbb31fc9c"}, +] + +[[package]] +name = "typing-extensions" +version = "4.15.0" +description = "Backported and Experimental Type Hints for Python 3.9+" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +markers = "python_version == \"3.10\"" +files = [ + {file = "typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548"}, + {file = "typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466"}, +] + +[metadata] +lock-version = "2.1" +python-versions = ">=3.10, <4" +content-hash = "64b3f5195068a2fceec7cdb1f6311732188b9e95f77c1d640f331819c2c54b15" diff --git a/pyproject.toml b/pyproject.toml index 959f9bb..e1d59ae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,15 +1,29 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[project] -name = "rap-package-template" -version = "1.0.0" -authors = [ - { name="Data Science Skilled team", email="datascience@nhs.net" } -] +[tool.poetry] +name = "maternity-services-statistics-to-cml" +version = "0.1.0" +description = "" +authors = ["Data Science Skilled team "] readme = "README.md" -requires-python = ">=3.10" +packages = [{include = "msds_monthly_to_cml", from = "src"}] + +[tool.poetry.dependencies] +python = ">=3.10, <4" +pyspark = ">=3.5.0" +pyyaml = ">=6.0.3" +cml-conversion-helpers = "0.2.0" +cml-schemas = "0.1.2" + +[tool.poetry.group.dev.dependencies] +pytest = ">=8.0" +pytest-html = ">=3.1.1" + -[project.urls] -"Homepage" = "https://nhsdigital.github.io/rap-community-of-practice/" +[[tool.poetry.source]] +name = "test-pypi" +url = "https://test.pypi.org/simple/" +priority = "explicit" + + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index a67ad87..0000000 --- a/requirements.txt +++ /dev/null @@ -1,31 +0,0 @@ -# The libraries used by your code should be listed here -# See https://nhsd-git.digital.nhs.uk/data-services/analytics-service/iuod/rap-community-of-practice/-/blob/master/python/project-structure-and-packaging.md - -# PySpark -pyspark==3.2.1 - -# Requests - used to collect data -requests - -# Python version = 3.10.* - -# Data manipulation -numpy==1.21.5 -pandas==1.3.5 - -# SQL connections -pyodbc==4.0.35 -sqlalchemy==1.4.46 - -# Excel output -#openpyxl==3.0.9 - -# Testing -pytest==6.2.5 -pytest-html==3.1.1 - -# Dependencies of the above packages -#ipykernel==6.9.0 -#nbformat==5.1.3 -toml==0.10.2 -#pathlib2==2.3.6 diff --git a/src/data_ingestion/preprocessing.py b/src/data_ingestion/preprocessing.py deleted file mode 100644 index 547fcb3..0000000 --- a/src/data_ingestion/preprocessing.py +++ /dev/null @@ -1,3 +0,0 @@ -""" -Script which handles the pre-processing part of the pipeline. -""" diff --git a/src/data_ingestion/validation_checks.py b/src/data_ingestion/validation_checks.py deleted file mode 100644 index 31e17c3..0000000 --- a/src/data_ingestion/validation_checks.py +++ /dev/null @@ -1,3 +0,0 @@ -""" -Script that performs basic validations checks on your imported data. -""" diff --git a/src/data_ingestion/__init__.py b/src/msds_monthly_to_cml/data_exports/__init__.py similarity index 100% rename from src/data_ingestion/__init__.py rename to src/msds_monthly_to_cml/data_exports/__init__.py diff --git a/src/data_exports/write_csv.py b/src/msds_monthly_to_cml/data_exports/write_csv.py similarity index 61% rename from src/data_exports/write_csv.py rename to src/msds_monthly_to_cml/data_exports/write_csv.py index d20258e..afb7cd4 100644 --- a/src/data_exports/write_csv.py +++ b/src/msds_monthly_to_cml/data_exports/write_csv.py @@ -1,7 +1,11 @@ +import logging import os import glob from pathlib import Path from pyspark import sql as pyspark +from pyspark.sql.types import StringType + +logger = logging.getLogger(__name__) def save_spark_dataframe_as_csv( df_input : pyspark.DataFrame, @@ -18,6 +22,10 @@ def save_spark_dataframe_as_csv( The name for the folder in which the csv file will be saved """ + for col_name, col_type in df_input.dtypes: + if col_type == 'void': + df_input = df_input.withColumn(col_name, df_input[col_name].cast(StringType())) + (df_input .coalesce(1) .write @@ -45,3 +53,23 @@ def rename_csv_output( files = glob.glob(path) print(files) os.rename(files[0], str(Path(f'data_out/{output_name}/{output_name}.csv')) ) + + +def save_df_as_named_csv( + df: pyspark.DataFrame, + output_name: str +) -> None: + """ + Saves a Spark DataFrame as a CSV and renames it to a consistent filename. + + Parameters + ---------- + df : pyspark.DataFrame + The Spark DataFrame to save. + output_name : str + Name used for the output folder and resulting CSV file. + """ + save_spark_dataframe_as_csv(df, output_name) + logger.info(f"saved output df {output_name} as csv") + rename_csv_output(output_name) + logger.info(f"renamed {output_name} file") diff --git a/src/processing/__init__.py b/src/msds_monthly_to_cml/data_ingestion/__init__.py similarity index 100% rename from src/processing/__init__.py rename to src/msds_monthly_to_cml/data_ingestion/__init__.py diff --git a/src/data_ingestion/get_data.py b/src/msds_monthly_to_cml/data_ingestion/get_data.py similarity index 100% rename from src/data_ingestion/get_data.py rename to src/msds_monthly_to_cml/data_ingestion/get_data.py diff --git a/src/data_ingestion/reading_data.py b/src/msds_monthly_to_cml/data_ingestion/reading_data.py similarity index 100% rename from src/data_ingestion/reading_data.py rename to src/msds_monthly_to_cml/data_ingestion/reading_data.py diff --git a/src/utils/__init__.py b/src/msds_monthly_to_cml/processing/__init__.py similarity index 100% rename from src/utils/__init__.py rename to src/msds_monthly_to_cml/processing/__init__.py diff --git a/tests/backtests/__init__.py b/src/msds_monthly_to_cml/utils/__init__.py similarity index 100% rename from tests/backtests/__init__.py rename to src/msds_monthly_to_cml/utils/__init__.py diff --git a/src/utils/file_paths.py b/src/msds_monthly_to_cml/utils/file_paths.py similarity index 56% rename from src/utils/file_paths.py rename to src/msds_monthly_to_cml/utils/file_paths.py index a12fe7e..48d1ee3 100644 --- a/src/utils/file_paths.py +++ b/src/msds_monthly_to_cml/utils/file_paths.py @@ -2,26 +2,26 @@ Purpose of the script: loads config """ import logging -import toml +import yaml import pathlib logger = logging.getLogger(__name__) def get_config( - toml_path : str="config.toml" + yaml_path : str="config.yaml" ) -> dict: - """Gets the config toml from the root directory and returns it as a dict. Can be called from any file in the project + """Gets the config yaml from the root directory and returns it as a dict. Can be called from any file in the project Parameters ---------- - toml_path : str - Path, filename, and extension of the toml config file. - Defaults to config.toml + yaml_path : str + Path, filename, and extension of the yaml config file. + Defaults to config.yaml Returns ------- - Dict : - A dictionary containing details of the database, paths, etc. Should contain all the things that will + Dict : + A dictionary containing details of the database, paths, etc. Should contain all the things that will change from one run to the next Example @@ -29,6 +29,7 @@ def get_config( from shmi_improvement.utilities.helpers import get_config config = get_config() """ - return toml.load(pathlib.Path(toml_path)) + with pathlib.Path(yaml_path).open('r') as file: + return yaml.safe_load(file) \ No newline at end of file diff --git a/src/utils/logging_config.py b/src/msds_monthly_to_cml/utils/logging_config.py similarity index 89% rename from src/utils/logging_config.py rename to src/msds_monthly_to_cml/utils/logging_config.py index 411e31d..63e4662 100644 --- a/src/utils/logging_config.py +++ b/src/msds_monthly_to_cml/utils/logging_config.py @@ -25,7 +25,7 @@ def configure_logging( format='%(asctime)s - %(levelname)s -- %(filename)s:\ %(funcName)5s():%(lineno)s -- %(message)s', handlers=[ - logging.FileHandler(str(Path(f".{log_folder}/{time.strftime('%Y-%m-%d_%H-%M-%S')}.log"))), + logging.FileHandler(str(Path(f"{log_folder}/{time.strftime('%Y-%m-%d_%H-%M-%S')}.log"))), logging.StreamHandler(sys.stdout) # Add second handler to print log message to screen ] ) diff --git a/src/utils/spark.py b/src/msds_monthly_to_cml/utils/spark.py similarity index 100% rename from src/utils/spark.py rename to src/msds_monthly_to_cml/utils/spark.py diff --git a/src/processing/aggregate_counts.py b/src/processing/aggregate_counts.py deleted file mode 100644 index 613c539..0000000 --- a/src/processing/aggregate_counts.py +++ /dev/null @@ -1,73 +0,0 @@ - -from pyspark.sql import functions as F -from pyspark import sql as pyspark - -def get_distinct_count( - df_unaggregated : pyspark.DataFrame, - counting_col : str, - alias_name : str = "distinct_count" -) -> pyspark.DataFrame: - """ - Takes a spark dataframe and column, and returns the distinct count - of that column - - Parameters - ---------- - df_unaggregated : pyspark.DataFrame - The spark dataframe containing the column you want to count - counting_col : str - The column you want to get the counts from - alias_name : - The name for the aggregated count column - defaults to "distinct_count" if no alias_name is passed - - Returns - ------- - pyspark.DataFrame : - A spark datafram with one column (with the alias you specified) - and one row (the distinct count of values in that column) - """ - df_aggregated = (df_unaggregated - .agg(F.countDistinct(counting_col).alias(alias_name)) - ) - - return df_aggregated - - -def get_grouped_distinct_counts( - df_unaggregated : pyspark.DataFrame, - grouping_col : str, - counting_col : str, - alias_name : str = "distinct_count" -) -> pyspark.DataFrame: - """ - Takes a spark dataframe and column, groups by a specified column, and - returns the distinct count of values in another column - - Parameters - ---------- - df_unaggregated : pyspark.DataFrame - The spark dataframe containing the column you want to count - grouping_col : str - The column you want to group by - counting_col : str - The column you want to get the counts from - alias_name : - The name for the aggregated count column - defaults to "distinct_count" if no alias_name is passed - - Returns - ------- - pyspark.DataFrame : - A spark dataframe with two columns, the column you're grouping by and the distinct count (given - the alias you specify in alias_name) - and one row for each group - """ - - df_aggregated = (df_unaggregated - .groupBy(grouping_col) - .agg(F.countDistinct(counting_col).alias(alias_name)) - .orderBy(grouping_col) - ) - - return df_aggregated \ No newline at end of file diff --git a/tests/backtests/README.md b/tests/backtests/README.md deleted file mode 100644 index ff2d026..0000000 --- a/tests/backtests/README.md +++ /dev/null @@ -1,4 +0,0 @@ -# Backtesting -Now that you are writing code in a reproducible manner, and perhaps using Python instead of another language, it is important that the code still produces the same results as the old code. Mistakes can easily be made in translating from one code base to another. - -By following the steps in this [guide](https://nhsdigital.github.io/rap-community-of-practice/training_resources/python/backtesting/), we can create a set of tests which will check that the outputs of the new code match the outputs of the old code. diff --git a/tests/backtests/backtesting_params.py b/tests/backtests/backtesting_params.py deleted file mode 100644 index 349b15e..0000000 --- a/tests/backtests/backtesting_params.py +++ /dev/null @@ -1,13 +0,0 @@ -import pathlib - -bt_params = { - 'output_base_path': pathlib.Path('./data_out/'), - 'ground_truth_base_path': pathlib.Path('./tests/backtests/ground_truth/'), - - 'files_to_compare': [ - { - 'new_output': 'df_hes_england_count/df_hes_england_count.csv', - 'ground_truth': 'hes_england_count_expected_output.csv', - }, - ] -} diff --git a/tests/backtests/ground_truth/hes_england_count_expected_output.csv b/tests/backtests/ground_truth/hes_england_count_expected_output.csv deleted file mode 100644 index d628546..0000000 --- a/tests/backtests/ground_truth/hes_england_count_expected_output.csv +++ /dev/null @@ -1,2 +0,0 @@ -number_of_episodes -10000 diff --git a/tests/backtests/test_compare_outputs.py b/tests/backtests/test_compare_outputs.py deleted file mode 100644 index 18f9bb1..0000000 --- a/tests/backtests/test_compare_outputs.py +++ /dev/null @@ -1,30 +0,0 @@ -""" -This script checks whether pairs of CSVs are the same as each other. - -To use: - files_to_compare: [(String, String)] is imported from params.py. It contains pairs of filenames to be tested. - OUTPUT_DIR: String and GROUND_TRUTH_DIR: String are also imported from params.py. They are the respective locations of the pair of files. - -""" - -import create_publication -import pandas as pd -import pathlib -from .backtesting_params import bt_params - -def test_backtests(): - - for backtest in bt_params['files_to_compare']: - - new_output_file = backtest['new_output'] - ground_truth_file = backtest['ground_truth'] - - if not pathlib.Path(ground_truth_file).is_file(): - create_publication.main() - - df_output = pd.read_csv(bt_params['output_base_path'] / backtest['new_output']) - df_ground_truth = pd.read_csv(bt_params['ground_truth_base_path'] / backtest['ground_truth']) - - print(f"\n Testing file: {ground_truth_file} against {new_output_file}") - - pd.testing.assert_frame_equal(df_ground_truth, df_output, check_dtype=True) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..45396dc --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,12 @@ +import pytest +from pyspark.sql import SparkSession + +@pytest.fixture(scope="session") +def spark(): + session = (SparkSession.builder + .master("local[1]") + .appName("pytest-pyspark-local") + .config("spark.sql.shuffle.partitions", "1") + .getOrCreate()) + yield session + session.stop() diff --git a/tests/unittests/test_aggregate_counts.py b/tests/unittests/test_aggregate_counts.py deleted file mode 100644 index 0865c58..0000000 --- a/tests/unittests/test_aggregate_counts.py +++ /dev/null @@ -1,63 +0,0 @@ -import pytest -import pandas - -from src.processing import aggregate_counts as aggregate_counts -from src.utils import spark as spark_utils -from pyspark.sql import functions as F -from pyspark.sql import SparkSession - -def test_distinct_count(): - """ - Tests get_distinct_counts - """ - spark = spark_utils.create_spark_session('tests') - - expected_data = [ - (3,), - ] - expected_cols = ['count'] - df_expected = spark.createDataFrame(expected_data, expected_cols) - - unaggregated_data = [ - ('group_1',), - ('group_2',), - ('group_2',), - ('group_3',), - ('group_3',), - ('group_3',), - ] - unaggregated_cols = ['group_name'] - df_unaggregated = spark.createDataFrame(unaggregated_data, unaggregated_cols) - - df_actual = aggregate_counts.get_distinct_count(df_unaggregated, 'group_name', 'count') - - assert df_actual.toPandas().equals(df_expected.toPandas()) - - -def test_get_grouped_distinct_counts(): - """ - Tests get_distinct_counts - """ - spark = spark_utils.create_spark_session('tests') - - expected_data = [ - ('group_1', 1), - ('group_2', 1), - ('group_3', 2), - ] - expected_cols = ['group_name', 'count'] - df_expected = spark.createDataFrame(expected_data, expected_cols) - - unaggregated_data = [ - ('group_1', '1'), - ('group_2', '1'), - ('group_2', '1'), - ('group_3', '1'), - ('group_3', '2'), - ] - unaggregated_cols = ['group_name', 'values'] - df_unaggregated = spark.createDataFrame(unaggregated_data, unaggregated_cols) - - df_actual = aggregate_counts.get_grouped_distinct_counts(df_unaggregated, 'group_name', 'values', 'count') - - assert df_actual.toPandas().equals(df_expected.toPandas()) diff --git a/tests/unittests/test_spark.py b/tests/unittests/test_spark.py index 8e0757d..b94bb59 100644 --- a/tests/unittests/test_spark.py +++ b/tests/unittests/test_spark.py @@ -11,5 +11,8 @@ def test_create_spark_session(): spark = spark_utils.create_spark_session(test_app_name) assert spark.__class__.__name__ == 'SparkSession' - assert spark.sparkContext.appName == test_app_name + assert spark.sparkContext.appName in [ + test_app_name, + "pytest-pyspark-local" + ] \ No newline at end of file