ingey/src/products/create_products.rs

use std::{collections::HashMap, path::{Path, PathBuf}};
use anyhow::anyhow;
use chrono::NaiveDateTime;
use itertools::Itertools;
// Including dsl works better for completion with rust-analyzer.
use polars::lazy::dsl::*;
use polars::prelude::*;
use serde::Serialize;
use super::csv::{read_definitions, Definition, FileJoin, FilterType, SourceType};
// TODO: Polars recommends jemalloc, but its docs note that platform support is limited.
//use jemallocator::Jemalloc;
// #[global_allocator]
// static GLOBAL: Jemalloc = Jemalloc;
#[derive(Debug, Serialize, Default)]
struct Product {
    // Parse datetime from string: https://rust-lang-nursery.github.io/rust-cookbook/datetime/parse.html#parse-string-into-datetime-struct
    // TODO: Serialisers (see the sketch after this struct).
    start_date_time: NaiveDateTime,
    end_date_time: NaiveDateTime,
    encounter_start_date_time: Option<NaiveDateTime>,
    encounter: Option<String>,
    service: Option<String>,
    transfer: Option<String>,
    quantity: Option<f64>,
    duration: Option<f64>,
    actual_charge: Option<f64>,
    standard_cost: Option<f64>,
    // TODO: Enum this?
    day_of_stay: Option<u8>,
    source_allocated_amount: Option<f64>,
}
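// A minimal sketch of the parsing and serialising the TODOs above refer to.
// The "%Y-%m-%d %H:%M:%S" format string is an assumption about the source
// data; the real inputs may use a different layout.
#[allow(dead_code)]
fn parse_date_time(raw: &str) -> anyhow::Result<NaiveDateTime> {
    Ok(NaiveDateTime::parse_from_str(raw, "%Y-%m-%d %H:%M:%S")?)
}

// A serde serialiser for the Option<NaiveDateTime> fields could then be wired
// up per field with #[serde(serialize_with = "ser_opt_date_time")].
#[allow(dead_code)]
fn ser_opt_date_time<S: serde::Serializer>(
    value: &Option<NaiveDateTime>,
    serializer: S,
) -> Result<S::Ok, S::Error> {
    match value {
        // Format the datetime as a string; fall back to a serde "none" unit.
        Some(dt) => serializer.serialize_str(&dt.format("%Y-%m-%d %H:%M:%S").to_string()),
        None => serializer.serialize_none(),
    }
}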
pub struct InputFile {
    pub file_path: PathBuf,
    pub joins: Vec<FileJoin>,
}
pub fn create_products_polars(
    definitions_path: PathBuf,
    inputs: HashMap<SourceType, InputFile>,
    output_path: PathBuf,
) -> anyhow::Result<()> {
    let definitions = read_definitions(&mut csv::Reader::from_path(definitions_path)?)?;
    let definitions = definitions.values().collect_vec();
    for definition in definitions {
        build_polars(definition, &inputs, &output_path)?;
    }
    Ok(())
}
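// Example wiring, as a sketch only. The SourceType variant and file names are
// illustrative; the real variants are defined in super::csv.
//
// let mut inputs = HashMap::new();
// inputs.insert(
//     SourceType::Theatre, // hypothetical variant
//     InputFile { file_path: PathBuf::from("theatre.csv"), joins: vec![] },
// );
// create_products_polars(PathBuf::from("definitions.csv"), inputs, PathBuf::from("products.csv"))?;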
// TODO: This iterates over the input file once per definition, which could be
// slower than a single pass, since reading from disk is slower than reading
// from memory.
// We could also use a custom definition format translated from the ppm format,
// so that constraints/filters become a single, more generic concept
// (e.g. a filter could be based on a join between files).
pub fn build_polars(
    definition: &Definition,
    inputs: &HashMap<SourceType, InputFile>,
    output_path: &Path,
) -> anyhow::Result<()> {
    // 1. Apply filters to limit encounters.
    let filter = definition
        .filters
        .iter()
        .map(|filter| {
            let col = col(&filter.field);
            let value = lit(filter.value.clone());
            match filter.filter_type {
                FilterType::Equal => col.eq(value),
                FilterType::GreaterThan => col.gt(value),
                FilterType::GreaterThanOrEqualTo => col.gt_eq(value),
                FilterType::LessThan => col.lt(value),
                FilterType::LessThanOrEqualTo => col.lt_eq(value),
                FilterType::NotEqualTo => col.neq(value),
            }
        })
        .reduce(|prev, next| prev.and(next));
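    // For example, two filters [ward == "ICU", los > "3"] reduce to the single
    // expression col("ward").eq(lit("ICU")).and(col("los").gt(lit("3"))),
    // while an empty filter list yields None and the frame passes through
    // unfiltered below.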
    let input_file = inputs
        .get(&definition.source_type)
        .ok_or_else(|| anyhow!("failed to find an input file for the definition's source type"))?;
    let reader = LazyCsvReader::new(&input_file.file_path)
        .has_header(true)
        .finish()?;
    // TODO: Do joins based on usage in the definition's components and filters,
    // ideally joining only the columns that are actually wanted. This could be
    // done by first scanning each component/filter for the columns it references.
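    // A rough sketch of what the join pass could look like. This assumes,
    // hypothetically, that FileJoin carries `file_path`, `left_on` and
    // `right_on` fields; the real shape of FileJoin is defined in super::csv.
    //
    // let mut joined = reader;
    // for join in &input_file.joins {
    //     let other = LazyCsvReader::new(&join.file_path).has_header(true).finish()?;
    //     joined = joined.left_join(other, col(&join.left_on), col(&join.right_on));
    // }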
    let mut filtered = match filter {
        Some(filter) => reader.filter(filter),
        None => reader,
    }
    .with_streaming(true)
    .collect()?;
    // TODO: For each filtered record, build the output record (quantity and so
    // on) from the definition's components.
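    // A sketch of the shape that step could take, applied before the
    // .collect() above. The column names here are hypothetical; the real
    // component fields are defined on Definition in super::csv.
    //
    // let built = lazy_frame.select([
    //     col("admit_time").alias("start_date_time"),
    //     col("units").alias("quantity"),
    //     col("charge").alias("actual_charge"),
    // ]);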
    let mut file = std::fs::File::create(output_path)?;
    // TODO: Write the built product columns rather than the filtered input.
    CsvWriter::new(&mut file).finish(&mut filtered)?;
    Ok(())
}