use std::{collections::HashMap, path::PathBuf};

use anyhow::anyhow;
use chrono::NaiveDateTime;
use itertools::Itertools;
// Including the dsl explicitly works better for completion with rust-analyzer.
use polars::lazy::dsl::*;
use polars::prelude::*;
use serde::Serialize;

use super::csv::{read_definitions, Definition, FileJoin, SourceType};

// TODO: Polars suggests this, but the docs suggest it doesn't have very good platform support.
// use jemallocator::Jemalloc;
// #[global_allocator]
// static GLOBAL: Jemalloc = Jemalloc;

#[derive(Debug, Serialize, Default)]
struct Product {
    // Parse datetime from string: https://rust-lang-nursery.github.io/rust-cookbook/datetime/parse.html#parse-string-into-datetime-struct
    // TODO: Serialisers.
    // NB: the Option payload types below are reconstructed (the generics were lost in
    // extraction); adjust them to match the source data.
    start_date_time: NaiveDateTime,
    end_date_time: NaiveDateTime,
    encounter_start_date_time: Option<NaiveDateTime>,
    encounter: Option<String>,
    service: Option<String>,
    transfer: Option<String>,
    quantity: Option<f64>,
    duration: Option<f64>,
    actual_charge: Option<f64>,
    standard_cost: Option<f64>,
    // TODO: Enum this?
    day_of_stay: Option<String>,
    source_allocated_amount: Option<f64>,
}

pub struct InputFile {
    pub file_path: PathBuf,
    pub joins: Vec<FileJoin>,
}

pub fn create_products_polars(
    definitions_path: PathBuf,
    inputs: HashMap<SourceType, InputFile>,
    output_path: PathBuf,
) -> anyhow::Result<()> {
    let definitions = read_definitions(&mut csv::Reader::from_path(definitions_path)?)?;
    let definitions = definitions.values().collect_vec();

    for definition in definitions {
        build_polars(definition, &inputs, &output_path)?;
    }

    Ok(())
}

// TODO: This iterates over the input file once per definition, which could technically be
// slower than going through the file once, since reading from disk is slower than reading
// from memory. However, reading from
// Also, we could use a custom definition format translated from the ppm format, so that
// constraints/filters become one, far more generic concept (e.g. a filter could be based
// on a join between files).
pub fn build_polars(
    definition: &Definition,
    inputs: &HashMap<SourceType, InputFile>,
    output_path: &PathBuf,
) -> anyhow::Result<()> {
    // 1. Apply filters to limit encounters: build one boolean expression per filter,
    // then AND them all into a single predicate.
    let filter = definition
        .filters
        .iter()
        .map(|filter| {
            let col = col(&filter.field);
            match filter.filter_type {
                super::csv::FilterType::Equal => col.eq(lit(filter.value.clone())),
                super::csv::FilterType::GreaterThan => col.gt(lit(filter.value.clone())),
                super::csv::FilterType::GreaterThanOrEqualTo => {
                    col.gt_eq(lit(filter.value.clone()))
                }
                super::csv::FilterType::LessThan => col.lt(lit(filter.value.clone())),
                super::csv::FilterType::LessThanOrEqualTo => col.lt_eq(lit(filter.value.clone())),
                super::csv::FilterType::NotEqualTo => col.neq(lit(filter.value.clone())),
            }
        })
        .reduce(|prev, next| prev.and(next));

    let input_file = inputs
        .get(&definition.source_type)
        .ok_or_else(|| anyhow!("no input file registered for the definition's source type"))?;

    let reader = LazyCsvReader::new(&input_file.file_path)
        .has_header(true)
        .finish()?;

    // TODO: Do joins based on usage in the definition's components and filters. Ideally join
    // only the columns that are actually wanted. Can do this by first going over each
    // component/filter, and
    let mut filtered = match filter {
        Some(filter) => reader.filter(filter),
        None => reader,
    }
    .with_streaming(true)
    .collect()?;

    // TODO: For each filtered record, build the product record (quantity etc.) from the
    // definition's components.
    let mut file = std::fs::File::create(output_path)?;

    // TODO: Don't write `filtered` directly; write the result that contains the built
    // product columns.
    CsvWriter::new(&mut file).finish(&mut filtered)?;

    Ok(())
}
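
// A minimal, self-contained sketch (not part of the original module) of the
// predicate-folding pattern used in `build_polars` above: one `Expr` per filter,
// AND-ed into a single predicate via `reduce`. The column names and data here are
// assumptions for illustration only.
#[cfg(test)]
mod filter_fold_sketch {
    use polars::lazy::dsl::*;
    use polars::prelude::*;

    #[test]
    fn folds_filters_into_one_predicate() -> PolarsResult<()> {
        let df = df!(
            "quantity" => &[1i64, 5, 10],
            "charge" => &[100i64, 200, 300],
        )?;

        // Mirrors the `map(..).reduce(|prev, next| prev.and(next))` chain in `build_polars`.
        let predicates = vec![col("quantity").gt(lit(1i64)), col("charge").lt(lit(300i64))];
        let combined = predicates
            .into_iter()
            .reduce(|prev, next| prev.and(next))
            .expect("at least one predicate");

        let filtered = df.lazy().filter(combined).collect()?;
        // Only the row (quantity = 5, charge = 200) satisfies both predicates.
        assert_eq!(filtered.height(), 1);
        Ok(())
    }
}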