use std::{
    collections::HashMap,
    io::{Read, Write},
};

use chrono::NaiveDateTime;
use csv::Position;
// including dsl works better for completion with rust analyzer
use polars::lazy::dsl::*;
use polars::prelude::*;
use serde::Serialize;

use super::csv::{read_definitions, BuildFrom, Component, ConstraintType, Definition, SourceType};

// TODO: Polars suggests this, but docs suggest it doesn't have very good platform support
//use jemallocator::Jemalloc;

// #[global_allocator]
// static GLOBAL: Jemalloc = Jemalloc;

#[derive(Debug, Serialize, Default)]
struct Product {
    // Parse datetime from string: https://rust-lang-nursery.github.io/rust-cookbook/datetime/parse.html#parse-string-into-datetime-struct
    // TODO: Serialisers.
    // TODO: Confirm these field types; the numeric/string payloads inside the Options below are placeholders.
    start_date_time: NaiveDateTime,
    end_date_time: NaiveDateTime,
    encounter_start_date_time: Option<NaiveDateTime>,
    encounter: Option<String>,
    service: Option<String>,
    transfer: Option<String>,
    quantity: Option<f64>,
    duration: Option<f64>,
    actual_charge: Option<f64>,
    standard_cost: Option<f64>,
    // TODO: Enum this?
    day_of_stay: Option<u32>,
    source_allocated_amount: Option<f64>,
}
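// Sketch for the `TODO: Serialisers` note above: a field-level serializer that could be
// attached with `#[serde(serialize_with = "serialize_naive_date_time")]` if the output CSV
// needs a specific datetime format. The function name and format string are assumptions.
#[allow(dead_code)]
fn serialize_naive_date_time<S>(date_time: &NaiveDateTime, serializer: S) -> Result<S::Ok, S::Error>
where
    S: serde::Serializer,
{
    // Render the datetime as a plain string so csv/serde writes it verbatim.
    serializer.serialize_str(&date_time.format("%Y-%m-%d %H:%M:%S").to_string())
}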

pub struct CreateProductInputs<E, S, T, P, Di>
where
    E: Read,
    S: Read,
    T: Read,
    P: Read,
    Di: Read,
{
    pub encounters: csv::Reader<E>,
    pub services: csv::Reader<S>,
    pub transfers: csv::Reader<T>,
    pub procedures: csv::Reader<P>,
    pub diagnoses: csv::Reader<Di>,
}

pub fn create_products_polars(
    definitions_path: String,
    patients_path: String,
    encounters_path: String,
    services_path: String,
    transfers_path: String,
    procedures_path: String,
    diagnoses_path: String,
    output_path: String,
) -> anyhow::Result<()> {
    let mut all_definitions: HashMap<String, Definition> =
        read_definitions(&mut csv::Reader::from_path(definitions_path)?)?;

    for (key, definition) in all_definitions {
        match definition.build_from {
            BuildFrom::Encounter => build_encounters_polars(
                definition,
                encounters_path.clone(),
                patients_path.clone(),
                output_path.clone(),
            ),
            BuildFrom::Service => todo!(),
            BuildFrom::Transfer => todo!(),
            BuildFrom::CodingProcedure => todo!(),
            BuildFrom::CodingDiagnosis => todo!(),
            BuildFrom::LinkedDataset => todo!(),
            BuildFrom::Revenue => todo!(),
        }?;
    }

    Ok(())
}

// TODO: Build from linked dataset is pretty hard, it potentially requires knowing everything about the previous year's
// costing run (BSCO, Dataset_Encounter_Cache, etc).
pub fn create_products<D, E, S, T, P, Di, O>(
    definitions: &mut csv::Reader<D>,
    product_inputs: CreateProductInputs<E, S, T, P, Di>,
    // TODO: Looks kind of bad, any other way around it? I'd rather not have to depend on crossbeam as well
    output: &mut csv::Writer<O>,
    // TODO: Default to 10 million or something sane
    batch_size: usize,
) -> anyhow::Result<()>
where
    D: Read,
    E: Read,
    S: Read,
    T: Read,
    P: Read,
    Di: Read,
    // TODO: Looks kind of bad, any other way around it? I'd rather not have to depend on crossbeam as well
    O: Write + Send + 'static,
{
    let mut all_definitions: HashMap<String, Definition> = read_definitions(definitions)?;

    // Partition the rules by the build from type, so that we'll run all the rules at once for a particular file,
    // which should be much faster than opening files and scanning one at a time. Could also do batches in files
    let mut mapped_definitions: HashMap<BuildFrom, Vec<Definition>> = HashMap::new();
    for (_, definition) in all_definitions {
        mapped_definitions
            .entry(definition.build_from)
            .or_insert(vec![])
            .push(definition);
    }

    // Now whenever we want to produce a built service, just write it to tx.

    // Note that rust csv can seek to a certain position, so we can read in a batch from a reader, then
    // seek to that position in the reader (or position 0) if we couldn't find a particular record.
    // Alternatively, we could store an index of all records (e.g. encounter numbers) that map to their position in the reader,
    // so we can quickly seek to the appropriate index and read the record.
    // https://docs.rs/csv/latest/csv/struct.Reader.html#method.seek

    // Store encounter positions in the file, so that later when we read through transfers/whatever we can easily
    // seek to the correct position quickly in case we have a cache miss
    let mut encounter_positions: HashMap<String, Position> = HashMap::new();

    // TODO: Consider just using polars for this, can use streaming so don't need to worry about
    // running out of memory or being super efficient, can just focus on the code and let the query
    // optimiser sort things out (main reason I don't like this though is it may be indeterminate in
    // runtime speed, will just need to see how it performs I guess, so maybe just make a separate implementation)

    // TODO: Alternative to storing encounter positions would be to sort portions of the file a chunk at a time (I think it's called a merge sort?).
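    // Sketch of the seek-on-cache-miss idea described above; an assumption about shape rather
    // than wired-up code. Note the reader being seeked needs an `io::Seek` bound as well, which
    // the readers in `CreateProductInputs` currently don't require.
    fn _read_record_at<R: Read + std::io::Seek>(
        reader: &mut csv::Reader<R>,
        position: &Position,
    ) -> anyhow::Result<Option<csv::StringRecord>> {
        // Jump straight back to the stored record position...
        reader.seek(position.clone())?;
        // ...and read the single record that starts there.
        let mut record = csv::StringRecord::new();
        if reader.read_record(&mut record)? {
            Ok(Some(record))
        } else {
            Ok(None)
        }
    }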
    // TODO: Try with and without rayon, should be able to help I think as we're going through so much data sequentially,
    // although we're still likely to be bottlenecked by just write-speed (see the sketch after this function).
    let mut encounters = product_inputs.encounters;
    let headers = encounters.headers()?.clone();
    for encounter in encounters.records() {
        let encounter = encounter?;
        let position = encounter.position().unwrap();
        let encounter: HashMap<String, String> = encounter.deserialize(Some(&headers))?;
        encounter_positions.insert(
            encounter.get("EncounterNumber").unwrap().to_string(),
            position.clone(),
        );

        // TODO: For each encounter definition, check this fits the filter criteria/constraints,
        // and
        let definitions = mapped_definitions.get(&BuildFrom::Encounter).unwrap();
        for definition in definitions {
            let matching_filter = (definition.filters.is_empty()
                || definition.filters.iter().any(|filter| {
                    let field = encounter.get(filter.field.as_str());
                    if field.is_none() {
                        return false;
                    }
                    let field = field.unwrap();
                    if filter.equal {
                        filter.value == *field
                    } else {
                        filter.value != *field
                    }
                }))
                && (definition.constraints.is_empty()
                    || definition.constraints.iter().any(|constraint| {
                        let field = encounter.get(constraint.field.as_str());
                        if field.is_none() {
                            return false;
                        }
                        let field = field.unwrap();
                        // TODO: Is this just number/datetime? Should probably be an enum? It's not, seems to be E in the test data
                        let field_type = &constraint.source_type;
                        match constraint.constraint_type {
                            ConstraintType::Equal => *field == constraint.value,
                            _ => false,
                        }
                    }));

            if matching_filter {
                // Generate the service code
            }
        }

        // TODO: Generate the built service
        output.serialize(Product::default())?;
    }

    // Now do the same with transfers, services, etc, referencing the encounter reader by using the
    // indexes in encounter_positions

    Ok(())
}
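// Sketch for the rayon TODO inside `create_products` above. It is left commented out because
// rayon is not currently a dependency; the idea is to pull `batch_size` records off the
// single-threaded CSV reader and then evaluate the definitions for the whole batch in parallel.
// The map closure is a placeholder for the filter/constraint matching done in the loop above.
//
// use rayon::prelude::*;
//
// let batch: Vec<csv::StringRecord> = encounters
//     .records()
//     .take(batch_size)
//     .collect::<Result<_, _>>()?;
// let products: Vec<Product> = batch
//     .par_iter()
//     .map(|_record| Product::default())
//     .collect();
// for product in products {
//     output.serialize(product)?;
// }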
// TODO: This will iterate over the file multiple times, which could technically be
// slower than just going through the file once since reading from disk is slower
// than reading from memory. However, reading from
pub fn build_encounters_polars(
    definition: Definition,
    encounters_path: String,
    patients_path: String,
    output_path: String,
) -> anyhow::Result<()> {
    // 1. Apply filters/constraints to limit encounters
    let filter = definition
        .filters
        .iter()
        .map(|filter| {
            // TODO: Filter field depends on type, as extra/classification need to append extra/classification
            // but how do we check this?
            let col = col(&filter.field);
            if filter.equal {
                col.eq(lit(filter.value.clone()))
            } else {
                col.neq(lit(filter.value.clone()))
            }
        })
        .reduce(|prev, next| prev.and(next))
        // An empty filter list should match everything rather than panic
        .unwrap_or(lit(true));

    let constraint = definition
        .constraints
        .iter()
        .map(|constraint| {
            let col = col(&constraint.field);
            // TODO: Might need to do a cast here
            match constraint.constraint_type {
                ConstraintType::Equal => col.eq(lit(constraint.value.clone())),
                ConstraintType::GreaterThan => col.gt(lit(constraint.value.clone())),
                ConstraintType::GreaterThanOrEqualTo => col.gt_eq(lit(constraint.value.clone())),
                ConstraintType::LessThan => col.lt(lit(constraint.value.clone())),
                ConstraintType::LessThanOrEqualTo => col.lt_eq(lit(constraint.value.clone())),
                ConstraintType::NotEqualTo => col.neq(lit(constraint.value.clone())),
            }
        })
        .reduce(|prev, next| prev.and(next))
        // Likewise, no constraints means no restriction
        .unwrap_or(lit(true));

    // TODO: If constraints or components include patient field, then we need to join onto
    // the patient file.
    let mut reader = LazyCsvReader::new(encounters_path)
        .has_header(true)
        .finish()?;

    // TODO: Refactor me
    if definition
        .constraints
        .iter()
        .any(|c| c.source_type == SourceType::Patient)
        || definition
            .filters
            .iter()
            .any(|f| f.source_type == SourceType::Patient)
        || definition.components.iter().any(|c| match c {
            Component::Field(source, _) => source == "P",
            _ => false,
        })
    {
        let patient_reader = LazyCsvReader::new(patients_path)
            .has_header(true)
            .finish()?;
        reader = reader.join(
            patient_reader,
            [col("Id")],
            [col("Patient_Id")],
            JoinArgs::new(JoinType::Inner),
        );
    }

    let filtered = reader
        .filter(filter.and(constraint))
        .with_streaming(true)
        .collect()?;

    // TODO: Now for each of the filtered records, create a new record that is the built record, based on the components
    // quantity, etc. from the definition
    let mut file = std::fs::File::create(&output_path).unwrap();
    let mut writer = CsvWriter::new(&mut file);

    // 2. Create the built services and write to disk
    // TODO: Kind of sucks we need to collect into memory, see if there's a better way to do this later (otherwise can't use polars)
    // writer
    //     .finish(&mut reader.with_streaming(true).collect()?)
    //     .unwrap();

    Ok(())
}
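// Sketch of the write-out step that is still commented out above. It assumes the built-product
// columns have already been derived into `df` from the definition components (which is still
// TODO); `CsvWriter::finish` takes the DataFrame by mutable reference.
#[allow(dead_code)]
fn write_products_csv(df: &mut DataFrame, output_path: &str) -> anyhow::Result<()> {
    let mut file = std::fs::File::create(output_path)?;
    // Headers are written by default; formatting options can be set on the writer if needed.
    CsvWriter::new(&mut file).finish(df)?;
    Ok(())
}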