Refactor product creator, remove threading for writing to disk

piv
2023-03-11 10:55:41 +10:30
parent 363c972b71
commit 7cd893cbf8
5 changed files with 250 additions and 256 deletions


@@ -0,0 +1,139 @@
use std::{
    collections::HashMap,
    io::{Read, Write},
};
use chrono::NaiveDateTime;
use csv::Position;
use serde::Serialize;
use super::csv::{read_definitions, BuildFrom, ConstraintType, Definition};
#[derive(Debug, Serialize, Default)]
struct Product {
    // Parse datetime from string: https://rust-lang-nursery.github.io/rust-cookbook/datetime/parse.html#parse-string-into-datetime-struct
    // TODO: Serialisers; a hedged sketch follows this struct.
    start_date_time: NaiveDateTime,
    end_date_time: NaiveDateTime,
    encounter_start_date_time: Option<NaiveDateTime>,
    encounter: Option<String>,
    service: Option<String>,
    transfer: Option<String>,
    quantity: Option<f64>,
    duration: Option<f64>,
    actual_charge: Option<f64>,
    standard_cost: Option<f64>,
    // TODO: Enum this?
    day_of_stay: Option<u8>,
    source_allocated_amount: Option<f64>,
}
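// A hedged sketch of the parsing and serialisation mentioned in the TODOs above:
// standard chrono/serde usage, but the format strings are assumptions about the
// data rather than anything this commit specifies.
#[allow(dead_code)]
fn parse_date_time(raw: &str) -> Result<NaiveDateTime, chrono::ParseError> {
    NaiveDateTime::parse_from_str(raw, "%Y-%m-%d %H:%M:%S")
}
#[allow(dead_code)]
fn serialize_date_time<S: serde::Serializer>(
    date_time: &NaiveDateTime,
    serializer: S,
) -> Result<S::Ok, S::Error> {
    // Write the datetime back out in the same assumed format; attach to a field
    // with #[serde(serialize_with = "serialize_date_time")].
    serializer.serialize_str(&date_time.format("%Y-%m-%d %H:%M:%S").to_string())
}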
// TODO: Build from linked dataset is pretty hard; it potentially requires knowing everything about the previous year's
// costing run (BSCO, Dataset_Encounter_Cache, etc.).
pub fn create_products<D, E, S, T, P, Di, O>(
    definitions: &mut csv::Reader<D>,
    encounters: &mut csv::Reader<E>,
    services: &mut csv::Reader<S>,
    transfers: &mut csv::Reader<T>,
    procedures: &mut csv::Reader<P>,
    diagnoses: &mut csv::Reader<Di>,
    output: &mut csv::Writer<O>,
    // TODO: Default to 10 million or something sane.
    batch_size: usize,
) -> anyhow::Result<()>
where
    D: Read,
    E: Read,
    S: Read,
    T: Read,
    P: Read,
    Di: Read,
    // Send + 'static is no longer needed now that the writer isn't moved onto a thread.
    O: Write,
{
    let all_definitions: HashMap<String, Definition> = read_definitions(definitions)?;
    // Partition the rules by their build-from type, so that we run all the rules at once for a particular file,
    // which should be much faster than opening and scanning the files one at a time. Could also process each file in batches.
    let mut mapped_definitions: HashMap<BuildFrom, Vec<Definition>> = HashMap::new();
    for definition in all_definitions.into_values() {
        mapped_definitions
            .entry(definition.build_from)
            .or_default()
            .push(definition);
    }
    // Now whenever we want to produce a built service, we serialize it straight to `output`.
    // Note that the csv crate can seek to a saved position, so we can read in a batch from a reader, then
    // seek back to that position (or to position 0) if we couldn't find a particular record.
    // Alternatively, we could store an index of all records (e.g. encounter numbers) that maps to their positions in the reader,
    // so we can quickly seek to the appropriate offset and read the record.
    // https://docs.rs/csv/latest/csv/struct.Reader.html#method.seek
    // Store encounter positions in the file, so that later when we read through transfers and the like we can
    // seek to the correct position quickly on a cache miss; a hedged sketch of that lookup follows this function.
    let mut encounter_positions: HashMap<String, Position> = HashMap::new();
    // TODO: An alternative to storing encounter positions would be to sort portions of the file a chunk at a time (an external merge sort).
    // TODO: Try with and without rayon; it should help, since we're scanning so much data sequentially,
    // although we're still likely to be bottlenecked by raw write speed.
    let headers = encounters.headers()?.clone();
    for encounter in encounters.records() {
        let encounter = encounter?;
        let position = encounter
            .position()
            .expect("records read from a reader always have a position")
            .clone();
        let encounter: HashMap<String, String> = encounter.deserialize(Some(&headers))?;
        encounter_positions.insert(
            encounter
                .get("EncounterNumber")
                .ok_or_else(|| anyhow::anyhow!("encounter row is missing EncounterNumber"))?
                .to_string(),
            position,
        );
        // For each encounter definition, check whether this record fits the filter criteria and constraints,
        // and generate the built service for it if it does.
        let definitions = mapped_definitions
            .get(&BuildFrom::Encounter)
            .map(Vec::as_slice)
            .unwrap_or_default();
        for definition in definitions {
            let matching_filter = (definition.filters.is_empty()
                || definition.filters.iter().any(|filter| {
                    // A row without the filtered field can never match.
                    let Some(field) = encounter.get(filter.field.as_str()) else {
                        return false;
                    };
                    // `equal` flips the comparison between "must match" and "must differ".
                    (filter.value == *field) == filter.equal
                }))
                && (definition.constraints.is_empty()
                    || definition.constraints.iter().any(|constraint| {
                        let Some(field) = encounter.get(constraint.field.as_str()) else {
                            return false;
                        };
                        // TODO: Is the source type just number/datetime? Should probably be an enum?
                        // It doesn't look like it; it seems to be "E" in the test data.
                        let _field_type = &constraint.source_type;
                        match constraint.constraint_type {
                            ConstraintType::Equal => *field == constraint.value,
                            // Other constraint types aren't handled yet.
                            _ => false,
                        }
                    }));
            if matching_filter {
                // Generate the service code
            }
        }
        // TODO: Generate the built service properly instead of a default product.
        output.serialize(Product::default())?;
    }
    // Now do the same with transfers, services, etc., referencing the encounter reader via the
    // indexes in encounter_positions.
    Ok(())
}
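// A hedged sketch of the cache-miss lookup described inside `create_products`:
// seek the reader back to a stored position and read the record found there.
// Note this needs the underlying reader to be `Seek` as well, which the bounds
// above don't currently require; the name and signature are illustrative only.
#[allow(dead_code)]
fn read_record_at<R: Read + std::io::Seek>(
    reader: &mut csv::Reader<R>,
    positions: &HashMap<String, Position>,
    key: &str,
) -> anyhow::Result<Option<csv::StringRecord>> {
    // No stored position means the key was never seen on the first pass.
    let Some(position) = positions.get(key) else {
        return Ok(None);
    };
    // After seeking, the next record read is the one recorded at `position`.
    reader.seek(position.clone())?;
    let mut record = csv::StringRecord::new();
    let found = reader.read_record(&mut record)?;
    Ok(found.then_some(record))
}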