From befce9f60a6b39f98b87c0ffef25af8acd4b3bc6 Mon Sep 17 00:00:00 2001
From: Piv <18462828+Piv200@users.noreply.github.com>
Date: Sun, 4 Jun 2023 21:47:02 +0930
Subject: [PATCH] Start adding linking

---
 src/lib.rs  |  94 +++++++++++----------------------
 src/link.rs | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 149 insertions(+), 68 deletions(-)
 create mode 100644 src/link.rs

diff --git a/src/lib.rs b/src/lib.rs
index 14e3685..5663f51 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -13,6 +13,8 @@ pub use self::products::create_products;
 mod shared_models;
 pub use self::shared_models::*;
 
+pub mod link;
+
 #[no_mangle]
 pub extern "C" fn move_money_from_text(
     rules: *const c_char,
@@ -22,22 +24,10 @@ pub extern "C" fn move_money_from_text(
     use_numeric_accounts: bool,
 ) -> *mut c_char {
     let mut output_writer = csv::Writer::from_writer(vec![]);
-    let safe_rules = unsafe {
-        assert!(!rules.is_null());
-        CStr::from_ptr(rules)
-    };
-    let safe_lines = unsafe {
-        assert!(!lines.is_null());
-        CStr::from_ptr(lines)
-    };
-    let safe_accounts = unsafe {
-        assert!(!accounts.is_null());
-        CStr::from_ptr(accounts)
-    };
-    let safe_cost_centres = unsafe {
-        assert!(!cost_centres.is_null());
-        CStr::from_ptr(cost_centres)
-    };
+    let safe_rules = unwrap_c_char(rules);
+    let safe_lines = unwrap_c_char(lines);
+    let safe_accounts = unwrap_c_char(accounts);
+    let safe_cost_centres = unwrap_c_char(cost_centres);
     move_money(
         &mut csv::Reader::from_reader(safe_rules.to_bytes()),
         &mut csv::Reader::from_reader(safe_lines.to_bytes()),
@@ -81,30 +71,12 @@ pub extern "C" fn allocate_overheads_from_text(
     account_type: *const c_char,
     use_numeric_accounts: bool,
 ) -> *mut c_char {
-    let lines = unsafe {
-        assert!(!lines.is_null());
-        CStr::from_ptr(lines)
-    };
-    let accounts = unsafe {
-        assert!(!accounts.is_null());
-        CStr::from_ptr(accounts)
-    };
-    let allocation_statistics = unsafe {
-        assert!(!allocation_statistics.is_null());
-        CStr::from_ptr(allocation_statistics)
-    };
-    let areas = unsafe {
-        assert!(!areas.is_null());
-        CStr::from_ptr(areas)
-    };
-    let cost_centres = unsafe {
-        assert!(!cost_centres.is_null());
-        CStr::from_ptr(cost_centres)
-    };
-    let account_type = unsafe {
-        assert!(!account_type.is_null());
-        CStr::from_ptr(account_type)
-    };
+    let lines = unwrap_c_char(lines);
+    let accounts = unwrap_c_char(accounts);
+    let allocation_statistics = unwrap_c_char(allocation_statistics);
+    let areas = unwrap_c_char(areas);
+    let cost_centres = unwrap_c_char(cost_centres);
+    let account_type = unwrap_c_char(account_type);
     let mut output_writer = csv::Writer::from_writer(vec![]);
     reciprocal_allocation(
         &mut csv::Reader::from_reader(lines.to_bytes()),
@@ -142,34 +114,13 @@ pub extern "C" fn allocate_overheads_from_text_to_file(
     use_numeric_accounts: bool,
     show_from: bool,
 ) {
-    let lines = unsafe {
-        assert!(!lines.is_null());
-        CStr::from_ptr(lines)
-    };
-    let accounts = unsafe {
-        assert!(!accounts.is_null());
-        CStr::from_ptr(accounts)
-    };
-    let allocation_statistics = unsafe {
-        assert!(!allocation_statistics.is_null());
-        CStr::from_ptr(allocation_statistics)
-    };
-    let areas = unsafe {
-        assert!(!areas.is_null());
-        CStr::from_ptr(areas)
-    };
-    let cost_centres = unsafe {
-        assert!(!cost_centres.is_null());
-        CStr::from_ptr(cost_centres)
-    };
-    let account_type = unsafe {
-        assert!(!account_type.is_null());
-        CStr::from_ptr(account_type)
-    };
-    let output_path = unsafe {
-        assert!(!output_path.is_null());
-        CStr::from_ptr(output_path)
-    };
+    let lines = unwrap_c_char(lines);
+    let accounts = unwrap_c_char(accounts);
+    let allocation_statistics = unwrap_c_char(allocation_statistics);
+    let areas = unwrap_c_char(areas);
+    let cost_centres = unwrap_c_char(cost_centres);
+    let account_type = unwrap_c_char(account_type);
+    let output_path = unwrap_c_char(output_path);
     reciprocal_allocation(
         &mut csv::Reader::from_reader(lines.to_bytes()),
         &mut csv::Reader::from_reader(accounts.to_bytes()),
@@ -187,6 +138,13 @@ pub extern "C" fn allocate_overheads_from_text_to_file(
     .expect("Failed to allocate overheads");
 }
 
+fn unwrap_c_char<'a>(s: *const c_char) -> &'a CStr {
+    unsafe {
+        assert!(!s.is_null());
+        CStr::from_ptr(s)
+    }
+}
+
 #[no_mangle]
 pub extern "C" fn allocate_overheads_from_text_free(s: *mut c_char) {
     unsafe {
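For reference, the refactored entry points can be exercised from Rust itself, for example in an integration-style check inside the crate. This is a minimal sketch, assuming the argument order implied by the bindings above and that the output pointer is returned to the name-matched free routine; the CSV contents are invented placeholders:

use std::ffi::{CStr, CString};

fn demo() {
    // Invented header-only CSVs, just enough to form valid C strings; real
    // inputs would carry the schemas reciprocal_allocation expects.
    let lines = CString::new("line").unwrap();
    let accounts = CString::new("account").unwrap();
    let allocation_statistics = CString::new("statistic").unwrap();
    let areas = CString::new("area").unwrap();
    let cost_centres = CString::new("cost_centre").unwrap();
    let account_type = CString::new("Overhead").unwrap();
    let out = allocate_overheads_from_text(
        lines.as_ptr(),
        accounts.as_ptr(),
        allocation_statistics.as_ptr(),
        areas.as_ptr(),
        cost_centres.as_ptr(),
        account_type.as_ptr(),
        false,
    );
    // The result is an owned C string: copy it out, then hand the pointer
    // back so the library can reclaim it.
    let csv_out = unsafe { CStr::from_ptr(out) }.to_string_lossy().into_owned();
    allocate_overheads_from_text_free(out);
    println!("{csv_out}");
}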
diff --git a/src/link.rs b/src/link.rs
new file mode 100644
index 0000000..1dbca32
--- /dev/null
+++ b/src/link.rs
@@ -0,0 +1,123 @@
+// Given an encounter + service file, assign an encounter to each service with the given rules
+
+// Algorithm:
+// 1. Read all linking rules into memory
+// 2. Scan through the list of encounters and create indexes for match columns, based on what columns get linked
+// 3. For each service, check if there's an encounter that matches the linking rules
+//    Preferably do this without needing to continually scan through all encounters
+
+// Data spec:
+// Linking rules specify match columns, date columns, a source number column, and a target-source number column
+// Match columns can be anything, but must come in source column = target column pairs.
+// Date columns must again be a pair, and include a look back/forward range
+// The source number column is the column containing the source id (e.g. encounter number)
+// The target-source number column is the column in the target file that contains the source id (e.g. linked encounter number)
+
+use std::{
+    collections::HashMap,
+    io::{Read, Write},
+};
+
+use itertools::Itertools;
+
+pub struct MatchColumn {
+    source_column: String,
+    target_column: String,
+}
+
+pub struct DateMatchColumn {
+    source_column: String,
+    target_column: String,
+    search_back_days: i32,
+    search_forward_days: i32,
+}
+
+pub struct LinkingRule {
+    match_columns: Vec<MatchColumn>,
+    date_match_columns: Vec<DateMatchColumn>,
+}
+
+pub struct ProcessLinkingRule {
+    linking_rules: Vec<LinkingRule>,
+    source_number_column: String,
+    target_source_number_column: String,
+}
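+
+// To make the data spec above concrete, here is one rule wired up from these
+// structs. Illustrative sketch only: every column name is invented, and it
+// lives in-module because the struct fields are private.
+#[allow(dead_code)]
+fn example_rule() -> ProcessLinkingRule {
+    ProcessLinkingRule {
+        linking_rules: vec![LinkingRule {
+            // Same patient: the encounter's "mrn" must equal the service's "patient_mrn"
+            match_columns: vec![MatchColumn {
+                source_column: "mrn".to_string(),
+                target_column: "patient_mrn".to_string(),
+            }],
+            // Dates must line up within a 1-day-back / 30-day-forward window
+            date_match_columns: vec![DateMatchColumn {
+                source_column: "admission_date".to_string(),
+                target_column: "service_date".to_string(),
+                search_back_days: 1,
+                search_forward_days: 30,
+            }],
+        }],
+        source_number_column: "encounter_number".to_string(),
+        target_source_number_column: "linked_encounter_number".to_string(),
+    }
+}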
+
+// TODO: return thiserror or something rather than anyhow
+pub fn link(
+    // TODO: Make these readers/writers not coupled with csv reader/writer
+    source_reader: &mut csv::Reader<impl Read>,
+    target_reader: &mut csv::Reader<impl Read>,
+    linking_rule: ProcessLinkingRule,
+    linked_writer: &mut csv::Writer<impl Write>,
+) -> anyhow::Result<()> {
+    let mut source_columns: Vec<&String> = linking_rule
+        .linking_rules
+        .iter()
+        .flat_map(|rule| {
+            rule.match_columns
+                .iter()
+                .map(|match_column| &match_column.source_column)
+                .collect::<Vec<_>>()
+        })
+        // TODO: Check this filters out correctly, as it's filtering on a reference, not a value
+        .unique()
+        .collect();
+    let mut source_date_columns: Vec<&String> = linking_rule
+        .linking_rules
+        .iter()
+        .flat_map(|rule| {
+            rule.date_match_columns
+                .iter()
+                .map(|match_column| &match_column.source_column)
+                .collect::<Vec<_>>()
+        })
+        // TODO: Check this filters out correctly, as it's filtering on a reference, not a value
+        .unique()
+        .collect();
+    // Indexes of encounter ids for the given match column values (index in vec = index in source_columns),
+    // i.e. one map per match column, from match column value -> source ids with that value
+    // TODO: Can save more memory by storing values in the match columns in a vec of vecs
+    // Note: not as memory efficient as just continually scanning through the encounter file each time,
+    // but it's way faster and will scale better
+    let mut source_indexes: Vec<HashMap<String, Vec<usize>>> =
+        vec![HashMap::new(); source_columns.len()];
+    let mut source_ids: Vec<String> = Vec::new();
+    // TODO: Merge with source_indexes?
+    // Also store the actual date value rather than a string, so we
+    // don't need to convert as much later
+    let mut source_dates: Vec<Vec<String>>;
+    for source_record in source_reader.deserialize() {
+        let source_record: HashMap<String, String> = source_record?;
+        let current_idx = source_ids.len();
+        // Make indexes of the parts we need.
+        source_ids.push(
+            source_record
+                .get(&linking_rule.source_number_column)
+                .unwrap()
+                .clone(),
+        );
+        for (i, source_column) in source_columns.iter().enumerate() {
+            let source_column_value = source_record.get(*source_column);
+            if source_column_value.is_none() || source_column_value.unwrap().is_empty() {
+                continue;
+            }
+            let source_column_value = source_column_value.unwrap();
+            source_indexes[i]
+                .entry(source_column_value.clone())
+                .or_insert(Vec::new())
+                .push(current_idx);
+        }
+    }
+
+    for target_record in target_reader.deserialize() {
+        let target_record: HashMap<String, String> = target_record?;
+
+        // For each target record, get the source records that match each criterion in the match columns,
+        // then filter down by the date columns... how to do this quickly (without scanning again)? The easiest
+        // thing is to just store a list of lists of all the dates + source ids. Not perfectly efficient, but we
+        // can sort it to make it easier to find dates within the forward/back window
+    }
+
+    Ok(())
+}
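Finally, a hedged sketch of driving the new link function end to end with in-memory CSV data. The reader/writer types follow the signature in the patch; the file contents and the example_rule helper are the invented ones from the sketch inside src/link.rs, so this would live in that module, e.g. as a test. Since the target-matching pass is still a TODO, this only shows the plumbing:

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn links_services_to_encounters() -> anyhow::Result<()> {
        // Invented encounter (source) and service (target) files.
        let encounters = "encounter_number,mrn,admission_date\nE1,123,2023-01-02\n";
        let services = "service_id,patient_mrn,service_date\nS1,123,2023-01-03\n";
        let mut source_reader = csv::Reader::from_reader(encounters.as_bytes());
        let mut target_reader = csv::Reader::from_reader(services.as_bytes());
        // Collect the linked output in memory.
        let mut linked_writer = csv::Writer::from_writer(vec![]);
        link(
            &mut source_reader,
            &mut target_reader,
            example_rule(),
            &mut linked_writer,
        )?;
        // The linking pass is a work in progress, so just show the output so far.
        let bytes = linked_writer.into_inner().expect("flush linked output");
        println!("{}", String::from_utf8(bytes).expect("utf-8 output"));
        Ok(())
    }
}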