Start adding linking

Piv
2023-06-04 21:47:02 +09:30
parent bd7e3590c0
commit befce9f60a
2 changed files with 149 additions and 68 deletions

@@ -13,6 +13,8 @@ pub use self::products::create_products;
 mod shared_models;
 pub use self::shared_models::*;
 
+pub mod link;
+
 #[no_mangle]
 pub extern "C" fn move_money_from_text(
     rules: *const c_char,
@@ -22,22 +24,10 @@ pub extern "C" fn move_money_from_text(
     use_numeric_accounts: bool,
 ) -> *mut c_char {
     let mut output_writer = csv::Writer::from_writer(vec![]);
-    let safe_rules = unsafe {
-        assert!(!rules.is_null());
-        CStr::from_ptr(rules)
-    };
-    let safe_lines = unsafe {
-        assert!(!lines.is_null());
-        CStr::from_ptr(lines)
-    };
-    let safe_accounts = unsafe {
-        assert!(!accounts.is_null());
-        CStr::from_ptr(accounts)
-    };
-    let safe_cost_centres = unsafe {
-        assert!(!cost_centres.is_null());
-        CStr::from_ptr(cost_centres)
-    };
+    let safe_rules = unwrap_c_char(rules);
+    let safe_lines = unwrap_c_char(lines);
+    let safe_accounts = unwrap_c_char(accounts);
+    let safe_cost_centres = unwrap_c_char(cost_centres);
     move_money(
         &mut csv::Reader::from_reader(safe_rules.to_bytes()),
         &mut csv::Reader::from_reader(safe_lines.to_bytes()),
@@ -81,30 +71,12 @@ pub extern "C" fn allocate_overheads_from_text(
     account_type: *const c_char,
     use_numeric_accounts: bool,
 ) -> *mut c_char {
-    let lines = unsafe {
-        assert!(!lines.is_null());
-        CStr::from_ptr(lines)
-    };
-    let accounts = unsafe {
-        assert!(!accounts.is_null());
-        CStr::from_ptr(accounts)
-    };
-    let allocation_statistics = unsafe {
-        assert!(!allocation_statistics.is_null());
-        CStr::from_ptr(allocation_statistics)
-    };
-    let areas = unsafe {
-        assert!(!areas.is_null());
-        CStr::from_ptr(areas)
-    };
-    let cost_centres = unsafe {
-        assert!(!cost_centres.is_null());
-        CStr::from_ptr(cost_centres)
-    };
-    let account_type = unsafe {
-        assert!(!account_type.is_null());
-        CStr::from_ptr(account_type)
-    };
+    let lines = unwrap_c_char(lines);
+    let accounts = unwrap_c_char(accounts);
+    let allocation_statistics = unwrap_c_char(allocation_statistics);
+    let areas = unwrap_c_char(areas);
+    let cost_centres = unwrap_c_char(cost_centres);
+    let account_type = unwrap_c_char(account_type);
     let mut output_writer = csv::Writer::from_writer(vec![]);
     reciprocal_allocation(
         &mut csv::Reader::from_reader(lines.to_bytes()),
@@ -142,34 +114,13 @@ pub extern "C" fn allocate_overheads_from_text_to_file(
     use_numeric_accounts: bool,
     show_from: bool,
 ) {
-    let lines = unsafe {
-        assert!(!lines.is_null());
-        CStr::from_ptr(lines)
-    };
-    let accounts = unsafe {
-        assert!(!accounts.is_null());
-        CStr::from_ptr(accounts)
-    };
-    let allocation_statistics = unsafe {
-        assert!(!allocation_statistics.is_null());
-        CStr::from_ptr(allocation_statistics)
-    };
-    let areas = unsafe {
-        assert!(!areas.is_null());
-        CStr::from_ptr(areas)
-    };
-    let cost_centres = unsafe {
-        assert!(!cost_centres.is_null());
-        CStr::from_ptr(cost_centres)
-    };
-    let account_type = unsafe {
-        assert!(!account_type.is_null());
-        CStr::from_ptr(account_type)
-    };
-    let output_path = unsafe {
-        assert!(!output_path.is_null());
-        CStr::from_ptr(output_path)
-    };
+    let lines = unwrap_c_char(lines);
+    let accounts = unwrap_c_char(accounts);
+    let allocation_statistics = unwrap_c_char(allocation_statistics);
+    let areas = unwrap_c_char(areas);
+    let cost_centres = unwrap_c_char(cost_centres);
+    let account_type = unwrap_c_char(account_type);
+    let output_path = unwrap_c_char(output_path);
     reciprocal_allocation(
         &mut csv::Reader::from_reader(lines.to_bytes()),
         &mut csv::Reader::from_reader(accounts.to_bytes()),
@@ -187,6 +138,13 @@ pub extern "C" fn allocate_overheads_from_text_to_file(
         .expect("Failed to allocate overheads");
 }
 
+fn unwrap_c_char<'a>(s: *const c_char) -> &'a CStr {
+    unsafe {
+        assert!(!s.is_null());
+        CStr::from_ptr(s)
+    }
+}
+
 #[no_mangle]
 pub extern "C" fn allocate_overheads_from_text_free(s: *mut c_char) {
     unsafe {
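The hunks above factor the repeated unsafe null-check-and-borrow blocks into a single unwrap_c_char helper. As a minimal standalone sketch of that pattern (the string_length entry point below is hypothetical, for illustration only):

use std::ffi::CStr;
use std::os::raw::c_char;

// Borrow a NUL-terminated C string; panics on a null pointer.
// Note the returned lifetime is unconstrained, so the caller must keep
// the pointer alive for as long as the &CStr is used.
fn unwrap_c_char<'a>(s: *const c_char) -> &'a CStr {
    unsafe {
        assert!(!s.is_null());
        CStr::from_ptr(s)
    }
}

// Hypothetical FFI entry point using the helper, mirroring move_money_from_text.
#[no_mangle]
pub extern "C" fn string_length(s: *const c_char) -> usize {
    unwrap_c_char(s).to_bytes().len()
}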

src/link.rs Normal file

@@ -0,0 +1,123 @@
+// Given an encounter + service file, assign an encounter to each service with the given rules
+// Algorithm:
+// 1. Read all linking rules into memory
+// 2. Scan through the list of encounters and create indexes for match columns, based on what columns get linked
+// 3. For each service, check if there's an encounter that matches the linking rules
+//    Preferably do this without needing to continually scan through all encounters
+// Data spec:
+// Linking rules specify match columns, date columns, a source number column, and a target-source number column
+// Match columns can be anything, but must come in pairs of source column = target column
+// Date columns must again be a pair, and include a look-back/forward range
+// The source number column is the column containing the source id (e.g. encounter number)
+// The target-source number column is the column in the target file that contains the source id (e.g. linked encounter number)
+
+use std::{
+    collections::HashMap,
+    io::{Read, Write},
+};
+
+use itertools::Itertools;
+
+pub struct MatchColumn {
+    source_column: String,
+    target_column: String,
+}
+
+pub struct DateMatchColumn {
+    source_column: String,
+    target_column: String,
+    search_back_days: i32,
+    search_forward_days: i32,
+}
+
+pub struct LinkingRule {
+    match_columns: Vec<MatchColumn>,
+    date_match_columns: Vec<DateMatchColumn>,
+}
+
+pub struct ProcessLinkingRule {
+    linking_rules: Vec<LinkingRule>,
+    source_number_column: String,
+    target_source_number_column: String,
+}
+
+// TODO: return thiserror or something rather than anyhow
+pub fn link(
+    // TODO: Make these readers/writers not coupled with csv reader/writer
+    source_reader: &mut csv::Reader<impl Read>,
+    target_reader: &mut csv::Reader<impl Read>,
+    linking_rule: ProcessLinkingRule,
+    linked_writer: &mut csv::Writer<impl Write>,
+) -> anyhow::Result<()> {
+    let mut source_columns: Vec<&String> = linking_rule
+        .linking_rules
+        .iter()
+        .flat_map(|rule| {
+            rule.match_columns
+                .iter()
+                .map(|match_column| &match_column.source_column)
+                .collect::<Vec<&String>>()
+        })
+        // TODO: Check this filters out correctly, as it's filtering on a reference, not a value
+        .unique()
+        .collect();
+
+    let mut source_date_columns: Vec<&String> = linking_rule
+        .linking_rules
+        .iter()
+        .flat_map(|rule| {
+            rule.date_match_columns
+                .iter()
+                .map(|match_column| &match_column.source_column)
+                .collect::<Vec<&String>>()
+        })
+        // TODO: Check this filters out correctly, as it's filtering on a reference, not a value
+        .unique()
+        .collect();
+
+    // Indexes of encounter ids for the given match column values (index in vec = index in source_columns)
+    // i.e. a list of maps of match column value -> source ids that have that value for the match column
+    // TODO: Can save more memory by storing values in the match columns in a vec of vecs
+    // Note: not as memory efficient as just continually scanning through the encounter file each time,
+    // but it's way faster and will scale better
+    let mut source_indexes: Vec<HashMap<String, Vec<usize>>> =
+        vec![HashMap::new(); source_columns.len()];
+    let mut source_ids: Vec<String> = Vec::new();
+
+    // TODO: Merge with source_indexes?
+    // Also store the actual date value rather than a string, so we
+    // don't need to convert as much later
+    let mut source_dates: Vec<HashMap<String, Vec<usize>>>;
+
+    for source_record in source_reader.deserialize() {
+        let source_record: HashMap<String, String> = source_record?;
+        let current_idx = source_ids.len();
+        // Make indexes of the parts we need.
+        source_ids.push(
+            source_record
+                .get(&linking_rule.source_number_column)
+                .unwrap()
+                .clone(),
+        );
+        for (i, source_column) in source_columns.iter().enumerate() {
+            let source_column_value = source_record.get(*source_column);
+            if source_column_value.is_none() || source_column_value.unwrap().is_empty() {
+                continue;
+            }
+            let source_column_value = source_column_value.unwrap();
+            source_indexes[i]
+                .entry(source_column_value.clone())
+                .or_insert(Vec::new())
+                .push(current_idx);
+        }
+    }
+
+    for target_record in target_reader.deserialize() {
+        let target_record: HashMap<String, String> = target_record?;
+        // For each target record, get the source records that match each criterion in the match columns,
+        // then filter down to the date columns... how to do this quickly (without scanning again)? The easiest thing
+        // is to just store a list of a list of all the dates + source ids. Not perfectly efficient, but we can
+        // sort this to make it easier to find dates within the forward/back lookup
+    }
+    Ok(())
+}
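For orientation, here is a minimal sketch of how link might be driven. The CSV data, column names, and rule values are invented for illustration; since the struct fields above are private, a call like this would have to live inside src/link.rs, e.g. as a unit test:

#[cfg(test)]
mod tests {
    use super::*;

    // Hypothetical data: encounters are the source file, services the target.
    #[test]
    fn runs_link_over_tiny_inputs() -> anyhow::Result<()> {
        let encounters = "EncounterNumber,PatientId,AdmitDate\nE1,P1,2023-01-01\n";
        let services = "ServiceId,PatientId,ServiceDate\nS1,P1,2023-01-05\n";
        let rule = ProcessLinkingRule {
            linking_rules: vec![LinkingRule {
                match_columns: vec![MatchColumn {
                    source_column: "PatientId".to_string(),
                    target_column: "PatientId".to_string(),
                }],
                date_match_columns: vec![DateMatchColumn {
                    source_column: "AdmitDate".to_string(),
                    target_column: "ServiceDate".to_string(),
                    search_back_days: 0,
                    search_forward_days: 30,
                }],
            }],
            source_number_column: "EncounterNumber".to_string(),
            target_source_number_column: "LinkedEncounterNumber".to_string(),
        };
        let mut linked = csv::Writer::from_writer(vec![]);
        link(
            &mut csv::Reader::from_reader(encounters.as_bytes()),
            &mut csv::Reader::from_reader(services.as_bytes()),
            rule,
            &mut linked,
        )
    }
}

With the target-matching loop still a TODO, this only exercises the index-building pass over the source file.

The closing comment sketches the date strategy: keep all (date, source index) pairs in a list sorted by date, then binary-search the look-back/forward window instead of rescanning. A rough sketch of that lookup, assuming dates are already parsed to day numbers (not part of this commit):

// dates must be sorted by day number; returns the source indexes whose
// date falls within [target - back, target + forward].
fn ids_in_window(dates: &[(i64, usize)], target: i64, back: i64, forward: i64) -> Vec<usize> {
    let lo = dates.partition_point(|&(d, _)| d < target - back);
    let hi = dates.partition_point(|&(d, _)| d <= target + forward);
    dates[lo..hi].iter().map(|&(_, idx)| idx).collect()
}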