Merge branch 'linking' into 'main'

Start adding linking. See merge request vato007/coster-rs!4
2023-07-27 09:36:28 +00:00
parent bd7e3590c0 befce9f60a
commit 76096639b2
2 changed files with 149 additions and 68 deletions
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -13,6 +13,8 @@ pub use self::products::create_products;
 mod shared_models;
 pub use self::shared_models::*;
 pub mod link;
 #[no_mangle]
 pub extern "C" fn move_money_from_text(
    rules: *const c_char,
@@ -22,22 +24,10 @@ pub extern "C" fn move_money_from_text(
    use_numeric_accounts: bool,
 ) -> *mut c_char {
    let mut output_writer = csv::Writer::from_writer(vec![]);
-    let safe_rules = unsafe {
+    let safe_rules = unwrap_c_char(rules);
-        assert!(!rules.is_null());
+    let safe_lines = unwrap_c_char(lines);
-        CStr::from_ptr(rules)
+    let safe_accounts = unwrap_c_char(accounts);
-    };
+    let safe_cost_centres = unwrap_c_char(cost_centres);
    let safe_lines = unsafe {
        assert!(!lines.is_null());
        CStr::from_ptr(lines)
    };
    let safe_accounts = unsafe {
        assert!(!accounts.is_null());
        CStr::from_ptr(accounts)
    };
    let safe_cost_centres = unsafe {
        assert!(!cost_centres.is_null());
        CStr::from_ptr(cost_centres)
    };
    move_money(
        &mut csv::Reader::from_reader(safe_rules.to_bytes()),
        &mut csv::Reader::from_reader(safe_lines.to_bytes()),
@@ -81,30 +71,12 @@ pub extern "C" fn allocate_overheads_from_text(
    account_type: *const c_char,
    use_numeric_accounts: bool,
 ) -> *mut c_char {
-    let lines = unsafe {
+    let lines = unwrap_c_char(lines);
-        assert!(!lines.is_null());
+    let accounts = unwrap_c_char(accounts);
-        CStr::from_ptr(lines)
+    let allocation_statistics = unwrap_c_char(allocation_statistics);
-    };
+    let areas = unwrap_c_char(areas);
-    let accounts = unsafe {
+    let cost_centres = unwrap_c_char(cost_centres);
-        assert!(!accounts.is_null());
+    let account_type = unwrap_c_char(account_type);
        CStr::from_ptr(accounts)
    };
    let allocation_statistics = unsafe {
        assert!(!allocation_statistics.is_null());
        CStr::from_ptr(allocation_statistics)
    };
    let areas = unsafe {
        assert!(!areas.is_null());
        CStr::from_ptr(areas)
    };
    let cost_centres = unsafe {
        assert!(!cost_centres.is_null());
        CStr::from_ptr(cost_centres)
    };
    let account_type = unsafe {
        assert!(!account_type.is_null());
        CStr::from_ptr(account_type)
    };
    let mut output_writer = csv::Writer::from_writer(vec![]);
    reciprocal_allocation(
        &mut csv::Reader::from_reader(lines.to_bytes()),
@@ -142,34 +114,13 @@ pub extern "C" fn allocate_overheads_from_text_to_file(
    use_numeric_accounts: bool,
    show_from: bool,
 ) {
-    let lines = unsafe {
+    let lines = unwrap_c_char(lines);
-        assert!(!lines.is_null());
+    let accounts = unwrap_c_char(accounts);
-        CStr::from_ptr(lines)
+    let allocation_statistics = unwrap_c_char(allocation_statistics);
-    };
+    let areas = unwrap_c_char(areas);
-    let accounts = unsafe {
+    let cost_centres = unwrap_c_char(cost_centres);
-        assert!(!accounts.is_null());
+    let account_type = unwrap_c_char(account_type);
-        CStr::from_ptr(accounts)
+    let output_path = unwrap_c_char(output_path);
    };
    let allocation_statistics = unsafe {
        assert!(!allocation_statistics.is_null());
        CStr::from_ptr(allocation_statistics)
    };
    let areas = unsafe {
        assert!(!areas.is_null());
        CStr::from_ptr(areas)
    };
    let cost_centres = unsafe {
        assert!(!cost_centres.is_null());
        CStr::from_ptr(cost_centres)
    };
    let account_type = unsafe {
        assert!(!account_type.is_null());
        CStr::from_ptr(account_type)
    };
    let output_path = unsafe {
        assert!(!output_path.is_null());
        CStr::from_ptr(output_path)
    };
    reciprocal_allocation(
        &mut csv::Reader::from_reader(lines.to_bytes()),
        &mut csv::Reader::from_reader(accounts.to_bytes()),
@@ -187,6 +138,13 @@ pub extern "C" fn allocate_overheads_from_text_to_file(
    .expect("Failed to allocate overheads");
 }
 /// Borrows the NUL-terminated C string behind `s` as a [`CStr`].
 ///
 /// The lifetime `'a` of the returned reference is picked by the caller and is not
 /// tied to anything the compiler can check, so the pointed-to string must stay
 /// alive and unmodified for as long as the result is used.
 ///
 /// # Panics
 ///
 /// Panics if `s` is a null pointer.
 fn unwrap_c_char<'a>(s: *const c_char) -> &'a CStr {
    // Reject null up front; this check itself needs no `unsafe`.
    assert!(!s.is_null());
    // SAFETY: `s` is non-null (checked above). The caller is responsible for it
    // pointing at a valid NUL-terminated string that outlives `'a`.
    unsafe { CStr::from_ptr(s) }
 }
 #[no_mangle]
 pub extern "C" fn allocate_overheads_from_text_free(s: *mut c_char) {
    unsafe {
--- a/src/link.rs
+++ b/src/link.rs
@@ -0,0 +1,123 @@
 // Given encounter + service file, assign an encounter to each service with the given rules
 // Algorithm:
 // 1. Read all linking rules into memory
 // 2. Scan through list of encounters and create indexes for match columns, based on what columns get linked
 // 3. For each service, check if there's an encounter that matches the linking rules
 //      Preferably do this without needing to continually scan through all encounters
 // Data spec:
 // Linking rules specify match columns, date columns, source number column, target-source number column
 //      Match columns can be anything. Must come in a pair of source column=target column.
 //      Date columns must again be a pair, and include a look back/forward range
 //      Source number column is the column containing the source id (e.g. encounter number)
 //      Target-source number column is the column in the target file that contains the source id (e.g. linked encounter number)
 use std::{
    collections::HashMap,
    io::{Read, Write},
 };
 use itertools::Itertools;
 /// A pair of columns used for matching: one column in the source file and the
 /// corresponding column in the target file (source column = target column).
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct MatchColumn {
    /// Name of the column in the source file.
    source_column: String,
    /// Name of the paired column in the target file.
    target_column: String,
 }
 /// A pair of date columns (source/target) together with a look back/forward range.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct DateMatchColumn {
    /// Name of the date column in the source file.
    source_column: String,
    /// Name of the paired date column in the target file.
    target_column: String,
    /// Look-back range; presumably a number of days, as the name suggests.
    search_back_days: i32,
    /// Look-forward range; presumably a number of days, as the name suggests.
    search_forward_days: i32,
 }
 /// One linking rule: the match columns and date match columns it is made of.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct LinkingRule {
    /// Column pairs this rule matches on.
    match_columns: Vec<MatchColumn>,
    /// Date column pairs (with their search ranges) this rule matches on.
    date_match_columns: Vec<DateMatchColumn>,
 }
 /// Everything [`link`] needs to know about how to link a target file to a source file.
 ///
 /// NOTE(review): all fields are private and no constructor is visible here, so code
 /// outside this module may be unable to build one - confirm this is intended.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct ProcessLinkingRule {
    /// The linking rules to apply.
    linking_rules: Vec<LinkingRule>,
    /// Column in the source file containing the source id (e.g. encounter number).
    source_number_column: String,
    /// Column in the target file that contains the source id (e.g. linked encounter number).
    target_source_number_column: String,
 }
 // TODO: return thiserror or something rather than anyhow
 pub fn link(
    // TODO: Make these readers/writers not coupled with csv reader/writer
    source_reader: &mut csv::Reader<impl Read>,
    target_reader: &mut csv::Reader<impl Read>,
    linking_rule: ProcessLinkingRule,
    linked_writer: &mut csv::Writer<impl Write>,
 ) -> anyhow::Result<()> {
    let mut source_columns: Vec<&String> = linking_rule
        .linking_rules
        .iter()
        .flat_map(|rule| {
            rule.match_columns
                .iter()
                .map(|match_column| &match_column.source_column)
                .collect::<Vec<&String>>()
        })
        // TODO: Check this filters out correctly, as it's filtering on a reference, not a value
        .unique()
        .collect();
    let mut source_date_columns: Vec<&String> = linking_rule
        .linking_rules
        .iter()
        .flat_map(|rule| {
            rule.date_match_columns
                .iter()
                .map(|match_column| &match_column.source_column)
                .collect::<Vec<&String>>()
        })
        // TODO: Check this filters out correctly, as it's filtering on a reference, not a value
        .unique()
        .collect();
    // Indexes of encounter ids for the given match column values (index in vec = index in source_columns)
    // i.e List of Map of match column values -> source id, or source id with given values for each match column
    // TODO: Can save more memory by storing values in the match columns in a vec of vecs
    // Note: not as memory efficient as just continually scanning through encounter file each time,
    // but it's way faster and will scale better
    let mut source_indexes: Vec<HashMap<String, Vec<usize>>> =
        vec![HashMap::new(); source_columns.len()];
    let mut source_ids: Vec<String> = Vec::new();
    // TODO: Merge with source_indexes?
    //      Also store the actual date value rather than string, so we
    //      don't need to convert as much later
    let mut source_dates: Vec<HashMap<String, Vec<usize>>>;
    for source_record in source_reader.deserialize() {
        let source_record: HashMap<String, String> = source_record?;
        let current_idx = source_ids.len();
        // Make indexes of parts we need.
        source_ids.push(
            source_record
                .get(&linking_rule.source_number_column)
                .unwrap()
                .clone(),
        );
        for (i, source_column) in source_columns.iter().enumerate() {
            let source_column_value = source_record.get(*source_column);
            if source_column_value.is_none() || source_column_value.unwrap().is_empty() {
                continue;
            }
            let source_column_value = source_column_value.unwrap();
            source_indexes[i]
                .entry(source_column_value.clone())
                .or_insert(Vec::new())
                .push(current_idx);
        }
    }
    for target_record in target_reader.deserialize() {
        let target_record: HashMap<String, String> = target_record?;
        // For each target record, get the source records that match each criteria in the match columns,
        // then filter down to the date columns... how to do this quickly (without scanning again). Easiest thing
        // is to just store a list of a list of all the dates + source ids. Not perfectly efficient, but can
        // sort this to make it easier to find dates within forward/back lookup
    }
    Ok(())
 }