// Given encounter + service file, assign an encounter to each service with the given rules // Algorithm: // 1. Read all linking rules into memory // 2. Scan through list of encounters and create indexes for match columns, based on what columns get linked // 3. For each service, check if there's an encounter that matches the linking rules // Preferably do this without needing to continually scan though all encounters // Data spec: // Linking rules specify match columns, date columns, source number column, target-source number column // Match columns can be anything. Must come in a pair of source column=target column. // Date columns must again be a pair, and include a look back/forward range // Source number column is the column containing the source id (e.g. encounter number) // Target-source number column is the column in the target file that contains the source id (e.g. linked encounter number) use std::{ collections::HashMap, io::{Read, Write}, }; use itertools::Itertools; pub struct MatchColumn { source_column: String, target_column: String, } pub struct DateMatchColumn { source_column: String, target_column: String, search_back_days: i32, search_forward_days: i32, } pub struct LinkingRule { match_columns: Vec, date_match_columns: Vec, } pub struct ProcessLinkingRule { linking_rules: Vec, source_number_column: String, target_source_number_column: String, } // TODO: return thiserror or something rather than anyhow pub fn link( // TODO: Make these readers/writers not coupled with csv reader/writer source_reader: &mut csv::Reader, target_reader: &mut csv::Reader, linking_rule: ProcessLinkingRule, linked_writer: &mut csv::Writer, ) -> anyhow::Result<()> { let mut source_columns: Vec<&String> = linking_rule .linking_rules .iter() .flat_map(|rule| { rule.match_columns .iter() .map(|match_column| &match_column.source_column) .collect::>() }) // TODO: Check this filters out correctly, as it's filtering on a reference, not a value .unique() .collect(); let mut source_date_columns: Vec<&String> = linking_rule .linking_rules .iter() .flat_map(|rule| { rule.date_match_columns .iter() .map(|match_column| &match_column.source_column) .collect::>() }) // TODO: Check this filters out correctly, as it's filtering on a reference, not a value .unique() .collect(); // Indexes of encounter ids for the given match column values (index in vec = index in source_columns) // i.e List of Map of match column values -> source id, or source id with given values for each match column // TODO: Can save more memory by storing values in the match columns in a vec of vecs // Note: not as memory efficient as just continually scanning through encounter file each time, // but it's way faster and will scale better let mut source_indexes: Vec>> = vec![HashMap::new(); source_columns.len()]; let mut source_ids: Vec = Vec::new(); // TODO: Merge with source_indexes? // Also store the actual date value rather than string, so we // don't need to convert as much later let mut source_dates: Vec>>; for source_record in source_reader.deserialize() { let source_record: HashMap = source_record?; let current_idx = source_ids.len(); // Make indexes of parts we need. source_ids.push( source_record .get(&linking_rule.source_number_column) .unwrap() .clone(), ); for (i, source_column) in source_columns.iter().enumerate() { let source_column_value = source_record.get(*source_column); if source_column_value.is_none() || source_column_value.unwrap().is_empty() { continue; } let source_column_value = source_column_value.unwrap(); source_indexes[i] .entry(source_column_value.clone()) .or_insert(Vec::new()) .push(current_idx); } } for target_record in target_reader.deserialize() { let target_record: HashMap = target_record?; // For each target record, get the source records that match each criteria in the match columns, // then filter down to the date columns... how to do this quickly (without scanning again). Easiest thing // is to just store a list of a list of all the dates + source ids. Not perfectly efficient, but can // sort this to make it easier to find dates within forward/back lookup } Ok(()) }