Start adding linking
src/link.rs (new file, 123 lines)

@@ -0,0 +1,123 @@
// Given an encounter file and a service file, assign an encounter to each
// service according to the given linking rules.
//
// Algorithm:
// 1. Read all linking rules into memory.
// 2. Scan through the list of encounters and build an index for each match
//    column, based on which columns get linked.
// 3. For each service, check whether there is an encounter that matches the
//    linking rules. Preferably do this without repeatedly scanning through
//    all encounters.
//
// Data spec:
// Linking rules specify match columns, date columns, a source number column,
// and a target-source number column.
// - Match columns can be anything, and must come as a pair:
//   source column = target column.
// - Date columns must also come as a pair, and include a look-back /
//   look-forward range in days.
// - The source number column is the column containing the source id
//   (e.g. encounter number).
// - The target-source number column is the column in the target file that
//   contains the source id (e.g. linked encounter number).
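//
// Illustrative example (hypothetical column names, and one plausible reading
// of the spec above): with match column mrn=mrn, date column
// admit_date=service_date with back=1/forward=30, and source number column
// encounter_id, a service row
//     mrn=42, service_date=2024-03-05
// links to an encounter row
//     mrn=42, admit_date=2024-03-04, encounter_id=E1001
// because mrn matches and 2024-03-05 falls within
// [admit_date - 1 day, admit_date + 30 days].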
use std::{
    collections::HashMap,
    io::{Read, Write},
};

use anyhow::Context;
use itertools::Itertools;

/// A pair of columns whose values must be equal for a link.
pub struct MatchColumn {
    source_column: String,
    target_column: String,
}

/// A pair of date columns, matched within a look-back / look-forward window.
pub struct DateMatchColumn {
    source_column: String,
    target_column: String,
    search_back_days: i32,
    search_forward_days: i32,
}

/// One linking rule: a set of match columns and date match columns.
pub struct LinkingRule {
    match_columns: Vec<MatchColumn>,
    date_match_columns: Vec<DateMatchColumn>,
}

/// The full linking configuration for a source/target file pair.
pub struct ProcessLinkingRule {
    linking_rules: Vec<LinkingRule>,
    source_number_column: String,
    target_source_number_column: String,
}

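// A minimal construction sketch (illustrative only; the column names are
// hypothetical, echoing the example in the header comment):
#[allow(dead_code)]
fn example_linking_rule() -> ProcessLinkingRule {
    ProcessLinkingRule {
        linking_rules: vec![LinkingRule {
            match_columns: vec![MatchColumn {
                source_column: "mrn".to_string(),
                target_column: "mrn".to_string(),
            }],
            date_match_columns: vec![DateMatchColumn {
                source_column: "admit_date".to_string(),
                target_column: "service_date".to_string(),
                search_back_days: 1,
                search_forward_days: 30,
            }],
        }],
        source_number_column: "encounter_id".to_string(),
        target_source_number_column: "linked_encounter_id".to_string(),
    }
}
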
// TODO: return thiserror or something rather than anyhow
pub fn link(
    // TODO: Make these readers/writers not coupled with csv reader/writer
    source_reader: &mut csv::Reader<impl Read>,
    target_reader: &mut csv::Reader<impl Read>,
    linking_rule: ProcessLinkingRule,
    linked_writer: &mut csv::Writer<impl Write>,
) -> anyhow::Result<()> {
    let source_columns: Vec<&String> = linking_rule
        .linking_rules
        .iter()
        .flat_map(|rule| {
            rule.match_columns
                .iter()
                .map(|match_column| &match_column.source_column)
        })
        // Note: `.unique()` on `&String` dedupes by the pointed-to value,
        // since Hash/Eq for `&T` delegate to `T`, so filtering on references
        // works as intended here.
        .unique()
        .collect();
    let source_date_columns: Vec<&String> = linking_rule
        .linking_rules
        .iter()
        .flat_map(|rule| {
            rule.date_match_columns
                .iter()
                .map(|match_column| &match_column.source_column)
        })
        .unique()
        .collect();
    // Indexes of source record positions for the given match column values
    // (index in this vec = index in `source_columns`), i.e. a list of maps:
    // match column value -> row indexes of the source records with that value
    // in that match column.
    // TODO: Could save more memory by interning the match column values in a
    // vec of vecs.
    // Note: not as memory efficient as re-scanning the encounter file for
    // every service, but it's much faster and will scale better.
    let mut source_indexes: Vec<HashMap<String, Vec<usize>>> =
        vec![HashMap::new(); source_columns.len()];
    let mut source_ids: Vec<String> = Vec::new();
    // TODO: Merge with source_indexes? Also store parsed dates rather than
    // strings, so we don't need to convert as much later.
    let mut source_dates: Vec<HashMap<String, Vec<usize>>> =
        vec![HashMap::new(); source_date_columns.len()];
    for source_record in source_reader.deserialize() {
        let source_record: HashMap<String, String> = source_record?;
        let current_idx = source_ids.len();
        // Make indexes of the parts we need.
        source_ids.push(
            source_record
                .get(&linking_rule.source_number_column)
                .context("source record is missing the source number column")?
                .clone(),
        );
        for (i, source_column) in source_columns.iter().enumerate() {
            let Some(source_column_value) = source_record.get(*source_column) else {
                continue;
            };
            if source_column_value.is_empty() {
                continue;
            }
            source_indexes[i]
                .entry(source_column_value.clone())
                .or_default()
                .push(current_idx);
        }
    }

    for target_record in target_reader.deserialize() {
        let target_record: HashMap<String, String> = target_record?;

        // For each target record, get the source records that match every
        // criterion in the match columns, then filter down by the date
        // columns. To do this quickly (without re-scanning the source file),
        // the simplest approach is to also store a list of all dates + source
        // ids per date column; sorting it makes it cheap to find dates within
        // the look-back / look-forward window. (See `candidate_sources`
        // below for a sketch of the match column lookup.)
    }

    Ok(())
}
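
// Sketch of the match column lookup (an assumption about the eventual
// implementation, not part of it yet): candidates for one target record are
// found by intersecting the per-column posting lists built in `link` above.
// `columns` pairs each indexed source column with its index map, mirroring
// `source_columns` / `source_indexes`.
#[allow(dead_code)]
fn candidate_sources(
    target_record: &HashMap<String, String>,
    rule: &LinkingRule,
    columns: &[(&String, &HashMap<String, Vec<usize>>)],
) -> Vec<usize> {
    let mut candidates: Option<Vec<usize>> = None;
    for match_column in &rule.match_columns {
        // Look up the target's value in the index of the paired source column.
        let Some(value) = target_record.get(&match_column.target_column) else {
            return Vec::new();
        };
        let Some((_, index)) = columns
            .iter()
            .find(|(column, _)| **column == match_column.source_column)
        else {
            return Vec::new();
        };
        let matches = index.get(value).cloned().unwrap_or_default();
        // Intersect with the candidates so far; posting lists are in row
        // order, so a linear filter keeps the result sorted.
        candidates = Some(match candidates {
            Some(prev) => prev.into_iter().filter(|i| matches.contains(i)).collect(),
            None => matches,
        });
    }
    candidates.unwrap_or_default()
}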