// ingey/src/link.rs
// Given encounter + service file, assign an encounter to each service with the given rules
// Algorithm:
// 1. Read all linking rules into memory
// 2. Scan through the list of encounters and build indexes on the match columns, based on which columns get linked
// 3. For each service, check if there's an encounter that matches the linking rules
//    Preferably do this without needing to repeatedly scan through all encounters
// Data spec:
// Linking rules specify match columns, date columns, source number column, target-source number column
// Match columns can be any column; each comes as a pair of source column = target column.
// Date columns likewise come in pairs, each with a look-back/look-forward range in days
// Source number column is the column containing the source id (e.g. encounter number)
// Target-source number column is the column in the target file that contains the source id (e.g. linked encounter number)
use anyhow::Context;
use itertools::Itertools;
use std::{
    collections::HashMap,
    io::{Read, Write},
};
pub struct MatchColumn {
source_column: String,
target_column: String,
}
pub struct DateMatchColumn {
source_column: String,
target_column: String,
search_back_days: i32,
search_forward_days: i32,
}
pub struct LinkingRule {
match_columns: Vec<MatchColumn>,
date_match_columns: Vec<DateMatchColumn>,
}
pub struct ProcessLinkingRule {
linking_rules: Vec<LinkingRule>,
source_number_column: String,
target_source_number_column: String,
}
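// A hypothetical rule illustrating the data spec above. The column names
// ("mrn", "admission_date", "service_date", "encounter_id", "linked_encounter_id")
// are made up for the example only.
#[allow(dead_code)]
fn example_rule() -> ProcessLinkingRule {
    ProcessLinkingRule {
        linking_rules: vec![LinkingRule {
            // Encounter and service must share the same patient identifier.
            match_columns: vec![MatchColumn {
                source_column: "mrn".to_string(),
                target_column: "mrn".to_string(),
            }],
            // Service date must fall within 1 day back / 3 days forward of admission.
            date_match_columns: vec![DateMatchColumn {
                source_column: "admission_date".to_string(),
                target_column: "service_date".to_string(),
                search_back_days: 1,
                search_forward_days: 3,
            }],
        }],
        source_number_column: "encounter_id".to_string(),
        target_source_number_column: "linked_encounter_id".to_string(),
    }
}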
// TODO: return thiserror or something rather than anyhow
pub fn link(
// TODO: Make these readers/writers not coupled with csv reader/writer
source_reader: &mut csv::Reader<impl Read>,
target_reader: &mut csv::Reader<impl Read>,
linking_rule: ProcessLinkingRule,
linked_writer: &mut csv::Writer<impl Write>,
) -> anyhow::Result<()> {
    let mut source_columns: Vec<&String> = linking_rule
        .linking_rules
        .iter()
        .flat_map(|rule| {
            rule.match_columns
                .iter()
                .map(|match_column| &match_column.source_column)
        })
        // `unique` dedupes by the referenced string's value, since Hash/Eq on
        // `&String` delegate to the pointee.
        .unique()
        .collect();
    let mut source_date_columns: Vec<&String> = linking_rule
        .linking_rules
        .iter()
        .flat_map(|rule| {
            rule.date_match_columns
                .iter()
                .map(|match_column| &match_column.source_column)
        })
        // As above, `unique` dedupes by string value.
        .unique()
        .collect();
// Indexes of encounter ids for the given match column values (index in vec = index in source_columns)
// i.e List of Map of match column values -> source id, or source id with given values for each match column
// TODO: Can save more memory by storing values in the match columns in a vec of vecs
// Note: not as memory efficient as just continually scanning through encounter file each time,
// but it's way faster and will scale better
let mut source_indexes: Vec<HashMap<String, Vec<usize>>> =
vec![HashMap::new(); source_columns.len()];
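    // Hypothetical example of the shape: if source_columns == ["mrn"], then
    // source_indexes[0] might hold {"12345" -> [0, 7]}, meaning source rows 0 and 7
    // both have mrn "12345"; those row indexes also index into `source_ids` below.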
let mut source_ids: Vec<String> = Vec::new();
    // TODO: Merge with source_indexes?
    // Also store the parsed date value rather than the string, so we
    // don't need to convert as much later
    let mut source_dates: Vec<HashMap<String, Vec<usize>>> =
        vec![HashMap::new(); source_date_columns.len()];
for source_record in source_reader.deserialize() {
let source_record: HashMap<String, String> = source_record?;
let current_idx = source_ids.len();
// Make indexes of parts we need.
        source_ids.push(
            source_record
                .get(&linking_rule.source_number_column)
                .with_context(|| {
                    format!(
                        "source file is missing column '{}'",
                        linking_rule.source_number_column
                    )
                })?
                .clone(),
        );
        for (i, source_column) in source_columns.iter().enumerate() {
            // Skip records with no value for this match column.
            let Some(source_column_value) = source_record.get(*source_column) else {
                continue;
            };
            if source_column_value.is_empty() {
                continue;
            }
            source_indexes[i]
                .entry(source_column_value.clone())
                .or_default()
                .push(current_idx);
        }
}
for target_record in target_reader.deserialize() {
let target_record: HashMap<String, String> = target_record?;
        // For each target record, look up the source records that satisfy every match
        // column, then narrow the result by the date columns. To avoid rescanning the
        // source file, the dates should be stored alongside the source ids (sorted, so
        // the back/forward window can be located quickly). A candidate-intersection
        // sketch follows below; date filtering and output are still TODO.
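        // Sketch (not part of the original file): intersect the candidate source rows
        // produced by each match-column index. This assumes every match column of
        // every rule must hold; date filtering and writing through `linked_writer`
        // are still TODO.
        let mut candidates: Option<Vec<usize>> = None;
        for rule in &linking_rule.linking_rules {
            for match_column in &rule.match_columns {
                let Some(value) = target_record.get(&match_column.target_column) else {
                    continue;
                };
                // Find the index built earlier for this source column.
                let Some(col_idx) = source_columns
                    .iter()
                    .position(|column| **column == match_column.source_column)
                else {
                    continue;
                };
                let matched = source_indexes[col_idx]
                    .get(value)
                    .cloned()
                    .unwrap_or_default();
                candidates = Some(match candidates {
                    None => matched,
                    Some(existing) => existing
                        .into_iter()
                        .filter(|idx| matched.contains(idx))
                        .collect(),
                });
            }
        }
        // Map the surviving row indexes back to source ids for later output.
        let _candidate_source_ids: Vec<&String> = candidates
            .unwrap_or_default()
            .iter()
            .map(|idx| &source_ids[*idx])
            .collect();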
}
Ok(())
}
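// Hypothetical usage sketch: the CSV data and column names below are made up,
// and no output is asserted because writing to `linked_writer` is still TODO.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn links_services_to_encounters() -> anyhow::Result<()> {
        let encounters = "encounter_id,mrn,admission_date\nE1,12345,2023-01-02\n";
        let services = "service_id,mrn,service_date,linked_encounter_id\nS1,12345,2023-01-03,\n";
        let mut linked_writer = csv::Writer::from_writer(Vec::new());
        link(
            &mut csv::Reader::from_reader(encounters.as_bytes()),
            &mut csv::Reader::from_reader(services.as_bytes()),
            example_rule(),
            &mut linked_writer,
        )?;
        Ok(())
    }
}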