Start adding linking
src/link.rs (new file, 123 lines)

@@ -0,0 +1,123 @@
// Given an encounter file and a service file, assign an encounter to each
// service according to the given linking rules.
//
// Algorithm:
// 1. Read all linking rules into memory.
// 2. Scan through the list of encounters and build an index for each match
//    column, based on which columns get linked.
// 3. For each service, check whether there is an encounter that matches the
//    linking rules. Preferably do this without repeatedly scanning through
//    all encounters.
//
// Data spec:
// Linking rules specify match columns, date columns, a source number column,
// and a target-source number column.
// - Match columns can be anything, and must come as a pair:
//   source column = target column.
// - Date columns must also come as a pair, and include a look-back /
//   look-forward range in days.
// - The source number column is the column containing the source id
//   (e.g. encounter number).
// - The target-source number column is the column in the target file that
//   contains the source id (e.g. linked encounter number).
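//
// Illustrative example (hypothetical column names, and one plausible reading
// of the spec above): with match column mrn=mrn, date column
// admit_date=service_date with back=1/forward=30, and source number column
// encounter_id, a service row
//     mrn=42, service_date=2024-03-05
// links to an encounter row
//     mrn=42, admit_date=2024-03-04, encounter_id=E1001
// because mrn matches and 2024-03-05 falls within
// [admit_date - 1 day, admit_date + 30 days].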
use std::{
    collections::HashMap,
    io::{Read, Write},
};

use anyhow::Context;
use itertools::Itertools;

/// A pair of columns whose values must be equal for a link.
pub struct MatchColumn {
    source_column: String,
    target_column: String,
}

/// A pair of date columns, matched within a look-back / look-forward window.
pub struct DateMatchColumn {
    source_column: String,
    target_column: String,
    search_back_days: i32,
    search_forward_days: i32,
}

/// One linking rule: a set of match columns and date match columns.
pub struct LinkingRule {
    match_columns: Vec<MatchColumn>,
    date_match_columns: Vec<DateMatchColumn>,
}

/// The full linking configuration for a source/target file pair.
pub struct ProcessLinkingRule {
    linking_rules: Vec<LinkingRule>,
    source_number_column: String,
    target_source_number_column: String,
}

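// A minimal construction sketch (illustrative only; the column names are
// hypothetical, echoing the example in the header comment):
#[allow(dead_code)]
fn example_linking_rule() -> ProcessLinkingRule {
    ProcessLinkingRule {
        linking_rules: vec![LinkingRule {
            match_columns: vec![MatchColumn {
                source_column: "mrn".to_string(),
                target_column: "mrn".to_string(),
            }],
            date_match_columns: vec![DateMatchColumn {
                source_column: "admit_date".to_string(),
                target_column: "service_date".to_string(),
                search_back_days: 1,
                search_forward_days: 30,
            }],
        }],
        source_number_column: "encounter_id".to_string(),
        target_source_number_column: "linked_encounter_id".to_string(),
    }
}
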
// TODO: return thiserror or something rather than anyhow
pub fn link(
    // TODO: Make these readers/writers not coupled with csv reader/writer
    source_reader: &mut csv::Reader<impl Read>,
    target_reader: &mut csv::Reader<impl Read>,
    linking_rule: ProcessLinkingRule,
    linked_writer: &mut csv::Writer<impl Write>,
) -> anyhow::Result<()> {
    let source_columns: Vec<&String> = linking_rule
        .linking_rules
        .iter()
        .flat_map(|rule| {
            rule.match_columns
                .iter()
                .map(|match_column| &match_column.source_column)
        })
        // Note: `.unique()` on `&String` dedupes by the pointed-to value,
        // since Hash/Eq for `&T` delegate to `T`, so filtering on references
        // works as intended here.
        .unique()
        .collect();
    let source_date_columns: Vec<&String> = linking_rule
        .linking_rules
        .iter()
        .flat_map(|rule| {
            rule.date_match_columns
                .iter()
                .map(|match_column| &match_column.source_column)
        })
        .unique()
        .collect();
    // Indexes of source record positions for the given match column values
    // (index in this vec = index in `source_columns`), i.e. a list of maps:
    // match column value -> row indexes of the source records with that value
    // in that match column.
    // TODO: Could save more memory by interning the match column values in a
    // vec of vecs.
    // Note: not as memory efficient as re-scanning the encounter file for
    // every service, but it's much faster and will scale better.
    let mut source_indexes: Vec<HashMap<String, Vec<usize>>> =
        vec![HashMap::new(); source_columns.len()];
    let mut source_ids: Vec<String> = Vec::new();
    // TODO: Merge with source_indexes? Also store parsed dates rather than
    // strings, so we don't need to convert as much later.
    let mut source_dates: Vec<HashMap<String, Vec<usize>>> =
        vec![HashMap::new(); source_date_columns.len()];
    for source_record in source_reader.deserialize() {
        let source_record: HashMap<String, String> = source_record?;
        let current_idx = source_ids.len();
        // Make indexes of the parts we need.
        source_ids.push(
            source_record
                .get(&linking_rule.source_number_column)
                .context("source record is missing the source number column")?
                .clone(),
        );
        for (i, source_column) in source_columns.iter().enumerate() {
            let Some(source_column_value) = source_record.get(*source_column) else {
                continue;
            };
            if source_column_value.is_empty() {
                continue;
            }
            source_indexes[i]
                .entry(source_column_value.clone())
                .or_default()
                .push(current_idx);
        }
    }

    for target_record in target_reader.deserialize() {
        let target_record: HashMap<String, String> = target_record?;

        // For each target record, get the source records that match every
        // criterion in the match columns, then filter down by the date
        // columns. To do this quickly (without re-scanning the source file),
        // the simplest approach is to also store a list of all dates + source
        // ids per date column; sorting it makes it cheap to find dates within
        // the look-back / look-forward window. (See `candidate_sources`
        // below for a sketch of the match column lookup.)
    }

    Ok(())
}
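
// Sketch of the match column lookup (an assumption about the eventual
// implementation, not part of it yet): candidates for one target record are
// found by intersecting the per-column posting lists built in `link` above.
// `columns` pairs each indexed source column with its index map, mirroring
// `source_columns` / `source_indexes`.
#[allow(dead_code)]
fn candidate_sources(
    target_record: &HashMap<String, String>,
    rule: &LinkingRule,
    columns: &[(&String, &HashMap<String, Vec<usize>>)],
) -> Vec<usize> {
    let mut candidates: Option<Vec<usize>> = None;
    for match_column in &rule.match_columns {
        // Look up the target's value in the index of the paired source column.
        let Some(value) = target_record.get(&match_column.target_column) else {
            return Vec::new();
        };
        let Some((_, index)) = columns
            .iter()
            .find(|(column, _)| **column == match_column.source_column)
        else {
            return Vec::new();
        };
        let matches = index.get(value).cloned().unwrap_or_default();
        // Intersect with the candidates so far; posting lists are in row
        // order, so a linear filter keeps the result sorted.
        candidates = Some(match candidates {
            Some(prev) => prev.into_iter().filter(|i| matches.contains(i)).collect(),
            None => matches,
        });
    }
    candidates.unwrap_or_default()
}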