// ingey/src/link.rs
// Given encounter + service file, assign an encounter to each service with the given rules
// Algorithm:
// 1. Read all linking rules into memory
// 2. Scan through the list of encounters and build indexes on the match columns, based on which columns get linked
// 3. For each service, check if there's an encounter that matches the linking rules
//    Preferably do this without needing to repeatedly scan through all encounters
// Data spec:
// Linking rules specify match columns, date columns, source number column, target-source number column
// Match columns can be any column; each comes as a pair of source column = target column.
// Date columns likewise come in pairs, each with a look-back/look-forward range in days
// Source number column is the column containing the source id (e.g. encounter number)
// Target-source number column is the column in the target file that contains the source id (e.g. linked encounter number)
use anyhow::Context;
use itertools::Itertools;
use std::{
    collections::HashMap,
    io::{Read, Write},
};
pub struct MatchColumn {
source_column: String,
target_column: String,
}
pub struct DateMatchColumn {
source_column: String,
target_column: String,
search_back_days: i32,
search_forward_days: i32,
}
pub struct LinkingRule {
match_columns: Vec<MatchColumn>,
date_match_columns: Vec<DateMatchColumn>,
}
pub struct ProcessLinkingRule {
linking_rules: Vec<LinkingRule>,
source_number_column: String,
target_source_number_column: String,
}
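// A hypothetical rule illustrating the data spec above. The column names
// ("mrn", "admission_date", "service_date", "encounter_id", "linked_encounter_id")
// are made up for the example only.
#[allow(dead_code)]
fn example_rule() -> ProcessLinkingRule {
    ProcessLinkingRule {
        linking_rules: vec![LinkingRule {
            // Encounter and service must share the same patient identifier.
            match_columns: vec![MatchColumn {
                source_column: "mrn".to_string(),
                target_column: "mrn".to_string(),
            }],
            // Service date must fall within 1 day back / 3 days forward of admission.
            date_match_columns: vec![DateMatchColumn {
                source_column: "admission_date".to_string(),
                target_column: "service_date".to_string(),
                search_back_days: 1,
                search_forward_days: 3,
            }],
        }],
        source_number_column: "encounter_id".to_string(),
        target_source_number_column: "linked_encounter_id".to_string(),
    }
}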
// TODO: return thiserror or something rather than anyhow
pub fn link(
// TODO: Make these readers/writers not coupled with csv reader/writer
source_reader: &mut csv::Reader<impl Read>,
target_reader: &mut csv::Reader<impl Read>,
linking_rule: ProcessLinkingRule,
linked_writer: &mut csv::Writer<impl Write>,
) -> anyhow::Result<()> {
    let mut source_columns: Vec<&String> = linking_rule
        .linking_rules
        .iter()
        .flat_map(|rule| {
            rule.match_columns
                .iter()
                .map(|match_column| &match_column.source_column)
        })
        // `unique` dedupes by the referenced string's value, since Hash/Eq on
        // `&String` delegate to the pointee.
        .unique()
        .collect();
    let mut source_date_columns: Vec<&String> = linking_rule
        .linking_rules
        .iter()
        .flat_map(|rule| {
            rule.date_match_columns
                .iter()
                .map(|match_column| &match_column.source_column)
        })
        // As above, `unique` dedupes by string value.
        .unique()
        .collect();
// Indexes of encounter ids for the given match column values (index in vec = index in source_columns)
// i.e List of Map of match column values -> source id, or source id with given values for each match column
// TODO: Can save more memory by storing values in the match columns in a vec of vecs
// Note: not as memory efficient as just continually scanning through encounter file each time,
// but it's way faster and will scale better
let mut source_indexes: Vec<HashMap<String, Vec<usize>>> =
vec![HashMap::new(); source_columns.len()];
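    // Hypothetical example of the shape: if source_columns == ["mrn"], then
    // source_indexes[0] might hold {"12345" -> [0, 7]}, meaning source rows 0 and 7
    // both have mrn "12345"; those row indexes also index into `source_ids` below.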
let mut source_ids: Vec<String> = Vec::new();
    // TODO: Merge with source_indexes?
    // Also store the parsed date value rather than the string, so we
    // don't need to convert as much later
    let mut source_dates: Vec<HashMap<String, Vec<usize>>> =
        vec![HashMap::new(); source_date_columns.len()];
for source_record in source_reader.deserialize() {
let source_record: HashMap<String, String> = source_record?;
let current_idx = source_ids.len();
// Make indexes of parts we need.
        source_ids.push(
            source_record
                .get(&linking_rule.source_number_column)
                .with_context(|| {
                    format!(
                        "source file is missing column '{}'",
                        linking_rule.source_number_column
                    )
                })?
                .clone(),
        );
        for (i, source_column) in source_columns.iter().enumerate() {
            // Skip records with no value for this match column.
            let Some(source_column_value) = source_record.get(*source_column) else {
                continue;
            };
            if source_column_value.is_empty() {
                continue;
            }
            source_indexes[i]
                .entry(source_column_value.clone())
                .or_default()
                .push(current_idx);
        }
}
for target_record in target_reader.deserialize() {
let target_record: HashMap<String, String> = target_record?;
        // For each target record, look up the source records that satisfy every match
        // column, then narrow the result by the date columns. To avoid rescanning the
        // source file, the dates should be stored alongside the source ids (sorted, so
        // the back/forward window can be located quickly). A candidate-intersection
        // sketch follows below; date filtering and output are still TODO.
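        // Sketch (not part of the original file): intersect the candidate source rows
        // produced by each match-column index. This assumes every match column of
        // every rule must hold; date filtering and writing through `linked_writer`
        // are still TODO.
        let mut candidates: Option<Vec<usize>> = None;
        for rule in &linking_rule.linking_rules {
            for match_column in &rule.match_columns {
                let Some(value) = target_record.get(&match_column.target_column) else {
                    continue;
                };
                // Find the index built earlier for this source column.
                let Some(col_idx) = source_columns
                    .iter()
                    .position(|column| **column == match_column.source_column)
                else {
                    continue;
                };
                let matched = source_indexes[col_idx]
                    .get(value)
                    .cloned()
                    .unwrap_or_default();
                candidates = Some(match candidates {
                    None => matched,
                    Some(existing) => existing
                        .into_iter()
                        .filter(|idx| matched.contains(idx))
                        .collect(),
                });
            }
        }
        // Map the surviving row indexes back to source ids for later output.
        let _candidate_source_ids: Vec<&String> = candidates
            .unwrap_or_default()
            .iter()
            .map(|idx| &source_ids[*idx])
            .collect();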
}
Ok(())
}
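// Hypothetical usage sketch: the CSV data and column names below are made up,
// and no output is asserted because writing to `linked_writer` is still TODO.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn links_services_to_encounters() -> anyhow::Result<()> {
        let encounters = "encounter_id,mrn,admission_date\nE1,12345,2023-01-02\n";
        let services = "service_id,mrn,service_date,linked_encounter_id\nS1,12345,2023-01-03,\n";
        let mut linked_writer = csv::Writer::from_writer(Vec::new());
        link(
            &mut csv::Reader::from_reader(encounters.as_bytes()),
            &mut csv::Reader::from_reader(services.as_bytes()),
            example_rule(),
            &mut linked_writer,
        )?;
        Ok(())
    }
}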