Refactor codebase into submodules

2023-01-29 21:37:07 +10:30
parent ba279c8c9b
commit 10723efb57
4 changed files with 464 additions and 450 deletions
--- a/src/move_money.rs
+++ b/src/move_money.rs
@@ -0,0 +1,283 @@
+use std::collections::HashMap;
+
+use itertools::Itertools;
+use serde::{Deserialize, Serialize};
+
+// TODO: Fix up these names, check if all is actually integrated into the strings
+/// One record of the rules CSV in its raw, deserialized form.
+///
+/// `move_money` deserializes every rules record into this struct and then converts it into a
+/// `MovementRule`, expanding the department/account strings with `extract_range`.
+#[derive(Debug, Deserialize)]
+struct CsvMovementRule {
+    /// Source department, or a `start-end` range of departments. Read from the `FromCC` column.
+    #[serde(rename = "FromCC")]
+    // Need strings to further split later
+    from_departments: String,
+    /// Destination department, or a `start-end` range of departments.
+    to_departments: String,
+    /// When true, `from_departments` is not expanded and every department is a source.
+    all_from_departments: bool,
+    /// When true, `to_departments` is not expanded and every department is a destination.
+    all_to_departments: bool,
+    /// Source account, or a `start-end` range of accounts.
+    from_accounts: String,
+    /// Destination account, or a `start-end` range of accounts.
+    to_accounts: String,
+    /// When true, `from_accounts` is not expanded and every account is a source.
+    all_from_accounts: bool,
+    /// When true, `to_accounts` is not expanded and every account is a destination.
+    all_to_accounts: bool,
+    /// Amount to move; interpreted according to `is_percent`.
+    amount: f64,
+    /// Optional flag; `move_money` treats a missing value as `false`.
+    is_percent: Option<bool>,
+    /// Optional flag marking a pass break; `move_money` treats a missing value as `false`.
+    is_separator: Option<bool>,
+}
+
+/// A single instruction for moving money between units, or a pass break between groups of rules.
+///
+/// For each of the four selectors (from/to x department/account) a unit matches when the
+/// corresponding `all_*` flag is set or when the vector contains the unit's value.
+#[derive(Debug, Clone, Default)]
+pub struct MovementRule {
+    // `move_money` leaves a vector empty when the matching `all_*` flag is set, so for rules built
+    // there an empty vector together with its flag means 'all'.
+    /// Departments money is taken from.
+    pub from_departments: Vec<String>,
+    /// Departments money is given to.
+    pub to_departments: Vec<String>,
+    /// Take from every department, regardless of `from_departments`.
+    pub all_from_departments: bool,
+    /// Give to every department, regardless of `to_departments`.
+    pub all_to_departments: bool,
+    /// Accounts money is taken from.
+    pub from_accounts: Vec<String>,
+    /// Accounts money is given to.
+    pub to_accounts: Vec<String>,
+    /// Take from every account, regardless of `from_accounts`.
+    pub all_from_accounts: bool,
+    /// Give to every account, regardless of `to_accounts`.
+    pub all_to_accounts: bool,
+    /// Amount to move; a fixed value, or a share of each source total when `is_percent` is set.
+    pub amount: f64,
+    /// Whether `amount` is relative to the source total rather than a fixed value.
+    pub is_percent: bool,
+    /// Whether this entry is a pass break instead of a movement.
+    pub is_separator: bool,
+}
+
+impl MovementRule {
+    /// Creates a pass break: a rule with `is_separator` set and every other field defaulted.
+    pub fn pass_break() -> MovementRule {
+        MovementRule {
+            is_separator: true,
+            ..MovementRule::default()
+        }
+    }
+
+    /// Returns `true` when the rule passes the basic sanity checks.
+    ///
+    /// A rule is rejected when:
+    /// * both `from_departments` and `to_departments` are empty,
+    /// * `amount` is NaN or infinite, or
+    /// * `is_percent` is set and `amount` lies outside `0..=100`.
+    ///
+    /// NOTE(review): a rule created by [`MovementRule::pass_break`] has empty department lists and
+    /// is therefore reported as invalid; confirm whether separators should be validated at all.
+    pub fn validate(&self) -> bool {
+        if self.from_departments.is_empty() && self.to_departments.is_empty() {
+            // Would be nice to have a decent message/error here as well
+            return false;
+        }
+        // Comparisons against NaN are always false, so NaN would slip through a plain range
+        // check; a non-finite amount would poison every total it touches.
+        if !self.amount.is_finite() {
+            return false;
+        }
+        !self.is_percent || (0.0..=100.0).contains(&self.amount)
+    }
+}
+
+/// A department/account pair; used as the key of the totals maps handled by `move_money_2`.
+#[derive(Hash, Clone, Default, PartialEq, Eq)]
+pub struct Unit {
+    /// Department the total belongs to.
+    pub department: String,
+    /// Account the total belongs to.
+    pub account: String,
+}
+
+/// One output record: the total of a single unit, as serialized by `move_money`.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct CsvCost {
+    /// Account of the unit.
+    account: String,
+    /// Department of the unit.
+    department: String,
+    /// Total for the unit after all rules have been applied.
+    value: f64,
+}
+
+/// Applies the movement rules to the cost lines and writes the resulting totals as CSV.
+///
+/// * `rules_reader` - CSV of movement rules; each record is deserialized as a `CsvMovementRule`.
+/// * `lines_reader` - CSV of cost lines. The `account` and `department` columns are located by
+///   header name (ASCII case-insensitive); every other column is parsed as `f64` and summed
+///   into the total for that line.
+/// * `output` - receives one `CsvCost` record per unit, ordered by account and then department.
+/// * `use_numeric_accounts` - order accounts numerically (as `i32`) instead of lexically when
+///   resolving `start-end` account ranges.
+///
+/// # Errors
+///
+/// Returns an error when either CSV cannot be read or deserialized, when a line lacks the
+/// account or department column, when a value column is not a number, when
+/// `use_numeric_accounts` is set and an account is not an integer, or when writing or flushing
+/// the output fails.
+pub fn move_money<R, L, O>(
+    rules_reader: csv::Reader<R>,
+    lines_reader: csv::Reader<L>,
+    output: csv::Writer<O>,
+    use_numeric_accounts: bool,
+) -> anyhow::Result<()>
+where
+    R: std::io::Read,
+    L: std::io::Read,
+    O: std::io::Write,
+{
+    let mut lines_reader = lines_reader;
+    // NOTE(review): when a header is missing its index silently stays at column 0.
+    let headers = lines_reader.headers()?;
+    let mut account_index = 0;
+    let mut department_index = 0;
+    for (index, field) in headers.iter().enumerate() {
+        if field.eq_ignore_ascii_case("account") {
+            account_index = index;
+        } else if field.eq_ignore_ascii_case("department") {
+            department_index = index;
+        }
+    }
+
+    // Collapse every line into (unit, total). Malformed input is reported to the caller instead
+    // of panicking. NOTE(review): a repeated (account, department) pair replaces the earlier
+    // total rather than adding to it.
+    let lines = lines_reader
+        .records()
+        .map(|record| -> anyhow::Result<(Unit, f64)> {
+            let record = record?;
+            let account = record.get(account_index).ok_or_else(|| {
+                anyhow::anyhow!("record is missing the account column: {:?}", record)
+            })?;
+            let department = record.get(department_index).ok_or_else(|| {
+                anyhow::anyhow!("record is missing the department column: {:?}", record)
+            })?;
+            let sum = record
+                .iter()
+                .enumerate()
+                .filter(|(i, _)| *i != account_index && *i != department_index)
+                .map(|(i, field)| {
+                    field.parse::<f64>().map_err(|error| {
+                        anyhow::anyhow!("column {} value {:?} is not a number: {}", i, field, error)
+                    })
+                })
+                .sum::<anyhow::Result<f64>>()?;
+            Ok((
+                Unit {
+                    account: account.into(),
+                    department: department.into(),
+                },
+                sum,
+            ))
+        })
+        .collect::<anyhow::Result<HashMap<Unit, f64>>>()?;
+
+    // Sorted lists of the known accounts/departments, used to expand `start-end` ranges.
+    let all_accounts_sorted: Vec<String> = if use_numeric_accounts {
+        // NOTE(review): accounts are re-rendered from the parsed integer, so an account written
+        // as "007" is listed here as "7".
+        let mut numeric_accounts = lines
+            .keys()
+            .map(|key| {
+                key.account.parse::<i32>().map_err(|error| {
+                    anyhow::anyhow!("account {:?} is not numeric: {}", key.account, error)
+                })
+            })
+            .collect::<anyhow::Result<Vec<i32>>>()?;
+        numeric_accounts.sort_unstable();
+        numeric_accounts
+            .into_iter()
+            .map(|account| account.to_string())
+            .collect()
+    } else {
+        lines
+            .keys()
+            .map(|key| key.account.clone())
+            .sorted()
+            .collect()
+    };
+    let all_departments_sorted: Vec<String> = lines
+        .keys()
+        .map(|key| key.department.clone())
+        .sorted()
+        .collect();
+
+    let mut rules_reader = rules_reader;
+    let mut rules: Vec<MovementRule> = vec![];
+    for result in rules_reader.deserialize() {
+        let movement_rule: CsvMovementRule = result?;
+        let from_accounts = extract_range(
+            movement_rule.from_accounts,
+            movement_rule.all_from_accounts,
+            &all_accounts_sorted,
+        );
+        let to_accounts = extract_range(
+            movement_rule.to_accounts,
+            movement_rule.all_to_accounts,
+            &all_accounts_sorted,
+        );
+        let from_departments = extract_range(
+            movement_rule.from_departments,
+            movement_rule.all_from_departments,
+            &all_departments_sorted,
+        );
+        let to_departments = extract_range(
+            movement_rule.to_departments,
+            movement_rule.all_to_departments,
+            &all_departments_sorted,
+        );
+        rules.push(MovementRule {
+            from_departments,
+            to_departments,
+            all_from_departments: movement_rule.all_from_departments,
+            all_to_departments: movement_rule.all_to_departments,
+            from_accounts,
+            to_accounts,
+            all_from_accounts: movement_rule.all_from_accounts,
+            all_to_accounts: movement_rule.all_to_accounts,
+            amount: movement_rule.amount,
+            is_percent: movement_rule.is_percent.unwrap_or(false),
+            is_separator: movement_rule.is_separator.unwrap_or(false),
+        });
+    }
+
+    // Then run move_money. The result is sorted so the output does not depend on hash order.
+    let mut moved: Vec<(Unit, f64)> = move_money_2(lines, &rules).into_iter().collect();
+    moved.sort_by(|left, right| {
+        (&left.0.account, &left.0.department).cmp(&(&right.0.account, &right.0.department))
+    });
+
+    // Output the list of moved moneys
+    let mut output = output;
+    for (unit, value) in moved {
+        output.serialize(CsvCost {
+            account: unit.account,
+            department: unit.department,
+            value,
+        })?;
+    }
+    // Dropping the writer would flush too, but would swallow any error.
+    output.flush()?;
+
+    Ok(())
+}
+
+/// Expands a rule selector into the list of values it covers.
+///
+/// * `range` - a single value, or two values joined by `-` (`start-end`). Only the first two
+///   `-`-separated parts are considered.
+/// * `all` - when true the selector is ignored and an empty list is returned.
+/// * `options` - the known values, in the order a range should be expanded in.
+///
+/// Returns:
+/// * an empty list when `all` is set or neither bound is found in `options`,
+/// * `range` itself when it contains no `-` (even if it is not present in `options`),
+/// * the inclusive slice of `options` between both bounds when both are found; the bounds may
+///   be given in either order,
+/// * the single bound that was found when only one of them is present in `options`.
+// `options` stays `&Vec<String>`: callers let this signature drive the type of their `collect()`.
+fn extract_range(range: String, all: bool, options: &Vec<String>) -> Vec<String> {
+    if all {
+        return vec![];
+    }
+    let index_of = |wanted: &str| options.iter().position(|option| option.as_str() == wanted);
+
+    let mut parts = range.split('-');
+    let bounds = match (parts.next(), parts.next()) {
+        (Some(first), Some(second)) => Some((index_of(first), index_of(second))),
+        _ => None,
+    };
+    let (start_index, end_index) = match bounds {
+        Some(indices) => indices,
+        // No separator present: the selector names exactly one value.
+        None => return vec![range],
+    };
+
+    match (start_index, end_index) {
+        (Some(start), Some(end)) => {
+            // A reversed range ("c-a") used to panic when slicing; normalise the bounds instead.
+            let (low, high) = if start <= end { (start, end) } else { (end, start) };
+            options[low..=high].to_vec()
+        }
+        (Some(only), None) | (None, Some(only)) => vec![options[only].clone()],
+        (None, None) => vec![],
+    }
+}
+
+// Approach 1:
+// Use math (linear algebra) to move between departments. In memory/compute terms it's equivalent
+// to the worst case of approach two (NOTE(review): the original note said "approach one";
+// presumably `move_money_2` was meant), however it can take advantage of auto-parallelisation/SIMD
+// to perform fast, particularly on larger datasets.
+// This basically just involves smushing all the rules together, then doing a matrix multiplication
+// and a matrix addition on the initial set. Can't record passes, but can record the smushed rules
+// if only the data changes later.
+// Advantage of this approach is it can be easily extended to run on the GPU.
+// Not implemented yet: the function body is empty.
+pub fn move_money_1() {}
+
+// Approach 2:
+// Traditional/naive: the total for each unit (department + account) is stored in a map of running
+// totals. A second, temporary map accumulates the effect of each rule, and each rule is processed
+// based on the amounts in the running totals.
+// Upon a pass break (separator), the temporary map is copied into the running totals.
+// Once done, the temporary map holds the final totals and is returned.
+// Advantage of this is the required code is tiny, and no third-party math library is required.
+// Note that the movement happens on a line-by-line level. So we can stream the data from disk, and
+// potentially apply this to every line (NOTE(review): the original sentence was cut off here).
+// It's also much more memory efficient than approach 1.
+// TODO: Time both approaches to see which is faster depending on the size of the input data/number of rules
+/// Applies `rules` in order to `initial_totals` and returns the resulting totals.
+///
+/// Within a pass (the rules between two separators) every rule reads the totals as they were at
+/// the start of the pass; its effect only becomes visible to later rules after a separator.
+///
+/// For each non-separator rule the amount taken from every matching source unit is summed and
+/// split evenly between the matching destination units. A rule that matches no destination unit
+/// is skipped, so no money is removed without somewhere to put it.
+pub fn move_money_2(
+    initial_totals: HashMap<Unit, f64>,
+    rules: &Vec<MovementRule>,
+) -> HashMap<Unit, f64> {
+    // Note: It's potentially a bit more intensive to use cloned totals (rather than just update temp_total per rule),
+    // but it's much simpler code, and since we're only working line-by-line, it isn't really that much memory in practice
+    let mut running_total = initial_totals;
+    let mut temp_total = running_total.clone();
+    for rule in rules {
+        if rule.is_separator {
+            running_total = temp_total.clone();
+            continue;
+        }
+
+        // Work out who receives the money first: with nobody to receive it, moving anything
+        // would both lose the money and divide by zero below.
+        let destinations: Vec<&Unit> = running_total
+            .keys()
+            .filter(|unit| {
+                (rule.all_to_departments || rule.to_departments.contains(&unit.department))
+                    && (rule.all_to_accounts || rule.to_accounts.contains(&unit.account))
+            })
+            .collect();
+        if destinations.is_empty() {
+            continue;
+        }
+
+        let mut sum_from = 0.;
+        for (unit, total) in &running_total {
+            if (rule.all_from_departments || rule.from_departments.contains(&unit.department))
+                && (rule.all_from_accounts || rule.from_accounts.contains(&unit.account))
+            {
+                // NOTE(review): percent rules multiply by `amount` directly, while
+                // `MovementRule::validate` bounds percentages to 0..=100; confirm which scale
+                // the rule files use.
+                let moved_amount = if rule.is_percent {
+                    total * rule.amount
+                } else {
+                    rule.amount
+                };
+                sum_from += moved_amount;
+                *temp_total
+                    .get_mut(unit)
+                    .expect("temp_total always has the same keys as running_total") -= moved_amount;
+            }
+        }
+
+        // Only the matching destination units receive a share.
+        let value_per_unit = sum_from / destinations.len() as f64;
+        for unit in destinations {
+            *temp_total
+                .get_mut(unit)
+                .expect("temp_total always has the same keys as running_total") += value_per_unit;
+        }
+    }
+    temp_total
+}