Refactor codebase into submodules

This commit is contained in:
Piv
2023-01-29 21:37:07 +10:30
parent ba279c8c9b
commit 10723efb57
4 changed files with 464 additions and 450 deletions

283
src/move_money.rs Normal file
View File

@@ -0,0 +1,283 @@
use std::collections::HashMap;
use itertools::Itertools;
use serde::{Deserialize, Serialize};
// TODO: Fix up these names, check if all is actually integrated into the strings
/// One row of the rules CSV, in the raw form `move_money` deserialises it.
///
/// Department and account selections are kept as strings here; `move_money`
/// passes each one through `extract_range` to expand it into a list.
#[derive(Debug, Deserialize)]
struct CsvMovementRule {
// The rename applies to `from_departments` only: it is read from the column
// named "FromCC". Every other field has no rename attribute.
#[serde(rename = "FromCC")]
// Need strings to further split later
from_departments: String,
// Destination department selection (single value or `start-end` range).
to_departments: String,
// When true, `extract_range` ignores `from_departments` and returns an empty list.
all_from_departments: bool,
// When true, `extract_range` ignores `to_departments` and returns an empty list.
all_to_departments: bool,
// Source account selection (single value or `start-end` range).
from_accounts: String,
// Destination account selection (single value or `start-end` range).
to_accounts: String,
// When true, `extract_range` ignores `from_accounts` and returns an empty list.
all_from_accounts: bool,
// When true, `extract_range` ignores `to_accounts` and returns an empty list.
all_to_accounts: bool,
// Amount to move; copied unchanged into `MovementRule::amount`.
amount: f64,
// Optional flag; `move_money` treats a missing value as `false`.
is_percent: Option<bool>,
// Optional flag; `move_money` treats a missing value as `false`.
is_separator: Option<bool>,
}
/// A single instruction describing how money moves between units, or a
/// separator marking the end of a pass.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct MovementRule {
    /// Departments money is taken from. Left empty when
    /// `all_from_departments` is set.
    pub from_departments: Vec<String>,
    /// Departments money is given to. Left empty when `all_to_departments`
    /// is set.
    pub to_departments: Vec<String>,
    /// Take money from every department, regardless of `from_departments`.
    pub all_from_departments: bool,
    /// Give money to every department, regardless of `to_departments`.
    pub all_to_departments: bool,
    /// Accounts money is taken from. Left empty when `all_from_accounts` is set.
    pub from_accounts: Vec<String>,
    /// Accounts money is given to. Left empty when `all_to_accounts` is set.
    pub to_accounts: Vec<String>,
    /// Take money from every account, regardless of `from_accounts`.
    pub all_from_accounts: bool,
    /// Give money to every account, regardless of `to_accounts`.
    pub all_to_accounts: bool,
    /// How much to move: a fixed amount, or a percentage when `is_percent` is set.
    pub amount: f64,
    /// Whether `amount` is relative to the value of each source unit.
    pub is_percent: bool,
    /// Whether this rule is a pass break rather than an actual movement.
    pub is_separator: bool,
}

impl MovementRule {
    /// Creates a separator rule; every other field keeps its default value.
    pub fn pass_break() -> MovementRule {
        MovementRule {
            is_separator: true,
            ..MovementRule::default()
        }
    }

    /// Returns `true` when the rule is well formed.
    ///
    /// A rule is rejected when:
    /// - both department lists are empty,
    /// - `amount` is NaN or infinite, or
    /// - it is a percentage rule whose `amount` lies outside `0..=100`.
    pub fn validate(&self) -> bool {
        if self.from_departments.is_empty() && self.to_departments.is_empty() {
            // Would be nice to have a decent message/error here as well
            return false;
        }
        // NaN compares false against every bound, so an explicit finiteness
        // check is needed to keep it from slipping through the range test.
        if !self.amount.is_finite() {
            return false;
        }
        !self.is_percent || (0.0..=100.0).contains(&self.amount)
    }
}
/// Identifies one cost line: an account within a department.
///
/// Used as the key of the totals map, hence `Hash` and `Eq`. `Debug` is
/// derived so that maps keyed by `Unit` can be printed and compared in
/// assertions.
#[derive(Debug, Hash, Clone, Default, PartialEq, Eq)]
pub struct Unit {
    /// Department the money belongs to.
    pub department: String,
    /// Account the money is recorded against.
    pub account: String,
}
/// One output row: the value held by an account/department pair after the
/// rules have been applied. `move_money` serialises one of these per unit.
#[derive(Debug, Serialize, Deserialize)]
pub struct CsvCost {
// Account of the unit this row reports on.
account: String,
// Department of the unit this row reports on.
department: String,
// Value of the unit as returned by `move_money_2`.
value: f64,
}
pub fn move_money<R, L, O>(
rules_reader: csv::Reader<R>,
lines_reader: csv::Reader<L>,
output: csv::Writer<O>,
use_numeric_accounts: bool,
) -> anyhow::Result<()>
where
R: std::io::Read,
L: std::io::Read,
O: std::io::Write,
{
let mut lines_reader = lines_reader;
let headers = lines_reader.headers()?;
let mut account_index = 0;
let mut department_index = 0;
for (index, field) in headers.iter().enumerate() {
if field.eq_ignore_ascii_case("account") {
account_index = index;
} else if field.eq_ignore_ascii_case("department") {
department_index = index;
}
}
let lines: HashMap<Unit, f64> = lines_reader
.records()
.map(|record| {
let record = record.unwrap();
let account = record.get(account_index).unwrap();
let department = record.get(department_index).unwrap();
let sum = record
.iter()
.enumerate()
.filter(|(i, _)| *i != account_index && *i != department_index)
.map(|(_, f)| f.parse::<f64>().unwrap())
.sum();
(
Unit {
account: account.into(),
department: department.into(),
},
sum,
)
})
.collect();
let all_accounts_sorted = if use_numeric_accounts {
lines
.keys()
.map(|key| key.account.clone().parse::<i32>().unwrap())
.sorted()
.map(|account| account.to_string())
.collect()
} else {
lines
.keys()
.map(|key| key.account.clone())
.sorted()
.collect()
};
let all_departments_sorted = lines
.keys()
.map(|key| key.department.clone())
.sorted()
.collect();
let mut rules_reader = rules_reader;
let mut rules: Vec<MovementRule> = vec![];
for result in rules_reader.deserialize() {
let movement_rule: CsvMovementRule = result?;
let from_accounts = extract_range(
movement_rule.from_accounts,
movement_rule.all_from_accounts,
&all_accounts_sorted,
);
let to_accounts = extract_range(
movement_rule.to_accounts,
movement_rule.all_to_accounts,
&all_accounts_sorted,
);
let from_departments = extract_range(
movement_rule.from_departments,
movement_rule.all_from_departments,
&all_departments_sorted,
);
let to_departments = extract_range(
movement_rule.to_departments,
movement_rule.all_to_departments,
&all_departments_sorted,
);
rules.push(MovementRule {
from_departments,
to_departments,
all_from_departments: movement_rule.all_from_departments,
all_to_departments: movement_rule.all_to_departments,
from_accounts,
to_accounts,
all_from_accounts: movement_rule.all_from_accounts,
all_to_accounts: movement_rule.all_to_accounts,
amount: movement_rule.amount,
is_percent: movement_rule.is_percent.unwrap_or(false),
is_separator: movement_rule.is_separator.unwrap_or(false),
})
}
// Then run move_money
let moved = move_money_2(lines, &rules);
let mut output = output;
// Ouput the list moved moneys
for money in moved {
output.serialize(CsvCost {
account: money.0.account,
department: money.0.department,
value: money.1,
})?;
}
Ok(())
}
/// Expands a selection string into the list of options it covers.
///
/// * `range` - a single value, or `start-end` naming two entries of `options`.
/// * `all` - when set, `range` is ignored and an empty list is returned.
/// * `options` - the ordered list of known values that ranges are resolved against.
///
/// Behaviour by input:
/// - No `-` in `range`: the string is returned as the only element, whether
///   or not it appears in `options`.
/// - Both endpoints found: every option between them, inclusive. The
///   endpoints may be given in either order.
/// - One endpoint found: just that option.
/// - Neither endpoint found: an empty list.
///
/// If `range` contains more than one `-`, only the first two parts are used.
fn extract_range(range: String, all: bool, options: &Vec<String>) -> Vec<String> {
    if all {
        return Vec::new();
    }
    if !range.contains('-') {
        return vec![range];
    }

    // `range` contains a separator, so both parts are present.
    let mut bounds = range.split('-');
    let first = bounds.next().unwrap_or("");
    let second = bounds.next().unwrap_or("");

    let start_index = options.iter().position(|option| option.as_str() == first);
    let end_index = options.iter().position(|option| option.as_str() == second);

    match (start_index, end_index) {
        (Some(start), Some(end)) => {
            // Normalise so a reversed range cannot produce an invalid slice.
            let (low, high) = if start <= end { (start, end) } else { (end, start) };
            options[low..=high].to_vec()
        }
        (Some(index), None) | (None, Some(index)) => vec![options[index].clone()],
        (None, None) => Vec::new(),
    }
}
// Approach 1:
// Use math (linear algebra) to move between departments. In memory and computation it's equivalent
// to the worst case of approach 2, however it can take advantage of auto parallelisation/SIMD
// to perform fast, particularly on larger datasets.
// This basically just involves smushing all the rules together, then doing a matrix multiply and
// matrix addition on the initial set. Can't record passes, but can record the smushed rules if only
// the data changes later.
// An advantage of this approach is that it can be easily extended to run on the GPU.
// Not implemented yet: the function body is empty.
pub fn move_money_1() {}
// Approach 2:
// Traditional/naive: the total for each unit is stored in an initial map (unit -> total amount).
// A second, temporary map accumulates the effect of each rule, and each rule is evaluated against
// the amounts in the current total map.
// Upon a pass break (separator), the temporary map is copied into the total map.
// Once done, the temporary map holds the final totals and is returned.
// The advantage of this is that the required code is tiny, and no third-party math library is required.
// Note that the movement happens on a line-by-line level, so the data could be streamed from disk.
// It's also much more memory efficient than approach 1.
// TODO: Time both approaches to see which is faster depending on the size of the input data/number of rules
/// Applies `rules` to `initial_totals` and returns the resulting totals.
///
/// Rules within a pass are all evaluated against the totals as they stood at
/// the start of that pass. A separator rule ends the pass, making the
/// accumulated result the basis for the rules that follow.
///
/// For each movement rule, the amount is taken from every unit matching the
/// `from` selection and the combined sum is split evenly between the units
/// matching the `to` selection. A rule that matches no destination unit is
/// skipped entirely, so that no money is lost.
pub fn move_money_2(
    initial_totals: HashMap<Unit, f64>,
    rules: &Vec<MovementRule>,
) -> HashMap<Unit, f64> {
    // Note: It's potentially a bit more intensive to use cloned totals (rather than just updating
    // temp_total per rule), but it's much simpler code, and it isn't really that much memory in practice.
    let mut running_total = initial_totals;
    let mut temp_total = running_total.clone();

    for rule in rules {
        if rule.is_separator {
            running_total.clone_from(&temp_total);
            continue;
        }

        let is_source = |unit: &Unit| {
            (rule.all_from_departments || rule.from_departments.contains(&unit.department))
                && (rule.all_from_accounts || rule.from_accounts.contains(&unit.account))
        };
        let is_destination = |unit: &Unit| {
            (rule.all_to_departments || rule.to_departments.contains(&unit.department))
                && (rule.all_to_accounts || rule.to_accounts.contains(&unit.account))
        };

        let destinations: Vec<&Unit> = running_total
            .keys()
            .filter(|&unit| is_destination(unit))
            .collect();
        if destinations.is_empty() {
            // Nowhere to put the money: skip the rule instead of dividing by zero.
            continue;
        }

        let mut sum_from = 0.0;
        for (unit, value) in &running_total {
            if !is_source(unit) {
                continue;
            }
            // NOTE(review): a percentage is applied as a plain multiplier here, while
            // `MovementRule::validate` accepts percentages up to 100 - confirm the intended scale.
            let moved = if rule.is_percent {
                value * rule.amount
            } else {
                rule.amount
            };
            sum_from += moved;
            if let Some(total) = temp_total.get_mut(unit) {
                *total -= moved;
            }
        }

        // Only the destination units receive a share.
        let value_per_unit = sum_from / destinations.len() as f64;
        for unit in destinations {
            if let Some(total) = temp_total.get_mut(unit) {
                *total += value_per_unit;
            }
        }
    }

    temp_total
}