Files
ingey/src/lib.rs
2023-01-28 10:51:31 +10:30

270 lines
11 KiB
Rust

extern crate nalgebra as na;
use itertools::Itertools;
use na::{DMatrix, Dynamic, LU};
use std::{collections::HashMap, error::Error, ops::Mul};
// TODO: Look into serde for serialisation, can also use it to serialise/deserialise
// records from a csv file using the csv crate
/// One instruction for moving money between departments, or a pass separator.
///
/// Use [`MovementRule::pass_break`] to build a separator.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct MovementRule {
    /// Departments the money is taken from. If empty, it means "all".
    pub from_units: Vec<String>,
    /// Departments the money is moved to. If empty, it means "all".
    pub to_units: Vec<String>,
    /// How much to move: a percentage when `is_percent` is set, otherwise a
    /// fixed amount.
    pub amount: f64,
    /// Whether `amount` is a percentage rather than a fixed amount.
    pub is_percent: bool,
    /// Marks this rule as a pass break rather than a movement.
    pub is_separator: bool,
}

impl MovementRule {
    /// Creates a separator rule: no units, zero amount, `is_separator` set.
    pub fn pass_break() -> MovementRule {
        MovementRule {
            is_separator: true,
            ..MovementRule::default()
        }
    }

    /// Returns `true` when the rule passes the basic sanity checks.
    ///
    /// A rule is rejected when
    /// * both `from_units` and `to_units` are empty (note that this includes
    ///   the rule produced by [`MovementRule::pass_break`]), or
    /// * `is_percent` is set and `amount` is outside `0.0..=100.0` or is NaN.
    pub fn validate(&self) -> bool {
        if self.from_units.is_empty() && self.to_units.is_empty() {
            // Would be nice to have a decent message/error here as well
            return false;
        }
        // `contains` is false for NaN, whereas the pair of comparisons
        // `amount < 0.0 || amount > 100.0` would silently let NaN through.
        !self.is_percent || (0.0..=100.0).contains(&self.amount)
    }
}
// Rules get parsed from file, converted into matrix format (for the in-memory movement),
// then combined (smushed) into a single matrix + vector/rule for each pass. The rules can then
// be applied to the list of units.
//
// For now just ignore the "all" from/to behaviour (empty from/to lists): it is awkward to
// support, and can be worked around by explicitly listing every unit in the rules.
/// Placeholder for combining ("smushing") a list of rules.
///
/// The combination itself is not implemented yet. The function consumes
/// `rules`, gives every distinct `from_units` department an index in
/// first-seen order, and then returns an empty vector.
pub fn smush_rules(rules: Vec<MovementRule>) -> Vec<MovementRule> {
    // First build out the list/map of all departments (store index of each element in the array)
    // TODO: We could make this more advanced by only smushing per divider, so that only the departments
    // needed between each pass is actually required
    let mut department_indexes: HashMap<String, usize> = HashMap::new();
    for department in rules.into_iter().flat_map(|rule| rule.from_units) {
        // Read the length before asking for the entry: the entry keeps the map
        // mutably borrowed, so `entry(..).or_insert(map.len())` does not compile.
        let next_index = department_indexes.len();
        department_indexes.entry(department).or_insert(next_index);
    }
    // TODO: use `department_indexes` to build the combined rules.
    Vec::new()
}
// Approach 1:
// Use math (linear algebra) to move between departments. Memory/computationally it's equivalent
// to the worst case of approach 2, however it can take advantage of auto parallelisation/SIMD
// to perform fast, particularly on larger datasets.
// This basically just involves smushing all the rules, then doing a matrix multiplication and matrix addition
// on the initial set. Can't record passes, but can record the smushed rules if only the data changes later.
// Advantage of this approach is it can be easily extended to run on the GPU.
/// Placeholder for approach 1; it takes no input and currently does nothing.
pub fn move_money_1() {
    // Intentionally empty: nothing is implemented here yet.
}
// Approach 2:
// Traditional/naive, total for each department is stored in an initial map (department -> total amount).
// Another map is built up for each rule, and each rule is processed based on the amount in the current total
// map.
// Upon a pass break (separator), the temp map will assign the values into the total map.
// Once done, do a final assignment back to the total, and return that.
// Advantage of this is the required code is tiny, and no third-party math library is required.
// Note that the movement happens on a line-by-line level, so we can stream the data from disk and potentially apply this
// to every line as it is read. It's also much more memory efficient than approach 1.
// TODO: Time both approaches to see which is faster depending on the size of the input data/number of rules
// TODO: Right now this only supports movements between departments, we also need to support movements between accounts.
// This would require an expansion so that we also have from/to accounts, and the hashmap will use some struct
// that combines an account/department, which is also how the totals will be loaded. (so when loading from disk,
// we load the whole GL into memory sum the account/department totals, and move these into a map line by line)
/// Applies `rules` to `initial_totals` and returns the resulting total per department.
///
/// Rules are processed in passes delimited by separator rules
/// (`is_separator == true`). Within a pass, the amount taken from each source
/// department is computed from that department's balance at the start of the
/// pass, so rules in the same pass do not see each other's effects. The total
/// taken by a rule is split evenly between its `to_units`.
///
/// A movement rule without any `to_units` is skipped: there is nowhere to put
/// the money, and withdrawing it anyway would make it vanish from the totals.
/// (The "empty means all" behaviour is not implemented here; a rule with empty
/// `from_units` moves nothing.)
///
/// # Panics
///
/// Panics if a rule names a department that is not a key of `initial_totals`.
pub fn move_money_2(
    initial_totals: HashMap<String, f64>,
    rules: Vec<MovementRule>,
) -> HashMap<String, f64> {
    fn unknown_department(department: &str) -> ! {
        panic!(
            "department `{}` does not exist in the initial totals",
            department
        )
    }

    // TODO: Should probably validate that all the rules have departments that actually exist in initial_totals.
    // Note: It's potentially a bit more intensive to use cloned totals, but it's much simpler code, and since
    // we're only working line-by-line it isn't really that much memory in practice.
    //
    // `pass_start` holds the balances as they were when the current pass began,
    // `working` accumulates the movements made during the pass. Both maps
    // always have exactly the same keys.
    let mut pass_start = initial_totals;
    let mut working = pass_start.clone();

    for rule in rules {
        if rule.is_separator {
            // End of the pass: later rules see everything moved so far.
            pass_start.clone_from(&working);
            continue;
        }
        if rule.to_units.is_empty() {
            // Dividing by zero destinations would yield inf/NaN, and the
            // withdrawn money would silently disappear.
            continue;
        }

        let mut moved_total = 0.;
        for department in &rule.from_units {
            let balance = *pass_start
                .get(department)
                .unwrap_or_else(|| unknown_department(department));
            // NOTE(review): a percentage is applied as a plain multiplier here
            // (0.5 moves half), while `MovementRule::validate` accepts values up
            // to 100. Confirm which scale is intended.
            let moved = if rule.is_percent {
                balance * rule.amount
            } else {
                rule.amount
            };
            moved_total += moved;
            *working
                .get_mut(department)
                .unwrap_or_else(|| unknown_department(department)) -= moved;
        }

        let share = moved_total / rule.to_units.len() as f64;
        for department in &rule.to_units {
            *working
                .get_mut(department)
                .unwrap_or_else(|| unknown_department(department)) += share;
        }
    }
    working
}
/// Classifies the destination department of an `OverheadAllocationRule`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DepartmentType {
    /// NOTE(review): presumably a department that ultimately absorbs overhead
    /// costs (the "functional" departments); confirm.
    Operating,
    /// NOTE(review): presumably a servicing/indirect department whose costs
    /// are allocated onwards; confirm.
    Overhead,
}
// TODO: Could also look at BigDecimal rather than f64 for higher precision (even i64 might be fine if we don't need to divide...)
// Note: remember these are overhead departments only when calculating the lu decomposition or pseudoinverse, and for each department,
// you either need -1 or rest negative for a row to subtract the initial amounts so we end up effectively 0 (simultaneous equations end
// up with negative there so yes this is expected)
// Also, we could potentially use this same struct for non-overhead departments when mapping from overhead to operating departments
/// Describes how much of one overhead department is allocated to another department.
///
/// The fields are public so that callers outside this module can build the
/// rules that `reciprocal_allocation` takes as input.
#[derive(Debug, PartialEq)]
pub struct OverheadAllocationRule {
    /// The overhead department the allocation comes from.
    pub from_overhead_department: String,
    /// The department receiving the allocation.
    pub to_department: String,
    /// The share allocated. It is written into the allocation matrix as-is,
    /// so NOTE(review): presumably a fraction rather than 0-100; confirm.
    pub percent: f64,
    /// Whether `to_department` is an overhead or an operating department.
    pub to_department_type: DepartmentType,
}
/// A cost value attached to a single department.
///
/// The fields are public so that callers outside this module can build the
/// input of `reciprocal_allocation`.
#[derive(Debug, Clone, PartialEq)]
pub struct TotalDepartmentCost {
    /// Name of the department the value belongs to.
    pub department: String,
    /// The cost recorded for that department.
    pub value: f64,
}

/// The per-department costs of a single account.
#[derive(Debug, Clone, PartialEq)]
pub struct AccountCost {
    /// Identifier of the account.
    pub account: String,
    /// One entry per department that has a cost on this account.
    pub summed_department_costs: Vec<TotalDepartmentCost>,
}
// TODO: Also need a way to dictate the order of the departments?
/// Abstraction over the way the reciprocal allocation system is solved.
///
/// It is implemented in this file for an LU decomposition and for a plain
/// matrix (which is multiplied with the costs), so `do_solve_reciprocal` can
/// be written once for both.
pub trait ReciprocalAllocationSolver {
/// Returns the solution for the given `costs` matrix.
///
/// `do_solve_reciprocal` passes a single-column matrix with one row per
/// mapped department.
fn solve(&self, costs: &DMatrix<f64>) -> DMatrix<f64>;
}
/// Solves the system through an already computed LU decomposition.
impl ReciprocalAllocationSolver for LU<f64, Dynamic, Dynamic> {
/// Delegates to the decomposition's own `solve`.
///
/// # Panics
///
/// Panics when the underlying solve yields no result (per the nalgebra
/// documentation, when the decomposed matrix is not invertible).
fn solve(&self, costs: &DMatrix<f64>) -> DMatrix<f64> {
// Inherent methods take precedence over trait methods, so this calls
// `LU::solve` (whose result needs unwrapping) and does not recurse into
// this trait method.
self.solve(costs).unwrap()
}
}
/// Treats the matrix as a ready-made (pseudo-)inverse: solving is a single
/// matrix product with the costs.
impl ReciprocalAllocationSolver for DMatrix<f64> {
    /// Returns the product `self * costs`.
    fn solve(&self, costs: &DMatrix<f64>) -> DMatrix<f64> {
        // Both operands are references, so neither matrix is consumed.
        Mul::mul(self, costs)
    }
}
/// Assigns a dense index to every department that appears in an allocation
/// whose destination is of the given `department_type`.
///
/// Both ends of each matching allocation are indexed (source first, then
/// destination); indexes start at 0 and follow first-seen order, so the result
/// is deterministic for a given `allocations` order.
///
/// NOTE(review): because the source is always inserted too, the map built for
/// `DepartmentType::Operating` also contains the overhead departments that
/// allocate to operating departments. Confirm that this is intended.
fn get_rules_indexes(
    allocations: &[OverheadAllocationRule],
    department_type: DepartmentType,
) -> HashMap<String, usize> {
    allocations
        .iter()
        .filter(|allocation| allocation.to_department_type == department_type)
        .flat_map(|allocation| {
            [
                &allocation.from_overhead_department,
                &allocation.to_department,
            ]
        })
        // De-duplicate on the borrowed names so only the survivors are cloned.
        .unique()
        .enumerate()
        .map(|(index, department)| (department.clone(), index))
        .collect()
}
// Perform the reciprocal allocation (matrix) method to allocate servicing departments (indirect) costs
// to functional departments. Basically just a matrix solve, uses regression (moore-penrose pseudoinverse) when
// matrix is singular
/// Builds the overhead-to-overhead allocation matrix from `allocations` and
/// hands it, together with `account_costs`, to `do_solve_reciprocal`.
///
/// Only allocations whose destination is an overhead department contribute to
/// the matrix. When the determinant of the matrix is exactly zero the
/// Moore-Penrose pseudo-inverse is used as solver, otherwise an LU
/// decomposition.
///
/// # Errors
///
/// Returns an error if the pseudo-inverse cannot be computed, and propagates
/// any error returned by `do_solve_reciprocal`.
pub fn reciprocal_allocation(
    allocations: Vec<OverheadAllocationRule>,
    account_costs: Vec<AccountCost>,
    // TODO: Throw an appropriate error
) -> Result<Vec<AccountCost>, Box<dyn Error>> {
    let overhead_department_mappings: HashMap<String, usize> =
        get_rules_indexes(&allocations, DepartmentType::Overhead);
    let overhead_count = overhead_department_mappings.len();
    // TODO: the final operating-department step will also need
    // `get_rules_indexes(&allocations, DepartmentType::Operating)`.

    let mut slice_allocations = vec![0.; overhead_count * overhead_count];
    for allocation in allocations
        .iter()
        .filter(|allocation| allocation.to_department_type == DepartmentType::Overhead)
    {
        // Both ends of an overhead -> overhead allocation were indexed by
        // `get_rules_indexes` above, so these lookups cannot fail. The matrix
        // is overhead x overhead, hence both indexes must come from the
        // overhead map (the operating map uses a different index space).
        // TODO: Check if we need to flip this around
        let from_index = overhead_department_mappings[&allocation.from_overhead_department];
        let to_index = overhead_department_mappings[&allocation.to_department];
        slice_allocations[from_index + overhead_count * to_index] = allocation.percent;
    }
    // TODO: Also need ones along the diagonal, and negatives in some places...
    let mat: DMatrix<f64> =
        DMatrix::from_row_slice(overhead_count, overhead_count, &slice_allocations);

    // NOTE(review): exact comparison with zero; a nearly singular matrix still
    // takes the LU branch.
    if mat.determinant() == 0. {
        let pseudo_inverse = mat.svd(true, true).pseudo_inverse(0.000001)?;
        do_solve_reciprocal(
            pseudo_inverse,
            account_costs,
            overhead_department_mappings,
            allocations,
        )
    } else {
        do_solve_reciprocal(
            mat.lu(),
            account_costs,
            overhead_department_mappings,
            allocations,
        )
    }
}
fn do_solve_reciprocal<T: ReciprocalAllocationSolver>(
solver: T,
account_costs: Vec<AccountCost>,
department_mappings: HashMap<String, usize>,
allocations: Vec<OverheadAllocationRule>,
) -> Result<Vec<AccountCost>, Box<dyn Error>> {
// TODO: Could batch the accounts, although probably won't see to big a speed increase, compiler should help us out
for total_costs in account_costs {
let mut slice_costs = vec![0.; department_mappings.len()];
for cost in total_costs.summed_department_costs {
let elem = &mut slice_costs[*department_mappings.get(&cost.department).unwrap()];
*elem = cost.value;
}
let costs_vec: DMatrix<f64> =
DMatrix::from_row_slice(department_mappings.len(), 1, &slice_costs);
let calculated_overheads = solver.solve(&costs_vec);
// Calculation: operating_overhead_usage . calculated_overheads + initial_totals
// Where operating_overhead_usage is the direct mapping from overhead -> operating department, calculated overheads is the
// solved overheads usages after taking into account usage between departments, and initial_totals is the initial values
// for the operating departments.
}
// TODO: return something appropriate
Ok(vec![])
}