Refactor filters for derive node, stub derive node implementation, fix first line check in filter node

This commit is contained in:
2024-08-01 19:12:27 +09:30
parent 0f0d40c2a1
commit fbf74d5299
2 changed files with 195 additions and 103 deletions

View File

@@ -1,5 +1,12 @@
use std::{collections::BTreeMap, str::FromStr};
use serde::{Deserialize, Serialize};
use crate::{
io::{RecordDeserializer, RecordSerializer},
node::RunnableNode,
};
#[derive(Serialize, Deserialize, Clone)]
pub enum DeriveColumnType {
Column(String),
@@ -63,13 +70,178 @@ pub struct DeriveFilter {
pub value_type: ValueType,
}
/// A comparison against one reference value, or a list of them, of type `T`.
pub enum Comparator<T: PartialOrd> {
    /// Satisfied when the candidate equals the reference.
    Equal(T),
    /// Satisfied when the candidate differs from the reference.
    NotEqual(T),
    /// Satisfied when the candidate is strictly greater than the reference.
    GreaterThan(T),
    /// Satisfied when the candidate is strictly less than the reference.
    LessThan(T),
    /// Satisfied when the candidate equals any listed value.
    In(Vec<T>),
    /// Satisfied when the candidate equals none of the listed values.
    NotIn(Vec<T>),
}

impl<T: PartialOrd> Comparator<T> {
    /// Returns `true` when `value` satisfies this comparison.
    pub fn is_valid(&self, value: T) -> bool {
        let candidate = &value;
        match self {
            Self::Equal(expected) => candidate == expected,
            Self::NotEqual(rejected) => candidate != rejected,
            Self::GreaterThan(floor) => candidate > floor,
            Self::LessThan(ceiling) => candidate < ceiling,
            Self::In(allowed) => allowed.iter().any(|member| member == candidate),
            Self::NotIn(denied) => !denied.iter().any(|member| member == candidate),
        }
    }
}
/// Implemented by validators that are bound to a single named record field.
pub trait FieldName {
/// Returns the name of the field this validator should work on.
fn get_field_name(&self) -> String;
}
/// A list of boxed [`DataValidator`] trait objects.
pub type DataValidators = Vec<Box<dyn DataValidator>>;
/// A check applied to the raw string value of the field named by [`FieldName`].
pub trait DataValidator: FieldName {
/// Returns whether the given value is valid for the validator.
fn is_valid(&self, s: &str) -> bool;
}
/// A typed filter: the column it reads and the comparison applied to that column's value.
pub struct FilterRule<T: PartialOrd> {
    /// Name of the column whose value is checked.
    pub column_name: String,
    /// Comparison the parsed column value must satisfy.
    pub comparator: Comparator<T>,
}

impl<T: PartialOrd> FieldName for FilterRule<T> {
    /// Returns an owned copy of the column name this rule applies to.
    fn get_field_name(&self) -> String {
        self.column_name.to_owned()
    }
}

impl<T: FromStr + PartialOrd> DataValidator for FilterRule<T> {
    /// Parses `s` as `T` and runs the comparator on the result.
    ///
    /// Input that fails to parse is reported as invalid rather than as an error.
    fn is_valid(&self, s: &str) -> bool {
        match s.parse::<T>() {
            Ok(parsed) => self.comparator.is_valid(parsed),
            Err(_) => false,
        }
    }
}
pub fn to_filter_rules(filters: &Vec<DeriveFilter>) -> anyhow::Result<Vec<Box<dyn DataValidator>>> {
filters
.iter()
// For some reason inlining to_filter_rules causes a compiler error, so leaving
// in a separate function (it is cleaner at least)
.map(|filter| to_filter_rule(filter))
.collect()
}
fn to_filter_rule(filter: &DeriveFilter) -> anyhow::Result<Box<dyn DataValidator>> {
let value = filter.match_value.clone();
match filter.value_type {
crate::derive::ValueType::String => Ok(Box::new(get_filter_rule(filter, value))),
crate::derive::ValueType::Integer => {
Ok(Box::new(get_filter_rule(filter, value.parse::<i64>()?)))
}
crate::derive::ValueType::Float => {
Ok(Box::new(get_filter_rule(filter, value.parse::<f64>()?)))
}
crate::derive::ValueType::Boolean => {
Ok(Box::new(get_filter_rule(filter, value.parse::<bool>()?)))
}
}
}
/// Builds a typed [`FilterRule`] for `filter`'s column, wrapping `value` in the comparator
/// variant that corresponds to the filter's configured comparison type.
fn get_filter_rule<T: PartialOrd>(filter: &DeriveFilter, value: T) -> FilterRule<T> {
    let comparator = match filter.comparator {
        MatchComparisonType::Equal => Comparator::Equal(value),
        MatchComparisonType::NotEqual => Comparator::NotEqual(value),
        MatchComparisonType::GreaterThan => Comparator::GreaterThan(value),
        MatchComparisonType::LessThan => Comparator::LessThan(value),
    };
    let column_name = filter.column_name.clone();
    FilterRule {
        column_name,
        comparator,
    }
}
/// Pairs a column name with a [`DeriveOperation`].
// NOTE(review): presumably `column_name` is the column the operation's result is written to;
// confirm once `derive_line` actually applies operations.
#[derive(Serialize, Deserialize, Clone)]
pub struct DeriveColumnOperation {
pub column_name: String,
pub operation: DeriveOperation,
}
#[derive(Serialize, Deserialize, Clone)]
pub struct DeriveRule {
pub operations: Vec<DeriveOperation>,
pub operations: Vec<DeriveColumnOperation>,
pub filters: Vec<DeriveFilter>,
}
/// Serializable configuration of a derive node: its rules plus the paths of the CSV file
/// it reads from and the CSV file it writes to.
#[derive(Serialize, Deserialize, Clone)]
pub struct DeriveNode {
pub rules: Vec<DeriveRule>,
pub input_file_path: String,
pub output_file_path: String,
}
/// Runtime form of a [`DeriveRule`]: the same operations, with the filter definitions
/// already converted into boxed [`DataValidator`]s.
pub struct RunnableDeriveRule {
pub operations: Vec<DeriveColumnOperation>,
pub filters: Vec<Box<dyn DataValidator>>,
}
impl DeriveRule {
    /// Produces the runtime form of this rule.
    ///
    /// The filters are converted into validators first; the operations are cloned only
    /// once that conversion has succeeded. Any conversion error is returned unchanged.
    fn to_runnable_rule(&self) -> anyhow::Result<RunnableDeriveRule> {
        to_filter_rules(&self.filters).map(|validators| RunnableDeriveRule {
            operations: self.operations.clone(),
            filters: validators,
        })
    }
}
/// Streams every record from `input` through [`derive_line`] into `output`.
///
/// The header is written from the first record, so an empty input produces no output
/// at all. Any deserialization or serialization error aborts the run.
fn derive(
    rules: &Vec<RunnableDeriveRule>,
    input: &mut impl RecordDeserializer,
    output: &mut impl RecordSerializer,
) -> anyhow::Result<()> {
    // The first record is handled separately because it also supplies the header.
    let first: BTreeMap<String, String> = match input.deserialize()? {
        Some(record) => record,
        None => return Ok(()),
    };
    output.write_header(&first)?;
    derive_line(first, rules, output)?;

    loop {
        let next: Option<BTreeMap<String, String>> = input.deserialize()?;
        match next {
            Some(record) => derive_line(record, rules, output)?,
            None => break,
        }
    }
    Ok(())
}
/// Applies the derive rules to a single record and writes the record to `output`.
///
/// Stub: filters and operations are not evaluated yet, so the record is currently written
/// through unchanged.
///
/// # Errors
///
/// Returns whatever error `output.serialize` reports.
fn derive_line(
    line: BTreeMap<String, String>,
    rules: &[RunnableDeriveRule],
    output: &mut impl RecordSerializer,
) -> anyhow::Result<()> {
    for rule in rules {
        // TODO: check that every filter passes before applying the rule. If there are no
        // filters, the rule applies to all rows.
        for _filter in &rule.filters {}
        // TODO: Split operations should be processed separately, after all the other
        // operations have been applied.
        // TODO: apply each operation individually, adding the result as a column to the
        // record map.
        for _operation in &rule.operations {}
    }
    output.serialize(line)
}
/// Wraps a [`DeriveNode`] so it can be executed through [`RunnableNode`].
pub struct DeriveNodeRunner {
derive_node: DeriveNode,
}
impl RunnableNode for DeriveNodeRunner {
fn run(&self) -> anyhow::Result<()> {
let mut reader = csv::Reader::from_path(&self.derive_node.input_file_path)?;
let mut writer = csv::Writer::from_path(&self.derive_node.output_file_path)?;
let rules: anyhow::Result<Vec<RunnableDeriveRule>> = self
.derive_node
.rules
.iter()
.map(|rule| rule.to_runnable_rule())
.collect();
let rules = rules?;
derive(&rules, &mut reader, &mut writer)
}
}

View File

@@ -1,79 +1,15 @@
use std::{collections::BTreeMap, str::FromStr};
use std::collections::BTreeMap;
use serde::{Deserialize, Serialize};
use crate::{
derive::{DeriveFilter, MatchComparisonType},
derive::{to_filter_rules, DataValidator, DataValidators, DeriveFilter},
io::{RecordDeserializer, RecordSerializer},
node::RunnableNode,
};
/// A comparison against one reference value, or a list of them, of type `T`.
pub enum Comparator<T: PartialOrd> {
    /// Satisfied when the candidate equals the reference.
    Equal(T),
    /// Satisfied when the candidate differs from the reference.
    NotEqual(T),
    /// Satisfied when the candidate is strictly greater than the reference.
    GreaterThan(T),
    /// Satisfied when the candidate is strictly less than the reference.
    LessThan(T),
    /// Satisfied when the candidate equals any listed value.
    In(Vec<T>),
    /// Satisfied when the candidate equals none of the listed values.
    NotIn(Vec<T>),
}

impl<T: PartialOrd> Comparator<T> {
    /// Returns `true` when `value` satisfies this comparison.
    pub fn is_valid(&self, value: T) -> bool {
        let candidate = &value;
        match self {
            Self::Equal(expected) => candidate == expected,
            Self::NotEqual(rejected) => candidate != rejected,
            Self::GreaterThan(floor) => candidate > floor,
            Self::LessThan(ceiling) => candidate < ceiling,
            Self::In(allowed) => allowed.iter().any(|member| member == candidate),
            Self::NotIn(denied) => !denied.iter().any(|member| member == candidate),
        }
    }
}
/// Implemented by validators that are bound to a single named record field.
pub trait FieldName {
/// Returns the name of the field this validator should work on.
fn get_field_name(&self) -> String;
}
/// A check applied to the raw string value of the field named by [`FieldName`].
pub trait DataValidator: FieldName {
/// Returns whether the given value is valid for the validator.
fn is_valid(&self, s: &str) -> bool;
}
/// A typed filter: the column it reads and the comparison applied to that column's value.
pub struct FilterRule<T: PartialOrd> {
    /// Name of the column whose value is checked.
    column_name: String,
    /// Comparison the parsed column value must satisfy.
    comparator: Comparator<T>,
}

impl<T: PartialOrd> FieldName for FilterRule<T> {
    /// Returns an owned copy of the column name this rule applies to.
    fn get_field_name(&self) -> String {
        self.column_name.to_owned()
    }
}

impl<T: FromStr + PartialOrd> DataValidator for FilterRule<T> {
    /// Parses `s` as `T` and runs the comparator on the result.
    ///
    /// Input that fails to parse is reported as invalid rather than as an error.
    fn is_valid(&self, s: &str) -> bool {
        match s.parse::<T>() {
            Ok(parsed) => self.comparator.is_valid(parsed),
            Err(_) => false,
        }
    }
}
/**
* Write all lines from the input file to the output file, skipping records
* that don't satisfy the filter criteria
*/
pub fn filter_file(
rules: &Vec<Box<dyn DataValidator>>,
input: &mut impl RecordDeserializer,
output: &mut impl RecordSerializer,
) -> anyhow::Result<()> {
if let Some(line) = input.deserialize()? {
let line: BTreeMap<String, String> = line;
output.write_header(&line)?;
output.write_record(&line)?;
while let Some(line) = input.deserialize()? {
let line: BTreeMap<String, String> = line;
if rules.iter().all(|rule| {
fn is_line_valid(line: &BTreeMap<String, String>, rules: &DataValidators) -> bool {
rules.iter().all(|rule| {
line.get(&rule.get_field_name()).map_or(true, |value| {
if value.trim().is_empty() {
true
@@ -81,7 +17,29 @@ pub fn filter_file(
rule.is_valid(value)
}
})
}) {
})
}
/**
* Write all lines from the input file to the output file, skipping records
* that don't satisfy the filter criteria
*/
pub fn filter_file(
rules: &DataValidators,
input: &mut impl RecordDeserializer,
output: &mut impl RecordSerializer,
) -> anyhow::Result<()> {
if let Some(line) = input.deserialize()? {
let line: BTreeMap<String, String> = line;
output.write_header(&line)?;
if (is_line_valid(&line, &rules)) {
output.write_record(&line)?;
}
while let Some(line) = input.deserialize()? {
let line: BTreeMap<String, String> = line;
if is_line_valid(&line, rules) {
output.write_record(&line)?;
}
}
@@ -97,45 +55,6 @@ pub struct FilterNode {
pub output_file_path: String,
}
impl FilterNode {
    /// Converts this node's filter definitions into boxed validators, stopping at the
    /// first definition that fails to convert.
    fn to_filter_rules(&self) -> anyhow::Result<Vec<Box<dyn DataValidator>>> {
        // The per-filter conversion lives in `to_filter_rule`; inlining it here caused a
        // compiler error.
        let mut rules = Vec::new();
        for filter in &self.filters {
            rules.push(to_filter_rule(filter)?);
        }
        Ok(rules)
    }
}
fn to_filter_rule(filter: &DeriveFilter) -> anyhow::Result<Box<dyn DataValidator>> {
let value = filter.match_value.clone();
match filter.value_type {
crate::derive::ValueType::String => Ok(Box::new(get_filter_rule(filter, value))),
crate::derive::ValueType::Integer => {
Ok(Box::new(get_filter_rule(filter, value.parse::<i64>()?)))
}
crate::derive::ValueType::Float => {
Ok(Box::new(get_filter_rule(filter, value.parse::<f64>()?)))
}
crate::derive::ValueType::Boolean => {
Ok(Box::new(get_filter_rule(filter, value.parse::<bool>()?)))
}
}
}
/// Builds a typed [`FilterRule`] for `filter`'s column, wrapping `value` in the comparator
/// variant that corresponds to the filter's configured comparison type.
fn get_filter_rule<T: PartialOrd>(filter: &DeriveFilter, value: T) -> FilterRule<T> {
    let comparator = match filter.comparator {
        MatchComparisonType::Equal => Comparator::Equal(value),
        MatchComparisonType::NotEqual => Comparator::NotEqual(value),
        MatchComparisonType::GreaterThan => Comparator::GreaterThan(value),
        MatchComparisonType::LessThan => Comparator::LessThan(value),
    };
    let column_name = filter.column_name.clone();
    FilterRule {
        column_name,
        comparator,
    }
}
/// Wraps a [`FilterNode`] so it can be executed through [`RunnableNode`].
pub struct FilterNodeRunner {
pub filter_node: FilterNode,
}
@@ -144,14 +63,15 @@ impl RunnableNode for FilterNodeRunner {
fn run(&self) -> anyhow::Result<()> {
let mut reader = csv::Reader::from_path(&self.filter_node.input_file_path)?;
let mut writer = csv::Writer::from_path(&self.filter_node.output_file_path)?;
let rules = self.filter_node.to_filter_rules()?;
let rules = to_filter_rules(&self.filter_node.filters)?;
filter_file(&rules, &mut reader, &mut writer)
}
}
#[cfg(test)]
mod tests {
use crate::filter::FilterRule;
use crate::derive::{Comparator, FilterRule};
use super::filter_file;
@@ -183,7 +103,7 @@ Value3,Value4
filter_file(
&vec![Box::new(FilterRule {
column_name: "Column1".to_owned(),
comparator: crate::filter::Comparator::NotEqual("Value3".to_owned()),
comparator: Comparator::NotEqual("Value3".to_owned()),
})],
&mut reader,
&mut writer,