Delete split node. It won't be very useful in most circumstances, and complex use cases that involve row splitting can be served via wasm or sql nodes instead.

2025-02-11 07:58:55 +10:30
parent 9578bd6965
commit c8fd8734eb
2 changed files with 24 additions and 194 deletions
--- a/src/graph/mod.rs
+++ b/src/graph/mod.rs
@@ -1,15 +1,12 @@
 use std::{
    cmp::{min, Ordering},
    collections::{HashMap, HashSet},
-    sync::{
+    sync::{mpsc, Arc},
        mpsc, Arc,
    },
 };
 use chrono::Local;
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
 use split::{SplitNode, SplitNodeRunner};
 use tokio::sync::mpsc::Sender;
 use crate::graph::derive::DeriveNodeRunner;
@@ -24,15 +21,14 @@ use {
 };
 mod derive;
 mod dynamic;
 mod filter;
 mod node;
 mod pull_from_db;
-mod split;
+mod reduction;
 mod sql;
 mod sql_rule;
 mod upload_to_db;
 mod dynamic;
 mod reduction;
 #[derive(Serialize, Deserialize, Clone, JsonSchema)]
 pub enum NodeConfiguration {
@@ -44,7 +40,6 @@ pub enum NodeConfiguration {
    UploadNode(UploadNode),
    SQLNode(SQLNode),
    Dynamic(DynamicNode),
    SplitNode(SplitNode),
    ReductionNode(ReductionNode),
 }
@@ -147,8 +142,9 @@ fn get_runnable_node(node: Node) -> Box<dyn RunnableNode + Send> {
        NodeConfiguration::UploadNode(upload_node) => Box::new(UploadNodeRunner { upload_node }),
        NodeConfiguration::SQLNode(sql_node) => Box::new(SQLNodeRunner { sql_node }),
        NodeConfiguration::Dynamic(dynamic_node) => Box::new(DynamicNodeRunner { dynamic_node }),
-        NodeConfiguration::SplitNode(split_node) => Box::new(SplitNodeRunner { split_node }),
+        NodeConfiguration::ReductionNode(reduction_node) => {
-        NodeConfiguration::ReductionNode(reduction_node) => Box::new(ReductionNodeRunner { reduction_node }),
+            Box::new(ReductionNodeRunner { reduction_node })
        }
    }
 }
@@ -175,7 +171,11 @@ impl RunnableGraph {
        RunnableGraph { graph }
    }
-    pub async fn run_default_tasks<F>(&self, num_threads: usize, status_changed: F) -> anyhow::Result<()>
+    pub async fn run_default_tasks<F>(
        &self,
        num_threads: usize,
        status_changed: F,
    ) -> anyhow::Result<()>
    where
        F: Fn(i64, NodeStatus),
    {
@@ -183,7 +183,8 @@ impl RunnableGraph {
            num_threads,
            Box::new(|node| get_runnable_node(node)),
            status_changed,
-        ).await
+        )
        .await
    }
    pub async fn run<'a, F, StatusChanged>(
@@ -328,8 +329,14 @@ mod tests {
                        name: "Hello".to_owned(),
                        configuration: NodeConfiguration::FilterNode(super::FilterNode {
                            filters: vec![],
-                            input_data_source: DataSource { path: PathBuf::from(""), source_type: SourceType::CSV },
+                            input_data_source: DataSource {
-                            output_data_source: DataSource { path: PathBuf::from(""), source_type: SourceType::CSV },
+                                path: PathBuf::from(""),
                                source_type: SourceType::CSV,
                            },
                            output_data_source: DataSource {
                                path: PathBuf::from(""),
                                source_type: SourceType::CSV,
                            },
                        }),
                        output_files: vec![],
                        dynamic_configuration: None,
--- a/src/graph/split.rs
+++ b/src/graph/split.rs
@@ -1,177 +0,0 @@
 use async_trait::async_trait;
 use chrono::DateTime;
 // use polars::io::SerReader;
 // use polars::prelude::ParquetReader;
 use polars::{
    io::SerWriter,
    prelude::{CsvWriter, LazyCsvReader, LazyFileListReader},
 };
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
 use std::collections::BTreeMap;
 // use std::fs::File;
 use tempfile::tempfile;
 use crate::io::RecordSerializer;
 use super::{
    derive::{self, DataValidator, DeriveFilter},
    node::RunnableNode,
 };
 #[derive(Serialize, Deserialize, Clone, JsonSchema)]
 pub enum DatePart {
    Year,
    Month,
    Week,
    Day,
    Hour,
    Minute,
    Second,
 }
 #[derive(Serialize, Deserialize, Clone, JsonSchema)]
 pub enum SplitType {
    // Column, frequency
    DateTime(DatePart),
    Numeric(isize),
 }
 #[derive(Serialize, Deserialize, Clone, JsonSchema)]
 pub struct SplitOnChangeInColumn {
    id_column: String,
    change_column: String,
    limit: Option<u64>,
 }
 #[derive(Serialize, Deserialize, Clone, JsonSchema)]
 pub struct Split {
    column: String,
    split_type: SplitType,
    // If specified, a split will also be made when the change column changes for the id column
    change_in_column: Option<SplitOnChangeInColumn>,
 }
 #[derive(Serialize, Deserialize, Clone, JsonSchema)]
 pub struct SplitRule {
    pub filters: Vec<DeriveFilter>,
    pub splits: Vec<Split>,
 }
 impl SplitRule {
    fn to_runnable_rule(&self) -> anyhow::Result<RunnableSplitRule> {
        let filters = derive::to_filter_rules(&self.filters)?;
        Ok(RunnableSplitRule {
            filters,
            splits: self.splits.clone(),
        })
    }
 }
 #[derive(Serialize, Deserialize, Clone, JsonSchema)]
 pub struct SplitNode {
    pub filters: Vec<DeriveFilter>,
    pub rules: Vec<SplitRule>,
    pub input_file_path: String,
    pub output_file_path: String,
 }
 pub struct RunnableSplitRule {
    pub filters: Vec<Box<dyn DataValidator>>,
    pub splits: Vec<Split>,
 }
 pub struct SplitNodeRunner {
    pub split_node: SplitNode,
 }
 fn split_line(
    line: BTreeMap<String, String>,
    rules: &Vec<RunnableSplitRule>,
    output: &mut impl RecordSerializer,
    date_format: &str,
    last_split_value: Option<(String, String)>,
 ) -> anyhow::Result<Option<(String, String)>> {
    let mut result_lines = vec![];
    for rule in rules {
        if !derive::is_line_valid(&line, &rule.filters) {
            continue;
        }
        for split in &rule.splits {
            let value = line.get(&split.column);
            if let Some(value) = value {
                // Parse the value in the column for the rule
                match &split.split_type {
                    SplitType::DateTime(frequency) => {
                        let date_time = DateTime::parse_from_str(&value, &date_format)?;
                        // TODO: Now split the row up based on the frequency in the rule
                    }
                    SplitType::Numeric(frequency) => {
                        // TODO: Just skip unparsable values, log out it's incorrect?
                        let number = value.parse::<f64>()?;
                    }
                }
            }
        }
    }
    result_lines.push(line);
    for line in result_lines {
        output.serialize(line)?;
    }
    Ok(None)
 }
 fn split(
    rules: &Vec<RunnableSplitRule>,
    input: &String,
    output: &mut impl RecordSerializer,
    date_format: &str,
 ) -> anyhow::Result<()> {
    // First sort the input file into the output file
    let mut temp_path = tempfile()?;
    // This needs to be done for each split rule with a change column specified
    // TODO: Add parquet support (both read and write)
    // let file = File::open(input)?;
    // let df = ParquetReader::new(file).finish()?;
    let df = LazyCsvReader::new(input).finish()?;
    // TODO: Needs sorting
    let df = df.sort(["", ""], Default::default());
    CsvWriter::new(&mut temp_path).finish(&mut df.collect()?)?;
    // Then read from the temporary file (since it's sorted) and do the standard split over each row
    let mut input = csv::Reader::from_reader(temp_path);
    if let Some(line) = input.deserialize().next() {
        let line: BTreeMap<String, String> = line?;
        output.write_header(&line)?;
        let mut last_split_value = split_line(line, rules, output, &date_format, None)?;
        for line in input.deserialize() {
            let line: BTreeMap<String, String> = line?;
            last_split_value = split_line(line, rules, output, &date_format, last_split_value)?;
        }
    }
    Ok(())
 }
 #[async_trait]
 impl RunnableNode for SplitNodeRunner {
    async fn run(&self) -> anyhow::Result<()> {
        let mut output = csv::Writer::from_path(&self.split_node.output_file_path)?;
        let rules: anyhow::Result<Vec<RunnableSplitRule>> = self
            .split_node
            .rules
            .iter()
            .map(|rule| rule.to_runnable_rule())
            .collect();
        let rules = rules?;
        split(
            &rules,
            &self.split_node.input_file_path,
            &mut output,
            "%Y-%m-%d %H-%M-%S",
        )
    }
 }