Add initial reduction node implementation, start adding dynamic node
All checks were successful
test / test (push) Successful in 15m58s
All checks were successful
test / test (push) Successful in 15m58s
This commit is contained in:
38
src/graph/dynamic.rs
Normal file
38
src/graph/dynamic.rs
Normal file
@@ -0,0 +1,38 @@
|
||||
use crate::graph::node::RunnableNode;
|
||||
use async_trait::async_trait;
|
||||
use schemars::JsonSchema;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use wasmtime::component::{bindgen, Component};
|
||||
use wasmtime::{Config, Engine, Linker, Store};
|
||||
|
||||
// Generate host-side bindings for the component's WIT world.
// NOTE(review): invoked with no arguments, so wasmtime resolves the world
// from the crate's `wit/` directory (not visible here) — confirm the world
// name matches the `Dynamic` bindings referenced in the commented code below.
bindgen!();
|
||||
|
||||
/// Configuration for a node whose behaviour is provided at runtime by a
/// WebAssembly component.
#[derive(Serialize, Deserialize, Clone, JsonSchema)]
pub struct DynamicNode {
    /// Path to the compiled `.wasm` component to load.
    pub wasm_file: String,
    // NOTE(review): `file_paths` and `output_file` are not yet read by the
    // runner below — presumably the component's inputs/output; confirm once
    // instantiation is wired up.
    pub file_paths: Vec<String>,
    pub output_file: String,
}
|
||||
|
||||
/// Runner that executes a [`DynamicNode`]. Currently only sets up the
/// wasmtime engine; component instantiation is still stubbed out in `run`.
pub struct DynamicNodeRunner {
    /// The configuration describing which wasm component to run.
    pub dynamic_node: DynamicNode,
}
|
||||
|
||||
#[async_trait]
impl RunnableNode for DynamicNodeRunner {
    /// Set up a wasmtime engine with the component model enabled.
    ///
    /// The component loading/linking/instantiation steps are still commented
    /// out, so this currently does no work beyond engine and store
    /// construction and always returns `Ok(())`.
    async fn run(&self) -> anyhow::Result<()> {
        let mut config = Config::new();
        // The component model must be opted into explicitly on the config.
        config.wasm_component_model(true);
        let engine = Engine::new(&config)?;
        // let component = Component::from_file(&engine, self.dynamic_node.wasm_file.to_owned())?;
        // let mut linker = Linker::new(&engine);
        // ::add_to_linker(&mut linker, |state: &mut TestState| state)?;
        // NOTE(review): the store's host data is a *borrow* of the node
        // config, while the commented linker closure expects `&mut TestState`
        // — these need to be reconciled before instantiation is enabled.
        let mut store = Store::new(
            &engine,
            &self.dynamic_node,
        );
        // let (bindings, _) = Dynamic::instantiate(&mut store, &component, &linker)?;
        // bindings.call_greet(&mut store)?;
        Ok(())
    }
}
|
||||
@@ -13,6 +13,8 @@ use split::{SplitNode, SplitNodeRunner};
|
||||
use tokio::sync::mpsc::Sender;
|
||||
|
||||
use crate::graph::derive::DeriveNodeRunner;
|
||||
use crate::graph::dynamic::{DynamicNode, DynamicNodeRunner};
|
||||
use crate::graph::reduction::{ReductionNode, ReductionNodeRunner};
|
||||
use {
|
||||
derive::DeriveNode,
|
||||
filter::{FilterNode, FilterNodeRunner},
|
||||
@@ -29,10 +31,11 @@ mod split;
|
||||
mod sql;
|
||||
mod sql_rule;
|
||||
mod upload_to_db;
|
||||
mod dynamic;
|
||||
mod reduction;
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, JsonSchema)]
|
||||
pub enum NodeConfiguration {
|
||||
FileNode,
|
||||
MoveMoneyNode(MoveMoneyNode),
|
||||
MergeNode(MergeNode),
|
||||
DeriveNode(DeriveNode),
|
||||
@@ -40,8 +43,9 @@ pub enum NodeConfiguration {
|
||||
FilterNode(FilterNode),
|
||||
UploadNode(UploadNode),
|
||||
SQLNode(SQLNode),
|
||||
Dynamic,
|
||||
Dynamic(DynamicNode),
|
||||
SplitNode(SplitNode),
|
||||
ReductionNode(ReductionNode),
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, JsonSchema)]
|
||||
@@ -135,7 +139,6 @@ impl Node {
|
||||
|
||||
fn get_runnable_node(node: Node) -> Box<dyn RunnableNode + Send> {
|
||||
match node.info.configuration {
|
||||
NodeConfiguration::FileNode => todo!(),
|
||||
NodeConfiguration::MoveMoneyNode(_) => todo!(),
|
||||
NodeConfiguration::MergeNode(_) => todo!(),
|
||||
NodeConfiguration::DeriveNode(derive_node) => Box::new(DeriveNodeRunner { derive_node }),
|
||||
@@ -143,8 +146,9 @@ fn get_runnable_node(node: Node) -> Box<dyn RunnableNode + Send> {
|
||||
NodeConfiguration::FilterNode(filter_node) => Box::new(FilterNodeRunner { filter_node }),
|
||||
NodeConfiguration::UploadNode(upload_node) => Box::new(UploadNodeRunner { upload_node }),
|
||||
NodeConfiguration::SQLNode(sql_node) => Box::new(SQLNodeRunner { sql_node }),
|
||||
NodeConfiguration::Dynamic => todo!(),
|
||||
NodeConfiguration::Dynamic(dynamic_node) => Box::new(DynamicNodeRunner { dynamic_node }),
|
||||
NodeConfiguration::SplitNode(split_node) => Box::new(SplitNodeRunner { split_node }),
|
||||
NodeConfiguration::ReductionNode(reduction_node) => Box::new(ReductionNodeRunner { reduction_node }),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
83
src/graph/reduction.rs
Normal file
83
src/graph/reduction.rs
Normal file
@@ -0,0 +1,83 @@
|
||||
use std::fs::File;
|
||||
|
||||
use crate::graph::node::RunnableNode;
|
||||
use crate::io::DataSource;
|
||||
use async_trait::async_trait;
|
||||
use itertools::Itertools;
|
||||
use polars::io::SerWriter;
|
||||
use polars::prelude::{col, lit, CsvWriter, Expr, LazyCsvReader, LazyFileListReader};
|
||||
use schemars::JsonSchema;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
|
||||
fn reduce(grouping_nodes: &Vec<String>, operations: &Vec<ReductionOperation>, input: &DataSource, output: &DataSource) -> anyhow::Result<()> {
|
||||
let df = LazyCsvReader::new(&input.path).finish()?;
|
||||
|
||||
let mut df = df
|
||||
.group_by(grouping_nodes.iter().map(|column| col(column)).collect_vec())
|
||||
.agg(&operations.iter().map(|operation| operation.to_aggregate_function()).collect_vec())
|
||||
.collect()?;
|
||||
let mut file = File::create(&output.path)?;
|
||||
CsvWriter::new(&mut file).finish(&mut df)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// The aggregate function applied to a single column within each group.
#[derive(Serialize, Deserialize, Clone, JsonSchema)]
pub enum ReductionOperationType {
    /// Sum of the group's values.
    Sum,
    /// Product of the group's values (maps to polars `product`).
    Multiply,
    /// Maximum value in the group.
    Max,
    /// Minimum value in the group.
    Min,
    /// Arithmetic mean of the group's values.
    Average,
    /// Count of the group's values.
    Count,
    /// String-join of the group's values, decorated with prefix/suffix.
    Concat(ConcatProperties),
}
|
||||
|
||||
/// Decoration applied when string-concatenating a group's values
/// (see [`ReductionOperationType::Concat`]).
#[derive(Serialize, Deserialize, Clone, JsonSchema)]
pub struct ConcatProperties {
    /// Text prepended before the joined values.
    pub prefix: String,
    /// Text appended after the joined values.
    pub suffix: String,
    /// Separator inserted between consecutive values.
    pub separator: String,
}
|
||||
|
||||
/// One aggregation: which column to reduce and which reducer to apply.
#[derive(Serialize, Deserialize, Clone, JsonSchema)]
pub struct ReductionOperation {
    /// Name of the column the reducer runs over.
    pub column_name: String,
    /// The aggregate function to apply to that column.
    pub reducer: ReductionOperationType,
    // TODO: Consider adding filtering on another column, so we can get sum of males grouping on some other column
    // TODO: Also consider adding an alias here
}
|
||||
|
||||
impl ReductionOperation {
    /// Translate this operation into a polars aggregate [`Expr`] over
    /// `self.column_name`, for use inside a `group_by(..).agg(..)` call.
    fn to_aggregate_function(&self) -> Expr {
        match &self.reducer {
            ReductionOperationType::Sum => col(&self.column_name).sum(),
            // TODO: Check this, docs are a bit flaky on whether this'll work
            ReductionOperationType::Multiply => col(&self.column_name).product(),
            ReductionOperationType::Max => col(&self.column_name).max(),
            ReductionOperationType::Min => col(&self.column_name).min(),
            ReductionOperationType::Average => col(&self.column_name).mean(),
            // NOTE(review): whether `count` includes nulls varies across
            // polars versions — confirm against the pinned version's docs.
            ReductionOperationType::Count => col(&self.column_name).count(),
            // Joins the group's values with `separator`, then wraps the joined
            // string in `prefix`/`suffix` via `append`.
            // NOTE(review): assumes `Expr::append` concatenates string exprs
            // the way intended here — verify against polars docs; also note
            // the cloned prefix/suffix/separator literals are re-evaluated per
            // call, which is fine for config-sized strings.
            ReductionOperationType::Concat(concat_properties) => lit(concat_properties.prefix.clone()).append(col(&self.column_name).list().join(lit(concat_properties.separator.clone()), true), false).append(lit(concat_properties.suffix.clone()), false),
        }
    }
}
|
||||
|
||||
/// Configuration for a reduction (group-by + aggregate) step of the graph.
#[derive(Serialize, Deserialize, Clone, JsonSchema)]
pub struct ReductionNode {
    /// Columns to group rows by before aggregating.
    pub grouping_columns: Vec<String>,
    /// Aggregations applied within each group, one output column per entry.
    pub operations: Vec<ReductionOperation>,
    /// CSV source the rows are read from.
    pub input_file: DataSource,
    /// CSV destination the aggregated frame is written to.
    pub output_file: DataSource,
}
|
||||
|
||||
/// Runner that executes a [`ReductionNode`] by delegating to `reduce`.
pub struct ReductionNodeRunner {
    /// The configuration describing the grouping and aggregations to run.
    pub reduction_node: ReductionNode,
}
|
||||
#[async_trait]
|
||||
impl RunnableNode for ReductionNodeRunner {
|
||||
async fn run(&self) -> anyhow::Result<()> {
|
||||
reduce(&self.reduction_node.grouping_columns, &self.reduction_node.operations, &self.reduction_node.input_file, &self.reduction_node.output_file)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user