Skip to content

Optimize BinaryExpr Evaluation with Short-Circuiting for AND/OR Operators #15648

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 33 commits into from
Closed
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
03e02bd
Enhance short-circuit evaluation for binary expressions
kosiew Apr 9, 2025
fc49c34
refactor: rename test_check_short_circuit to test_get_short_circuit_r…
kosiew Apr 9, 2025
b5ab6f4
fix: enhance short-circuit evaluation logic in get_short_circuit_resu…
kosiew Apr 9, 2025
f5c83e2
feat: add debug logging for binary expression evaluation and short-ci…
kosiew Apr 9, 2025
7971035
fix: improve short-circuit evaluation logic in BinaryExpr to ensure R…
kosiew Apr 9, 2025
4fa7711
fix: restrict short-circuit evaluation to logical operators in get_sh…
kosiew Apr 9, 2025
25394d8
add more println!("==> ");
kosiew Apr 9, 2025
f533532
fix: remove duplicate data type checks for left and right operands in…
kosiew Apr 9, 2025
5b1641c
feat: add debug prints for dictionary values and keys in binary expre…
kosiew Apr 9, 2025
d416c72
Tests pass
kosiew Apr 9, 2025
ed6451f
fix: remove redundant short-circuit evaluation check in BinaryExpr an…
kosiew Apr 9, 2025
ae25207
refactor: remove unnecessary debug prints and streamline short-circui…
kosiew Apr 9, 2025
308d7b4
test: enhance short-circuit evaluation tests for nullable and scalar …
kosiew Apr 9, 2025
a62df47
refactor: enhance short-circuit evaluation strategy in BinaryExpr to …
kosiew Apr 9, 2025
17ca22b
Revert "refactor: enhance short-circuit evaluation strategy in Binary…
kosiew Apr 9, 2025
3fe3742
bench: add benchmark for OR operation with all false values in short-…
kosiew Apr 10, 2025
89fb9e3
Merge branch 'main' into short-and-enum
kosiew Apr 10, 2025
582b38a
refactor: add ShortCircuitStrategy enum to optimize short-circuit eva…
kosiew Apr 10, 2025
31fd5d8
refactor: simplify short-circuit evaluation logic in check_short_circ…
kosiew Apr 10, 2025
aef4153
datafusion_expr::lit as expr_lit
kosiew Apr 10, 2025
85c6b67
fix short_circuit/or/all_false benchmark
kosiew Apr 10, 2025
8b910e1
refactor: optimize short-circuit evaluation in check_short_circuit fu…
kosiew Apr 10, 2025
e2b9f77
refactor: add count_boolean_values helper function and optimize check…
kosiew Apr 10, 2025
b130b24
Revert "refactor: add count_boolean_values helper function and optimi…
kosiew Apr 10, 2025
9faaee3
add benchmark
kosiew Apr 10, 2025
3c4a5fd
Merge branch 'main' into short-and
kosiew Apr 10, 2025
c1987e3
refactor: improve short-circuit logic in BinaryExpr for logical opera…
kosiew Apr 10, 2025
982e847
Merge branch 'short-and-enum' into short-and
kosiew Apr 10, 2025
534c5a3
check main binary_op.rs
kosiew Apr 11, 2025
d96b50d
optimise evaluate
kosiew Apr 11, 2025
583095e
optimise evaluate 2
kosiew Apr 11, 2025
19de99c
refactor op:AND, lhs all false op:OR, lhs all true to be faster
kosiew Apr 11, 2025
8f3ef97
fix clippy warning
kosiew Apr 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
298 changes: 233 additions & 65 deletions datafusion/physical-expr/src/expressions/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -358,11 +358,17 @@ impl PhysicalExpr for BinaryExpr {
fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
use arrow::compute::kernels::numeric::*;

// Evaluate left-hand side expression.
let lhs = self.left.evaluate(batch)?;

// Optimize for short-circuiting `Operator::And` or `Operator::Or` operations and return early.
if check_short_circuit(&lhs, &self.op) {
return Ok(lhs);
// Check if we can apply short-circuit evaluation.
match check_short_circuit(&lhs, &self.op) {
ShortCircuitStrategy::ReturnLeft => return Ok(lhs),
ShortCircuitStrategy::ReturnRight => {
let rhs = self.right.evaluate(batch)?;
return Ok(rhs);
}
ShortCircuitStrategy::None => {} // Continue if no short-circuit applies.
}

let rhs = self.right.evaluate(batch)?;
Expand Down Expand Up @@ -405,23 +411,19 @@ impl PhysicalExpr for BinaryExpr {

let result_type = self.data_type(input_schema)?;

// Attempt to use special kernels if one input is scalar and the other is an array
let scalar_result = match (&lhs, &rhs) {
(ColumnarValue::Array(array), ColumnarValue::Scalar(scalar)) => {
// if left is array and right is literal(not NULL) - use scalar operations
if scalar.is_null() {
None
} else {
self.evaluate_array_scalar(array, scalar.clone())?.map(|r| {
r.and_then(|a| to_result_type_array(&self.op, a, &result_type))
})
// If the left-hand side is an array and the right-hand side is a non-null scalar, try the optimized kernel.
if let (ColumnarValue::Array(array), ColumnarValue::Scalar(ref scalar)) =
(&lhs, &rhs)
{
if !scalar.is_null() {
if let Some(result_array) =
self.evaluate_array_scalar(array, scalar.clone())?
{
let final_array = result_array
.and_then(|a| to_result_type_array(&self.op, a, &result_type));
return final_array.map(ColumnarValue::Array);
Comment on lines +414 to +424
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

a rewrite while trying to optimize this function.

}
}
(_, _) => None, // default to array implementation
};

if let Some(result) = scalar_result {
return result.map(ColumnarValue::Array);
}

// if both arrays or both literals - extract arrays and continue execution
Expand Down Expand Up @@ -811,58 +813,109 @@ impl BinaryExpr {
}
}

enum ShortCircuitStrategy {
ReturnLeft,
ReturnRight,
None,
}
/// Checks if a logical operator (`AND`/`OR`) can short-circuit evaluation based on the left-hand side (lhs) result.
///
/// Short-circuiting occurs when evaluating the right-hand side (rhs) becomes unnecessary:
/// - For `AND`: if ALL values in `lhs` are `false`, the expression must be `false` regardless of rhs.
/// - For `OR`: if ALL values in `lhs` are `true`, the expression must be `true` regardless of rhs.
///
/// Returns `true` if short-circuiting is possible, `false` otherwise.
///
/// Short-circuiting occurs under these circumstances:
/// - For `AND`:
/// - if LHS is all false => short-circuit → return LHS
/// - if LHS is all true => short-circuit → return RHS
/// - For `OR`:
/// - if LHS is all true => short-circuit → return LHS
/// - if LHS is all false => short-circuit → return RHS
/// # Arguments
/// * `arg` - The left-hand side (lhs) columnar value (array or scalar)
/// * `lhs` - The left-hand side (lhs) columnar value (array or scalar)
/// * `lhs` - The left-hand side (lhs) columnar value (array or scalar)
/// * `op` - The logical operator (`AND` or `OR`)
///
/// # Implementation Notes
/// 1. Only works with Boolean-typed arguments (other types automatically return `false`)
/// 1. Only works with Boolean-typed arguments (other types automatically return `None`)
/// 2. Handles both scalar values and array values
/// 3. For arrays, uses optimized `true_count()`/`false_count()` methods from arrow-rs.
/// `bool_or`/`bool_and` maybe a better choice too,for detailed discussion,see:[link](https://github.com/apache/datafusion/pull/15462#discussion_r2020558418)
fn check_short_circuit(arg: &ColumnarValue, op: &Operator) -> bool {
let data_type = arg.data_type();
match (data_type, op) {
(DataType::Boolean, Operator::And) => {
match arg {
ColumnarValue::Array(array) => {
if let Ok(array) = as_boolean_array(&array) {
return array.false_count() == array.len();
}
/// 3. For arrays, uses optimized bit counting techniques for boolean arrays
fn check_short_circuit(lhs: &ColumnarValue, op: &Operator) -> ShortCircuitStrategy {
// Quick reject for non-logical operators
if !matches!(op, Operator::And | Operator::Or) {
return ShortCircuitStrategy::None;
}

// Non-boolean types can't be short-circuited
if lhs.data_type() != DataType::Boolean {
return ShortCircuitStrategy::None;
}

// For AND operations, we prioritize detecting all-false cases
// For OR operations, we prioritize detecting all-true cases
let is_and = matches!(op, Operator::And);

match lhs {
ColumnarValue::Array(array) => {
// Fast path for arrays - try to downcast to boolean array
if let Ok(bool_array) = as_boolean_array(&array) {
// Arrays with nulls can't be short-circuited
if bool_array.null_count() > 0 {
return ShortCircuitStrategy::None;
}
ColumnarValue::Scalar(scalar) => {
if let ScalarValue::Boolean(Some(value)) = scalar {
return !value;

let len = bool_array.len();
if len == 0 {
return ShortCircuitStrategy::None;
}

if is_and {
// For AND, prioritize checking for all-false (short circuit case)
// Uses optimized false_count() method provided by Arrow
let false_count = bool_array.false_count();

// Short circuit if all values are false
if false_count == len {
return ShortCircuitStrategy::ReturnLeft;
}

// If no false values, then all must be true
if false_count == 0 {
return ShortCircuitStrategy::ReturnRight;
}
} else {
// For OR, prioritize checking for all-true (short circuit case)
// Uses optimized true_count() method provided by Arrow
let true_count = bool_array.true_count();

// Short circuit if all values are true
if true_count == len {
return ShortCircuitStrategy::ReturnLeft;
}

// If no true values, then all must be false
if true_count == 0 {
return ShortCircuitStrategy::ReturnRight;
}
}
}
false
}
(DataType::Boolean, Operator::Or) => {
match arg {
ColumnarValue::Array(array) => {
if let Ok(array) = as_boolean_array(&array) {
return array.true_count() == array.len();
}
}
ColumnarValue::Scalar(scalar) => {
if let ScalarValue::Boolean(Some(value)) = scalar {
return *value;
}
ColumnarValue::Scalar(scalar) => {
// Fast path for scalar values
if let ScalarValue::Boolean(Some(value)) = scalar {
// Direct logic based on the value and operation type
let is_true = *value;

// Return Left for:
// - AND with false value
// - OR with true value
if (is_and && !is_true) || (!is_and && is_true) {
return ShortCircuitStrategy::ReturnLeft;
} else {
return ShortCircuitStrategy::ReturnRight;
}
}
false
}
_ => false,
}

// If we can't short-circuit, indicate that normal evaluation should continue
ShortCircuitStrategy::None
}

fn concat_elements(left: Arc<dyn Array>, right: Arc<dyn Array>) -> Result<ArrayRef> {
Expand Down Expand Up @@ -919,10 +972,14 @@ pub fn similar_to(
mod tests {
use super::*;
use crate::expressions::{col, lit, try_cast, Column, Literal};
use datafusion_expr::lit as expr_lit;

use datafusion_common::plan_datafusion_err;
use datafusion_physical_expr_common::physical_expr::fmt_sql;

use crate::planner::logical2physical;
use arrow::array::BooleanArray;
use datafusion_expr::col as logical_col;
/// Performs a binary operation, applying any type coercion necessary
fn binary_op(
left: Arc<dyn PhysicalExpr>,
Expand Down Expand Up @@ -4895,9 +4952,7 @@ mod tests {

#[test]
fn test_check_short_circuit() {
use crate::planner::logical2physical;
use datafusion_expr::col as logical_col;
use datafusion_expr::lit;
// Test with non-nullable arrays
let schema = Arc::new(Schema::new(vec![
Field::new("a", DataType::Int32, false),
Field::new("b", DataType::Int32, false),
Expand All @@ -4911,20 +4966,133 @@ mod tests {
.unwrap();

// op: AND left: all false
let left_expr = logical2physical(&logical_col("a").eq(lit(2)), &schema);
let left_expr = logical2physical(&logical_col("a").eq(expr_lit(2)), &schema);
let left_value = left_expr.evaluate(&batch).unwrap();
assert!(check_short_circuit(&left_value, &Operator::And));
assert!(matches!(
check_short_circuit(&left_value, &Operator::And),
ShortCircuitStrategy::ReturnLeft
));

// op: AND left: not all false
let left_expr = logical2physical(&logical_col("a").eq(lit(3)), &schema);
let left_expr = logical2physical(&logical_col("a").eq(expr_lit(3)), &schema);
let left_value = left_expr.evaluate(&batch).unwrap();
assert!(!check_short_circuit(&left_value, &Operator::And));
assert!(matches!(
check_short_circuit(&left_value, &Operator::And),
ShortCircuitStrategy::None
));

// op: OR left: all true
let left_expr = logical2physical(&logical_col("a").gt(lit(0)), &schema);
let left_expr = logical2physical(&logical_col("a").gt(expr_lit(0)), &schema);
let left_value = left_expr.evaluate(&batch).unwrap();
assert!(check_short_circuit(&left_value, &Operator::Or));
assert!(matches!(
check_short_circuit(&left_value, &Operator::Or),
ShortCircuitStrategy::ReturnLeft
));

// op: OR left: not all true
let left_expr = logical2physical(&logical_col("a").gt(lit(2)), &schema);
let left_expr: Arc<dyn PhysicalExpr> =
logical2physical(&logical_col("a").gt(expr_lit(2)), &schema);
let left_value = left_expr.evaluate(&batch).unwrap();
assert!(!check_short_circuit(&left_value, &Operator::Or));
assert!(matches!(
check_short_circuit(&left_value, &Operator::Or),
ShortCircuitStrategy::None
));

// Test with nullable arrays and null values
let schema_nullable = Arc::new(Schema::new(vec![
Field::new("c", DataType::Boolean, true),
Field::new("d", DataType::Boolean, true),
]));

// Create arrays with null values
let c_array = Arc::new(BooleanArray::from(vec![
Some(true),
Some(false),
None,
Some(true),
None,
])) as ArrayRef;
let d_array = Arc::new(BooleanArray::from(vec![
Some(false),
Some(true),
Some(false),
None,
Some(true),
])) as ArrayRef;

let batch_nullable = RecordBatch::try_new(
Arc::clone(&schema_nullable),
vec![Arc::clone(&c_array), Arc::clone(&d_array)],
)
.unwrap();

// Case: Mixed values with nulls - shouldn't short-circuit for AND
let mixed_nulls = logical2physical(&logical_col("c"), &schema_nullable);
let mixed_nulls_value = mixed_nulls.evaluate(&batch_nullable).unwrap();
assert!(matches!(
check_short_circuit(&mixed_nulls_value, &Operator::And),
ShortCircuitStrategy::None
));

// Case: Mixed values with nulls - shouldn't short-circuit for OR
assert!(matches!(
check_short_circuit(&mixed_nulls_value, &Operator::Or),
ShortCircuitStrategy::None
));

// Test with all nulls
let all_nulls = Arc::new(BooleanArray::from(vec![None, None, None])) as ArrayRef;
let null_batch = RecordBatch::try_new(
Arc::new(Schema::new(vec![Field::new("e", DataType::Boolean, true)])),
vec![all_nulls],
)
.unwrap();

let null_expr = logical2physical(&logical_col("e"), &null_batch.schema());
let null_value = null_expr.evaluate(&null_batch).unwrap();

// All nulls shouldn't short-circuit for AND or OR
assert!(matches!(
check_short_circuit(&null_value, &Operator::And),
ShortCircuitStrategy::None
));
assert!(matches!(
check_short_circuit(&null_value, &Operator::Or),
ShortCircuitStrategy::None
));

// Test with scalar values
// Scalar true
let scalar_true = ColumnarValue::Scalar(ScalarValue::Boolean(Some(true)));
assert!(matches!(
check_short_circuit(&scalar_true, &Operator::Or),
ShortCircuitStrategy::ReturnLeft
)); // Should short-circuit OR
assert!(matches!(
check_short_circuit(&scalar_true, &Operator::And),
ShortCircuitStrategy::ReturnRight
)); // Should return the RHS for AND

// Scalar false
let scalar_false = ColumnarValue::Scalar(ScalarValue::Boolean(Some(false)));
assert!(matches!(
check_short_circuit(&scalar_false, &Operator::And),
ShortCircuitStrategy::ReturnLeft
)); // Should short-circuit AND
assert!(matches!(
check_short_circuit(&scalar_false, &Operator::Or),
ShortCircuitStrategy::ReturnRight
)); // Should return the RHS for OR

// Scalar null
let scalar_null = ColumnarValue::Scalar(ScalarValue::Boolean(None));
assert!(matches!(
check_short_circuit(&scalar_null, &Operator::And),
ShortCircuitStrategy::None
));
assert!(matches!(
check_short_circuit(&scalar_null, &Operator::Or),
ShortCircuitStrategy::None
));
}
}