Skip to content

Commit f1316fa

Browse files
authored
Add helper function dh_null_to_nan for explicit null conversion of array elements by users (deephaven#5310)
* Add dh_null_to_nan for explicit conversion by users * Add input check for the public API function * Add tests * Respond to review comments and fix a bug * Make the default for type_promotion False
1 parent c2017ce commit f1316fa

File tree

4 files changed

+99
-49
lines changed

4 files changed

+99
-49
lines changed

engine/table/src/main/java/io/deephaven/engine/util/PyCallableWrapperJpyImpl.java

+1
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,7 @@ public void verifyArguments(Class<?>[] argTypes) {
286286
StringBuilder argTypesStr = new StringBuilder();
287287
for (int i = 0; i < argTypes.length; i++) {
288288
Class<?> argType = argTypes[i];
289+
argType = argType == boolean.class ? Boolean.class : argType;
289290

290291
// if there are more arguments than parameters, we'll need to consider the last parameter as a varargs
291292
// parameter. This is not ideal. We should look for a better way to handle this, i.e. a way to convey that

py/server/deephaven/jcompat.py

+44-27
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,15 @@
55
""" This module provides Java compatibility support including convenience functions to create some widely used Java
66
data structures from corresponding Python ones in order to be able to call Java methods. """
77

8-
from typing import Any, Callable, Dict, Iterable, List, Sequence, Set, TypeVar, Union, Tuple, Literal, Optional
8+
from typing import Any, Callable, Dict, Iterable, List, Sequence, Set, TypeVar, Union, Optional
99

1010
import jpy
1111
import numpy as np
1212
import pandas as pd
1313

1414
from deephaven import dtypes, DHError
1515
from deephaven._wrapper import unwrap, wrap_j_object, JObjectWrapper
16-
from deephaven.dtypes import DType, _PRIMITIVE_DTYPE_NULL_MAP, _J_ARRAY_NP_TYPE_MAP
16+
from deephaven.dtypes import DType, _PRIMITIVE_DTYPE_NULL_MAP
1717

1818
_NULL_BOOLEAN_AS_BYTE = jpy.get_type("io.deephaven.util.BooleanUtils").NULL_BOOLEAN_AS_BYTE
1919
_JPrimitiveArrayConversionUtility = jpy.get_type("io.deephaven.integrations.common.PrimitiveArrayConversionUtility")
@@ -216,14 +216,8 @@ def _j_array_to_numpy_array(dtype: DType, j_array: jpy.JType, conv_null: bool, t
216216
dtype (DType): The dtype of the Java array
217217
j_array (jpy.JType): The Java array to convert
218218
conv_null (bool): If True, convert nulls to the null value for the dtype
219-
type_promotion (bool): Ignored when conv_null is False. When type_promotion is False, (1) input Java integer,
220-
boolean, or character arrays containing Deephaven nulls yield an exception, (2) input Java float or double
221-
arrays containing Deephaven nulls have null values converted to np.nan, and (3) input Java arrays without
222-
Deephaven nulls are converted to the target type. When type_promotion is True, (1) input Java integer,
223-
boolean, or character arrays containing Deephaven nulls are converted to np.float64 arrays and Deephaven
224-
null values are converted to np.nan, (2) input Java float or double arrays containing Deephaven nulls have
225-
null values converted to np.nan, and (3) input Java arrays without Deephaven nulls are converted to the
226-
target type. Defaults to False.
219+
type_promotion (bool): Ignored when conv_null is False. When conv_null is True, see the description for the same
220+
named parameter in dh_null_to_nan().
227221
228222
Returns:
229223
np.ndarray: The numpy array or None if the Java array is None
@@ -255,26 +249,49 @@ def _j_array_to_numpy_array(dtype: DType, j_array: jpy.JType, conv_null: bool, t
255249
np_array = np.array(j_array, np.object_)
256250

257251
if conv_null:
258-
if dh_null := _PRIMITIVE_DTYPE_NULL_MAP.get(dtype):
259-
if dtype in (dtypes.float32, dtypes.float64):
260-
np_array = np.copy(np_array)
261-
np_array[np_array == dh_null] = np.nan
262-
else:
263-
if dtype is dtypes.bool_: # needs to change its type to byte for dh null detection
264-
np_array = np.frombuffer(np_array, np.byte)
265-
266-
if any(np_array[np_array == dh_null]):
267-
if not type_promotion:
268-
raise DHError(f"Problem creating numpy array. Java {dtype} array contains Deephaven null values, but numpy {np_array.dtype} array does not support null values")
269-
np_array = np_array.astype(np.float64)
270-
np_array[np_array == dh_null] = np.nan
271-
else:
272-
if dtype is dtypes.bool_: # needs to change its type back to bool
273-
np_array = np.frombuffer(np_array, np.bool_)
274-
return np_array
252+
return dh_null_to_nan(np_array, type_promotion)
275253

276254
return np_array
277255

256+
def dh_null_to_nan(np_array: np.ndarray, type_promotion: bool = False) -> np.ndarray:
    """Converts Deephaven primitive null values in the given numpy array to np.nan. No conversion is performed on
    non-primitive types.

    Note, the input numpy array is never modified. For input arrays of a float or double type, a copy of the same
    dtype is returned with the Deephaven nulls replaced by np.nan. For integer, boolean, or character input arrays, a
    new np.float64 array is returned when type_promotion is True. For arrays of any other type, the input array
    itself is returned unchanged.

    Args:
        np_array (np.ndarray): The numpy array to convert
        type_promotion (bool): When False, integer, boolean, or character arrays will cause an exception to be raised.
            When True, integer, boolean, or character arrays are converted to new np.float64 arrays and Deephaven null
            values in them are converted to np.nan. Numpy arrays of float or double types are not affected by this flag
            and Deephaven nulls will always be converted to np.nan in the returned copy. Defaults to False.

    Returns:
        np.ndarray: The numpy array with Deephaven nulls converted to np.nan.

    Raises:
        DHError: if np_array is not a numpy array, or if it is an integer, boolean, or character array and
            type_promotion is False
    """
    if not isinstance(np_array, np.ndarray):
        raise DHError(message="The given np_array argument is not a numpy array.")

    dtype = dtypes.from_np_dtype(np_array.dtype)
    if dh_null := _PRIMITIVE_DTYPE_NULL_MAP.get(dtype):
        if dtype in (dtypes.float32, dtypes.float64):
            # work on a copy so that the caller's array is left untouched
            np_array = np.copy(np_array)
            np_array[np_array == dh_null] = np.nan
        else:
            if not type_promotion:
                raise DHError(message=f"failed to convert DH nulls to np.nan in the numpy array. The array is "
                                      f"of {np_array.dtype.type} type but type_promotion is False")
            if dtype is dtypes.bool_:  # needs to change its type to byte for dh null detection
                # a same-itemsize view keeps the shape and also works on non-contiguous arrays, unlike
                # np.frombuffer which flattens the data and rejects non-contiguous buffers
                np_array = np_array.view(np.byte)

            # astype always allocates a new array, so the NaN assignment below cannot touch the input
            np_array = np_array.astype(np.float64)
            np_array[np_array == dh_null] = np.nan

    return np_array
278295

279296
def _j_array_to_series(dtype: DType, j_array: jpy.JType, conv_null: bool) -> pd.Series:
280297
"""Produce a copy of the specified Java array as a pandas.Series object.

py/server/tests/test_udf_array_args.py

+52-20
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import numpy as np
99

1010
from deephaven import empty_table, DHError, dtypes
11+
from deephaven.jcompat import dh_null_to_nan
1112
from tests.testbase import BaseTestCase
1213
from .test_udf_scalar_args import _J_TYPE_NP_DTYPE_MAP, _J_TYPE_NULL_MAP, _J_TYPE_J_ARRAY_TYPE_MAP
1314

@@ -100,21 +101,8 @@ def test_udf(x, y: np.ndarray[{_J_TYPE_NP_DTYPE_MAP[j_dtype]}]) -> bool:
100101
"""
101102
exec(func_str, globals())
102103

103-
# for floating point types, DH nulls are auto converted to np.nan
104-
# for integer types, DH nulls in the array raise exceptions
105-
if j_dtype in ("float", "double"):
106-
res = tbl.update("Z = test_udf(X, Y)")
107-
self.assertEqual(10, res.to_string().count("true"))
108-
else:
109-
res = tbl.update("Z = test_udf(X, Y)")
110-
self.assertEqual(10, res.to_string().count("true"))
111-
112-
# TODO need to wait for https://github.com/deephaven/deephaven-core/issues/5213 to be resolved
113-
# with self.assertRaises(DHError) as cm:
114-
# tbl.update("Z = test_udf(X, Y)")
115-
# self.assertRegex(str(cm.exception), "Java .* array contains Deephaven null values,
116-
# but numpy .* "
117-
# "array does not support ")
104+
res = tbl.update("Z = test_udf(X, Y)")
105+
self.assertEqual(10, res.to_string().count("true"))
118106

119107
def test_np_object_array(self):
120108
with self.subTest("PyObject"):
@@ -189,11 +177,6 @@ def test_udf(p1: np.ndarray[np.bool_], p2=None) -> bool:
189177
t = empty_table(10).update(["X = i % 3", "Y = i % 2 == 0? true : null"]).group_by("X")
190178
t1 = t.update(["X1 = test_udf(Y)"])
191179
self.assertEqual(t1.columns[2].data_type, dtypes.bool_)
192-
# TODO need to wait for https://github.com/deephaven/deephaven-core/issues/5213 to be resolved
193-
# with self.assertRaises(DHError) as cm:
194-
# t1 = t.update(["X1 = test_udf(Y)"])
195-
# self.assertRegex(str(cm.exception), "Java .* array contains Deephaven null values, but numpy .* "
196-
# "array does not support ")
197180
t = empty_table(10).update(["X = i % 3", "Y = i % 2 == 0? true : false"]).group_by("X")
198181
t1 = t.update(["X1 = test_udf(Y)"])
199182
self.assertEqual(t1.columns[2].data_type, dtypes.bool_)
@@ -237,6 +220,55 @@ def test_udf(x, y: Union[{th}, np.ndarray[np.int64]]) -> bool:
237220
["Z = test_udf(X, Y.toArray())"])
238221
self.assertEqual(t.columns[2].data_type, dtypes.bool_)
239222

223+
def test_dh_null_conversion(self):
    """Verifies dh_null_to_nan() when called from UDFs that receive grouped (array) columns.

    For every Java primitive type in _J_TYPE_NULL_MAP, a table with 10 groups is built in which every group
    contains Deephaven nulls, and the following is checked:
      * with type_promotion=True the input array is left untouched and the result contains np.nan,
      * with type_promotion=False float/double arrays are still converted while all other types raise DHError.
    A separate sub-test covers boolean arrays.
    """
    x_formula = "X = i % 10"
    for j_dtype, null_name in _J_TYPE_NULL_MAP.items():
        y_formula = f"Y = i % 3 == 0? {null_name} : ({j_dtype})i"
        with self.subTest(j_dtype):
            tbl = empty_table(100).update([x_formula, y_formula]).group_by("X")

            # check_y: the input array keeps its dtype and contains no NaN (i.e. it was not modified)
            # check_z: the result contains NaN and is float64 unless the input was already float/double
            func_str = f"""
def test_udf(x, y: np.ndarray[{_J_TYPE_NP_DTYPE_MAP[j_dtype]}]) -> bool:
    z = dh_null_to_nan(y, type_promotion=True)
    check_y = (isinstance(x, int) and isinstance(y, np.ndarray) and y.dtype.type ==
    {_J_TYPE_NP_DTYPE_MAP[j_dtype]} and np.nanmean(y) == np.mean( y))
    check_z = np.any(np.isnan(z)) and (z.dtype.type == np.float64 if y.dtype.type not in {{np.float32, np.float64}}
    else z.dtype == y.dtype)
    return check_y and check_z
"""
            exec(func_str, globals())

            res = tbl.update("Z = test_udf(X, Y)")
            self.assertEqual(10, res.to_string().count("true"))

            func_str = f"""
def test_udf(x, y: np.ndarray[{_J_TYPE_NP_DTYPE_MAP[j_dtype]}]) -> bool:
    z = dh_null_to_nan(y, type_promotion=False)
    return True
"""
            exec(func_str, globals())
            if j_dtype not in {"float", "double"}:
                # without type promotion, non-floating point arrays cannot represent np.nan
                with self.assertRaises(DHError) as cm:
                    tbl.update("Z = test_udf(X, Y)")
                self.assertRegex(str(cm.exception), "failed to convert DH nulls to np.nan .* type_promotion is False")
            else:
                res = tbl.update("Z = test_udf(X, Y)")
                self.assertEqual(10, res.to_string().count("true"))

    with self.subTest("boolean"):
        def test_udf(p1: np.ndarray[np.bool_], p2=None, tp: bool = True) -> bool:
            z = dh_null_to_nan(p1, type_promotion=tp)
            return z.dtype.type == np.float64 and np.any(np.isnan(z))

        t = empty_table(100).update(["X = i % 10", "Y = i % 3 == 0? true : null"]).group_by("X")
        res = t.update(["X1 = test_udf(Y)"])
        # Assert on the table produced by THIS sub-test (previously a stale 'res' from the loop above was
        # checked). Only the X1 column is rendered because Y itself holds boolean values whose string form
        # could otherwise add to the "true" count.
        self.assertEqual(10, res.view("X1").to_string().count("true"))

        with self.assertRaises(DHError) as cm:
            t.update(["X1 = test_udf(Y, null, false)"])
        self.assertRegex(str(cm.exception), "failed to convert DH nulls to np.nan .* type_promotion is False")
271+
240272

241273
if __name__ == "__main__":
242274
unittest.main()

py/server/tests/test_udf_scalar_args.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -408,7 +408,7 @@ def test_udf(p1: int, p2: float, kw1: str) -> bool:
408408

409409
with self.assertRaises(DHError) as cm:
410410
t = empty_table(1).update("X = `1`").update("Y = test_udf(1, 1.0, X = `1`)")
411-
self.assertRegex(str(cm.exception), "test_udf: Expected argument .* got boolean")
411+
self.assertRegex(str(cm.exception), "test_udf: Expected argument .* got class java.lang.Boolean")
412412

413413
with self.subTest("with keyword only params"):
414414
def test_udf(p1: int, p2: float, *, kw1: str) -> bool:
@@ -538,7 +538,7 @@ def f6(*args: np.int32, col2: np.ndarray[np.int32]) -> bool:
538538

539539
with self.assertRaises(DHError) as cm:
540540
t1 = t.update(["X1 = f6(X, Y=null)"])
541-
self.assertRegex(str(cm.exception), "f6: Expected argument \(col2\) to be either .* got boolean")
541+
self.assertRegex(str(cm.exception), "f6: Expected argument \(col2\) to be either .* got class java.lang.Boolean")
542542

543543
with self.subTest("f7"):
544544
def f1(x: int) -> Optional[float]:

0 commit comments

Comments
 (0)