Skip to content

Commit 4d1c499

Browse files
committed
Remove dependency on numpy except from convert_to_default_dataframe
1 parent ff0dd4f commit 4d1c499

14 files changed

+131
-35
lines changed

python/hopsworks_common/core/constants.py

+18
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,13 @@
2020
# Avro
2121
HAS_FAST_AVRO: bool = importlib.util.find_spec("fastavro") is not None
2222
HAS_AVRO: bool = importlib.util.find_spec("avro") is not None
23+
avro_not_installed_message = (
24+
"Avro package not found. "
25+
"If you want to use avro with Hopsworks you can install the corresponding extra via "
26+
'`pip install "hopsworks[avro]"`. '
27+
"You can also install avro directly in your environment with `pip install fastavro` or `pip install avro`. "
28+
"You will need to restart your kernel if applicable."
29+
)
2330

2431
# Confluent Kafka
2532
HAS_CONFLUENT_KAFKA: bool = importlib.util.find_spec("confluent_kafka") is not None
@@ -30,6 +37,7 @@
3037
"You can also install confluent-kafka directly in your environment e.g `pip install confluent-kafka`. "
3138
"You will need to restart your kernel if applicable."
3239
)
40+
3341
# Data Validation / Great Expectations
3442
HAS_GREAT_EXPECTATIONS: bool = (
3543
importlib.util.find_spec("great_expectations") is not None
@@ -45,7 +53,17 @@
4553

4654
HAS_ARROW: bool = importlib.util.find_spec("pyarrow") is not None
4755
HAS_PANDAS: bool = importlib.util.find_spec("pandas") is not None
56+
57+
# NumPy
4858
HAS_NUMPY: bool = importlib.util.find_spec("numpy") is not None
59+
numpy_not_installed_message = (
60+
"Numpy package not found. "
61+
"If you want to use numpy with Hopsworks you can install the corresponding extra via "
62+
'`pip install "hopsworks[numpy]"`. '
63+
"You can also install numpy directly in your environment with `pip install numpy`. "
64+
"You will need to restart your kernel if applicable."
65+
)
66+
4967
HAS_POLARS: bool = importlib.util.find_spec("polars") is not None
5068

5169
# SQL packages

python/hsfs/builtin_transformations.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414
# limitations under the License.
1515
#
1616

17-
import numpy as np
17+
import math
18+
1819
import pandas as pd
1920
from hsfs.hopsworks_udf import udf
2021
from hsfs.transformation_statistics import TransformationStatistics
@@ -49,7 +50,7 @@ def label_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Serie
4950
# Unknown categories not present in training dataset are encoded as -1.
5051
return pd.Series(
5152
[
52-
value_to_index.get(data, -1) if not pd.isna(data) else np.nan
53+
value_to_index.get(data, -1) if not pd.isna(data) else math.nan
5354
for data in feature
5455
]
5556
)

python/hsfs/constructor/query.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@
2121
from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
2222

2323
import humps
24-
import numpy as np
2524
import pandas as pd
2625
from hopsworks_common.client.exceptions import FeatureStoreException
26+
from hopsworks_common.core.constants import HAS_NUMPY
2727
from hsfs import engine, storage_connector, util
2828
from hsfs import feature_group as fg_mod
2929
from hsfs.constructor import join
@@ -34,6 +34,10 @@
3434
from hsfs.feature import Feature
3535

3636

37+
if HAS_NUMPY:
38+
import numpy as np
39+
40+
3741
@typechecked
3842
class Query:
3943
ERROR_MESSAGE_FEATURE_AMBIGUOUS = (

python/hsfs/core/feature_view_engine.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,10 @@
1919
import warnings
2020
from typing import Any, Dict, List, Optional, TypeVar, Union
2121

22-
import numpy as np
2322
import pandas as pd
2423
from hopsworks_common import client
2524
from hopsworks_common.client.exceptions import FeatureStoreException
25+
from hopsworks_common.core.constants import HAS_NUMPY
2626
from hsfs import (
2727
engine,
2828
feature_group,
@@ -46,6 +46,10 @@
4646
from hsfs.training_dataset_split import TrainingDatasetSplit
4747

4848

49+
if HAS_NUMPY:
50+
import numpy as np
51+
52+
4953
class FeatureViewEngine:
5054
ENTITY_TYPE = "featureview"
5155
_TRAINING_DATA_API_PATH = "trainingdatasets"
@@ -1227,7 +1231,9 @@ def _get_feature_logging_data(
12271231
model_col_name=FeatureViewEngine._HSML_MODEL,
12281232
predictions=predictions,
12291233
training_dataset_version=training_dataset_version,
1230-
hsml_model=self.get_hsml_model_value(hsml_model) if hsml_model else None,
1234+
hsml_model=self.get_hsml_model_value(hsml_model)
1235+
if hsml_model
1236+
else None,
12311237
)
12321238
else:
12331239
return engine.get_instance().get_feature_logging_df(

python/hsfs/core/kafka_engine.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,17 @@
2020
from io import BytesIO
2121
from typing import TYPE_CHECKING, Any, Callable, Dict, Literal, Optional, Tuple, Union
2222

23-
import numpy as np
2423
import pandas as pd
2524
from hopsworks_common import client
25+
from hopsworks_common.core.constants import HAS_NUMPY
2626
from hsfs.core import storage_connector_api
2727
from hsfs.core.constants import HAS_AVRO, HAS_CONFLUENT_KAFKA, HAS_FAST_AVRO
2828
from tqdm import tqdm
2929

3030

31+
if HAS_NUMPY:
32+
import numpy as np
33+
3134
if HAS_CONFLUENT_KAFKA:
3235
from confluent_kafka import Consumer, KafkaError, Producer, TopicPartition
3336

@@ -202,7 +205,7 @@ def encode_row(complex_feature_writers, writer, row):
202205
if isinstance(row, dict):
203206
for k in row.keys():
204207
# for avro to be able to serialize them, they need to be python data types
205-
if isinstance(row[k], np.ndarray):
208+
if HAS_NUMPY and isinstance(row[k], np.ndarray):
206209
row[k] = row[k].tolist()
207210
if isinstance(row[k], pd.Timestamp):
208211
row[k] = row[k].to_pydatetime()

python/hsfs/core/vector_server.py

+18-7
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,16 @@
2525

2626
import avro.io
2727
import avro.schema
28-
import numpy as np
2928
import pandas as pd
3029
import polars as pl
3130
from hopsworks_common import client
31+
from hopsworks_common.core.constants import (
32+
HAS_AVRO,
33+
HAS_FAST_AVRO,
34+
HAS_NUMPY,
35+
avro_not_installed_message,
36+
numpy_not_installed_message,
37+
)
3238
from hsfs import (
3339
feature_view,
3440
training_dataset,
@@ -48,14 +54,15 @@
4854
)
4955

5056

51-
HAS_FASTAVRO = False
52-
try:
53-
from fastavro import schemaless_reader
57+
if HAS_NUMPY:
58+
import numpy as np
5459

55-
HAS_FASTAVRO = True
56-
except ImportError:
60+
if HAS_FAST_AVRO:
61+
from fastavro import schemaless_reader
62+
elif HAS_AVRO:
5763
from avro.io import BinaryDecoder
5864

65+
5966
_logger = logging.getLogger(__name__)
6067

6168

@@ -803,6 +810,8 @@ def handle_feature_vector_return_type(
803810
return feature_vectorz
804811
elif return_type.lower() == "numpy" and not inference_helper:
805812
_logger.debug("Returning feature vector as numpy array")
813+
if not HAS_NUMPY:
814+
raise ModuleNotFoundError(numpy_not_installed_message)
806815
return np.array(feature_vectorz)
807816
# Only inference helper can return dict
808817
elif return_type.lower() == "dict" and inference_helper:
@@ -1076,7 +1085,7 @@ def build_complex_feature_decoders(self) -> Dict[str, Callable]:
10761085
_logger.debug(
10771086
f"Building complex feature decoders corresponding to {complex_feature_schemas}."
10781087
)
1079-
if HAS_FASTAVRO:
1088+
if HAS_FAST_AVRO:
10801089
_logger.debug("Using fastavro for deserialization.")
10811090
return {
10821091
f_name: (
@@ -1100,6 +1109,8 @@ def build_complex_feature_decoders(self) -> Dict[str, Callable]:
11001109
for (f_name, schema) in complex_feature_schemas.items()
11011110
}
11021111
else:
1112+
if not HAS_AVRO:
1113+
raise ModuleNotFoundError(avro_not_installed_message)
11031114
_logger.debug("Fast Avro not found, using avro for deserialization.")
11041115
return {
11051116
f_name: (

python/hsfs/engine/python.py

+12-5
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@
5050

5151
import boto3
5252
import hsfs
53-
import numpy as np
5453
import pandas as pd
5554
import polars as pl
5655
import pyarrow as pa
@@ -84,6 +83,7 @@
8483
HAS_AIOMYSQL,
8584
HAS_ARROW,
8685
HAS_GREAT_EXPECTATIONS,
86+
HAS_NUMPY,
8787
HAS_PANDAS,
8888
HAS_SQLALCHEMY,
8989
)
@@ -98,6 +98,9 @@
9898
if HAS_GREAT_EXPECTATIONS:
9999
import great_expectations
100100

101+
if HAS_NUMPY:
102+
import numpy as np
103+
101104
if HAS_ARROW:
102105
from hsfs.core.type_systems import PYARROW_HOPSWORKS_DTYPE_MAPPING
103106
if HAS_AIOMYSQL and HAS_SQLALCHEMY:
@@ -1416,11 +1419,13 @@ def _start_offline_materialization(offline_write_options: Dict[str, Any]) -> boo
14161419
def _convert_feature_log_to_df(feature_log, cols) -> pd.DataFrame:
14171420
if feature_log is None and cols:
14181421
return pd.DataFrame(columns=cols)
1419-
if not (
1420-
isinstance(feature_log, (list, np.ndarray, pd.DataFrame, pl.DataFrame))
1422+
if not (isinstance(feature_log, (list, pd.DataFrame, pl.DataFrame))) or (
1423+
HAS_NUMPY and isinstance(feature_log, np.ndarray)
14211424
):
14221425
raise ValueError(f"Type '{type(feature_log)}' not accepted")
1423-
if isinstance(feature_log, list) or isinstance(feature_log, np.ndarray):
1426+
if isinstance(feature_log, list) or (
1427+
HAS_NUMPY and isinstance(feature_log, np.ndarray)
1428+
):
14241429
Engine._validate_logging_list(feature_log, cols)
14251430
return pd.DataFrame(feature_log, columns=cols)
14261431
else:
@@ -1431,7 +1436,9 @@ def _convert_feature_log_to_df(feature_log, cols) -> pd.DataFrame:
14311436

14321437
@staticmethod
14331438
def _validate_logging_list(feature_log, cols):
1434-
if isinstance(feature_log[0], list) or isinstance(feature_log[0], np.ndarray):
1439+
if isinstance(feature_log[0], list) or (
1440+
HAS_NUMPY and isinstance(feature_log[0], np.ndarray)
1441+
):
14351442
provided_len = len(feature_log[0])
14361443
else:
14371444
provided_len = 1

python/hsfs/engine/spark.py

+33-5
Original file line numberDiff line numberDiff line change
@@ -31,15 +31,19 @@
3131
from pyspark.rdd import RDD
3232
from pyspark.sql import DataFrame
3333

34-
import numpy as np
3534
import pandas as pd
3635
import tzlocal
36+
from hopsworks_common.core.constants import HAS_NUMPY
3737
from hsfs.constructor import query
3838

3939
# in case importing in %%local
4040
from hsfs.core.vector_db_client import VectorDbClient
4141

4242

43+
if HAS_NUMPY:
44+
import numpy as np
45+
46+
4347
try:
4448
import pyspark
4549
from pyspark import SparkFiles
@@ -258,9 +262,33 @@ def _return_dataframe_type(self, dataframe, dataframe_type):
258262

259263
def convert_to_default_dataframe(self, dataframe):
260264
if isinstance(dataframe, list):
261-
dataframe = np.array(dataframe)
262-
263-
if isinstance(dataframe, np.ndarray):
265+
#################### TODO TODO TODO TODO TODO ####################
266+
if HAS_NUMPY:
267+
dataframe = np.array(dataframe)
268+
else:
269+
try:
270+
dataframe[0][0]
271+
except TypeError:
272+
raise TypeError(
273+
"Cannot convert a list that has less than two dimensions to a dataframe."
274+
) from None
275+
ok = False
276+
try:
277+
dataframe[0][0][0]
278+
except TypeError:
279+
ok = True
280+
if not ok:
281+
raise TypeError(
282+
"Cannot convert a list that has more than two dimensions to a dataframe."
283+
) from None
284+
num_cols = len(dataframe[0])
285+
dataframe_dict = {}
286+
for n_col in list(range(num_cols)):
287+
col_name = "col_" + str(n_col)
288+
dataframe_dict[col_name] = dataframe[:, n_col]
289+
dataframe = pd.DataFrame(dataframe_dict)
290+
291+
if HAS_NUMPY and isinstance(dataframe, np.ndarray):
264292
if dataframe.ndim != 2:
265293
raise TypeError(
266294
"Cannot convert numpy array that do not have two dimensions to a dataframe. "
@@ -284,7 +312,7 @@ def convert_to_default_dataframe(self, dataframe):
284312
):
285313
# convert to utc timestamp
286314
dataframe_copy[c] = dataframe_copy[c].dt.tz_convert(None)
287-
if dataframe_copy[c].dtype == np.dtype("datetime64[ns]"):
315+
if HAS_NUMPY and dataframe_copy[c].dtype == np.dtype("datetime64[ns]"):
288316
# set the timezone to the client's timezone because that is
289317
# what spark expects.
290318
dataframe_copy[c] = dataframe_copy[c].dt.tz_localize(

python/hsfs/feature_group.py

+4-6
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
import warnings
2323
from datetime import date, datetime
2424
from typing import (
25-
TYPE_CHECKING,
2625
Any,
2726
Dict,
2827
List,
@@ -33,17 +32,13 @@
3332
Union,
3433
)
3534

36-
37-
if TYPE_CHECKING:
38-
import great_expectations
39-
4035
import avro.schema
4136
import hsfs.expectation_suite
4237
import humps
43-
import numpy as np
4438
import pandas as pd
4539
import polars as pl
4640
from hopsworks_common.client.exceptions import FeatureStoreException, RestAPIError
41+
from hopsworks_common.core.constants import HAS_NUMPY
4742
from hsfs import (
4843
engine,
4944
feature,
@@ -104,6 +99,9 @@
10499
if HAS_CONFLUENT_KAFKA:
105100
import confluent_kafka
106101

102+
if HAS_NUMPY:
103+
import numpy as np
104+
107105

108106
_logger = logging.getLogger(__name__)
109107

python/hsfs/feature_group_writer.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,16 @@
1717

1818
from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
1919

20-
import numpy as np
2120
import pandas as pd
21+
from hopsworks_common.core.constants import HAS_NUMPY
2222
from hsfs.core.job import Job
2323
from hsfs.validation_report import ValidationReport
2424

2525

26+
if HAS_NUMPY:
27+
import numpy as np
28+
29+
2630
class FeatureGroupWriter:
2731
def __init__(self, feature_group):
2832
self._feature_group = feature_group

0 commit comments

Comments
 (0)