Remove dependency on numpy except from convert_to_default_dataframe

aversey · aversey · commit 4d1c49985a17 · 2024-09-23T09:48:14.000+02:00
diff --git a/python/hopsworks_common/core/constants.py b/python/hopsworks_common/core/constants.py
@@ -20,6 +20,13 @@
 # Avro
 HAS_FAST_AVRO: bool = importlib.util.find_spec("fastavro") is not None
 HAS_AVRO: bool = importlib.util.find_spec("avro") is not None
+avro_not_installed_message = (
+    "Avro package not found. "
+    "If you want to use avro with Hopsworks you can install the corresponding extra via "
+    '`pip install "hopsworks[avro]"`. '
+    "You can also install avro directly in your environment with `pip install fastavro` or `pip install avro`. "
+    "You will need to restart your kernel if applicable."
+)
 
 # Confluent Kafka
 HAS_CONFLUENT_KAFKA: bool = importlib.util.find_spec("confluent_kafka") is not None
@@ -30,6 +37,7 @@
     "You can also install confluent-kafka directly in your environment e.g `pip install confluent-kafka`. "
     "You will need to restart your kernel if applicable."
 )
+
 # Data Validation / Great Expectations
 HAS_GREAT_EXPECTATIONS: bool = (
     importlib.util.find_spec("great_expectations") is not None
@@ -45,7 +53,17 @@
 
 HAS_ARROW: bool = importlib.util.find_spec("pyarrow") is not None
 HAS_PANDAS: bool = importlib.util.find_spec("pandas") is not None
+
+# NumPy
 HAS_NUMPY: bool = importlib.util.find_spec("numpy") is not None
+numpy_not_installed_message = (
+    "Numpy package not found. "
+    "If you want to use numpy with Hopsworks you can install the corresponding extra via "
+    '`pip install "hopsworks[numpy]"`. '
+    "You can also install numpy directly in your environment with `pip install numpy`. "
+    "You will need to restart your kernel if applicable."
+)
+
 HAS_POLARS: bool = importlib.util.find_spec("polars") is not None
 
 # SQL packages
diff --git a/python/hsfs/builtin_transformations.py b/python/hsfs/builtin_transformations.py
@@ -14,7 +14,8 @@
 #   limitations under the License.
 #
 
-import numpy as np
+import math
+
 import pandas as pd
 from hsfs.hopsworks_udf import udf
 from hsfs.transformation_statistics import TransformationStatistics
@@ -49,7 +50,7 @@ def label_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Serie
     # Unknown categories not present in training dataset are encoded as -1.
     return pd.Series(
         [
-            value_to_index.get(data, -1) if not pd.isna(data) else np.nan
+            value_to_index.get(data, -1) if not pd.isna(data) else math.nan
             for data in feature
         ]
     )
diff --git a/python/hsfs/constructor/query.py b/python/hsfs/constructor/query.py
@@ -21,9 +21,9 @@
 from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
 
 import humps
-import numpy as np
 import pandas as pd
 from hopsworks_common.client.exceptions import FeatureStoreException
+from hopsworks_common.core.constants import HAS_NUMPY
 from hsfs import engine, storage_connector, util
 from hsfs import feature_group as fg_mod
 from hsfs.constructor import join
@@ -34,6 +34,10 @@
 from hsfs.feature import Feature
 
 
+if HAS_NUMPY:
+    import numpy as np
+
+
 @typechecked
 class Query:
     ERROR_MESSAGE_FEATURE_AMBIGUOUS = (
diff --git a/python/hsfs/core/feature_view_engine.py b/python/hsfs/core/feature_view_engine.py
@@ -19,10 +19,10 @@
 import warnings
 from typing import Any, Dict, List, Optional, TypeVar, Union
 
-import numpy as np
 import pandas as pd
 from hopsworks_common import client
 from hopsworks_common.client.exceptions import FeatureStoreException
+from hopsworks_common.core.constants import HAS_NUMPY
 from hsfs import (
     engine,
     feature_group,
@@ -46,6 +46,10 @@
 from hsfs.training_dataset_split import TrainingDatasetSplit
 
 
+if HAS_NUMPY:
+    import numpy as np
+
+
 class FeatureViewEngine:
     ENTITY_TYPE = "featureview"
     _TRAINING_DATA_API_PATH = "trainingdatasets"
@@ -1227,7 +1231,9 @@ def _get_feature_logging_data(
                 model_col_name=FeatureViewEngine._HSML_MODEL,
                 predictions=predictions,
                 training_dataset_version=training_dataset_version,
-                hsml_model=self.get_hsml_model_value(hsml_model) if hsml_model else None,
+                hsml_model=self.get_hsml_model_value(hsml_model)
+                if hsml_model
+                else None,
             )
         else:
             return engine.get_instance().get_feature_logging_df(
diff --git a/python/hsfs/core/kafka_engine.py b/python/hsfs/core/kafka_engine.py
@@ -20,14 +20,17 @@
 from io import BytesIO
 from typing import TYPE_CHECKING, Any, Callable, Dict, Literal, Optional, Tuple, Union
 
-import numpy as np
 import pandas as pd
 from hopsworks_common import client
+from hopsworks_common.core.constants import HAS_NUMPY
 from hsfs.core import storage_connector_api
 from hsfs.core.constants import HAS_AVRO, HAS_CONFLUENT_KAFKA, HAS_FAST_AVRO
 from tqdm import tqdm
 
 
+if HAS_NUMPY:
+    import numpy as np
+
 if HAS_CONFLUENT_KAFKA:
     from confluent_kafka import Consumer, KafkaError, Producer, TopicPartition
 
@@ -202,7 +205,7 @@ def encode_row(complex_feature_writers, writer, row):
     if isinstance(row, dict):
         for k in row.keys():
             # for avro to be able to serialize them, they need to be python data types
-            if isinstance(row[k], np.ndarray):
+            if HAS_NUMPY and isinstance(row[k], np.ndarray):
                 row[k] = row[k].tolist()
             if isinstance(row[k], pd.Timestamp):
                 row[k] = row[k].to_pydatetime()
diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py
@@ -25,10 +25,16 @@
 
 import avro.io
 import avro.schema
-import numpy as np
 import pandas as pd
 import polars as pl
 from hopsworks_common import client
+from hopsworks_common.core.constants import (
+    HAS_AVRO,
+    HAS_FAST_AVRO,
+    HAS_NUMPY,
+    avro_not_installed_message,
+    numpy_not_installed_message,
+)
 from hsfs import (
     feature_view,
     training_dataset,
@@ -48,14 +54,15 @@
 )
 
 
-HAS_FASTAVRO = False
-try:
-    from fastavro import schemaless_reader
+if HAS_NUMPY:
+    import numpy as np
 
-    HAS_FASTAVRO = True
-except ImportError:
+if HAS_FAST_AVRO:
+    from fastavro import schemaless_reader
+elif HAS_AVRO:
     from avro.io import BinaryDecoder
 
+
 _logger = logging.getLogger(__name__)
 
 
@@ -803,6 +810,8 @@ def handle_feature_vector_return_type(
             return feature_vectorz
         elif return_type.lower() == "numpy" and not inference_helper:
             _logger.debug("Returning feature vector as numpy array")
+            if not HAS_NUMPY:
+                raise ModuleNotFoundError(numpy_not_installed_message)
             return np.array(feature_vectorz)
         # Only inference helper can return dict
         elif return_type.lower() == "dict" and inference_helper:
@@ -1076,7 +1085,7 @@ def build_complex_feature_decoders(self) -> Dict[str, Callable]:
             _logger.debug(
                 f"Building complex feature decoders corresponding to {complex_feature_schemas}."
             )
-        if HAS_FASTAVRO:
+        if HAS_FAST_AVRO:
             _logger.debug("Using fastavro for deserialization.")
             return {
                 f_name: (
@@ -1100,6 +1109,8 @@ def build_complex_feature_decoders(self) -> Dict[str, Callable]:
                 for (f_name, schema) in complex_feature_schemas.items()
             }
         else:
+            if not HAS_AVRO:
+                raise ModuleNotFoundError(avro_not_installed_message)
             _logger.debug("Fast Avro not found, using avro for deserialization.")
             return {
                 f_name: (
diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py
@@ -50,7 +50,6 @@
 
 import boto3
 import hsfs
-import numpy as np
 import pandas as pd
 import polars as pl
 import pyarrow as pa
@@ -84,6 +83,7 @@
     HAS_AIOMYSQL,
     HAS_ARROW,
     HAS_GREAT_EXPECTATIONS,
+    HAS_NUMPY,
     HAS_PANDAS,
     HAS_SQLALCHEMY,
 )
@@ -98,6 +98,9 @@
 if HAS_GREAT_EXPECTATIONS:
     import great_expectations
 
+if HAS_NUMPY:
+    import numpy as np
+
 if HAS_ARROW:
     from hsfs.core.type_systems import PYARROW_HOPSWORKS_DTYPE_MAPPING
 if HAS_AIOMYSQL and HAS_SQLALCHEMY:
@@ -1416,11 +1419,13 @@ def _start_offline_materialization(offline_write_options: Dict[str, Any]) -> boo
     def _convert_feature_log_to_df(feature_log, cols) -> pd.DataFrame:
         if feature_log is None and cols:
             return pd.DataFrame(columns=cols)
-        if not (
-            isinstance(feature_log, (list, np.ndarray, pd.DataFrame, pl.DataFrame))
+        if not (isinstance(feature_log, (list, pd.DataFrame, pl.DataFrame))) or (
+            HAS_NUMPY and isinstance(feature_log, np.ndarray)
         ):
             raise ValueError(f"Type '{type(feature_log)}' not accepted")
-        if isinstance(feature_log, list) or isinstance(feature_log, np.ndarray):
+        if isinstance(feature_log, list) or (
+            HAS_NUMPY and isinstance(feature_log, np.ndarray)
+        ):
             Engine._validate_logging_list(feature_log, cols)
             return pd.DataFrame(feature_log, columns=cols)
         else:
@@ -1431,7 +1436,9 @@ def _convert_feature_log_to_df(feature_log, cols) -> pd.DataFrame:
 
     @staticmethod
     def _validate_logging_list(feature_log, cols):
-        if isinstance(feature_log[0], list) or isinstance(feature_log[0], np.ndarray):
+        if isinstance(feature_log[0], list) or (
+            HAS_NUMPY and isinstance(feature_log[0], np.ndarray)
+        ):
             provided_len = len(feature_log[0])
         else:
             provided_len = 1
diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py
@@ -31,15 +31,19 @@
     from pyspark.rdd import RDD
     from pyspark.sql import DataFrame
 
-import numpy as np
 import pandas as pd
 import tzlocal
+from hopsworks_common.core.constants import HAS_NUMPY
 from hsfs.constructor import query
 
 # in case importing in %%local
 from hsfs.core.vector_db_client import VectorDbClient
 
 
+if HAS_NUMPY:
+    import numpy as np
+
+
 try:
     import pyspark
     from pyspark import SparkFiles
@@ -258,9 +262,33 @@ def _return_dataframe_type(self, dataframe, dataframe_type):
 
     def convert_to_default_dataframe(self, dataframe):
         if isinstance(dataframe, list):
-            dataframe = np.array(dataframe)
-
-        if isinstance(dataframe, np.ndarray):
+            # TODO: finish the numpy-free fallback below; it still uses numpy-style `dataframe[:, n_col]` indexing, which plain Python lists do not support.
+            if HAS_NUMPY:
+                dataframe = np.array(dataframe)
+            else:
+                try:
+                    dataframe[0][0]
+                except TypeError:
+                    raise TypeError(
+                        "Cannot convert a list that has less than two dimensions to a dataframe."
+                    ) from None
+                ok = False
+                try:
+                    dataframe[0][0][0]
+                except TypeError:
+                    ok = True
+                if not ok:
+                    raise TypeError(
+                        "Cannot convert a list that has more than two dimensions to a dataframe."
+                    ) from None
+                num_cols = len(dataframe[0])
+                dataframe_dict = {}
+                for n_col in list(range(num_cols)):
+                    col_name = "col_" + str(n_col)
+                    dataframe_dict[col_name] = dataframe[:, n_col]
+                dataframe = pd.DataFrame(dataframe_dict)
+
+        if HAS_NUMPY and isinstance(dataframe, np.ndarray):
             if dataframe.ndim != 2:
                 raise TypeError(
                     "Cannot convert numpy array that do not have two dimensions to a dataframe. "
@@ -284,7 +312,7 @@ def convert_to_default_dataframe(self, dataframe):
                 ):
                     # convert to utc timestamp
                     dataframe_copy[c] = dataframe_copy[c].dt.tz_convert(None)
-                if dataframe_copy[c].dtype == np.dtype("datetime64[ns]"):
+                if HAS_NUMPY and dataframe_copy[c].dtype == np.dtype("datetime64[ns]"):
                     # set the timezone to the client's timezone because that is
                     # what spark expects.
                     dataframe_copy[c] = dataframe_copy[c].dt.tz_localize(
diff --git a/python/hsfs/feature_group.py b/python/hsfs/feature_group.py
@@ -22,7 +22,6 @@
 import warnings
 from datetime import date, datetime
 from typing import (
-    TYPE_CHECKING,
     Any,
     Dict,
     List,
@@ -33,17 +32,13 @@
     Union,
 )
 
-
-if TYPE_CHECKING:
-    import great_expectations
-
 import avro.schema
 import hsfs.expectation_suite
 import humps
-import numpy as np
 import pandas as pd
 import polars as pl
 from hopsworks_common.client.exceptions import FeatureStoreException, RestAPIError
+from hopsworks_common.core.constants import HAS_NUMPY
 from hsfs import (
     engine,
     feature,
@@ -104,6 +99,9 @@
 if HAS_CONFLUENT_KAFKA:
     import confluent_kafka
 
+if HAS_NUMPY:
+    import numpy as np
+
 
 _logger = logging.getLogger(__name__)
 
diff --git a/python/hsfs/feature_group_writer.py b/python/hsfs/feature_group_writer.py
@@ -17,12 +17,16 @@
 
 from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
 
-import numpy as np
 import pandas as pd
+from hopsworks_common.core.constants import HAS_NUMPY
 from hsfs.core.job import Job
 from hsfs.validation_report import ValidationReport
 
 
+if HAS_NUMPY:
+    import numpy as np
+
+
 class FeatureGroupWriter:
     def __init__(self, feature_group):
         self._feature_group = feature_group
diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py
diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py
diff --git a/python/hsfs/storage_connector.py b/python/hsfs/storage_connector.py
diff --git a/python/hsfs/training_dataset.py b/python/hsfs/training_dataset.py

Original file line number	Diff line number	Diff line change
`@@ -14,7 +14,8 @@`
`14`	`14`	`# limitations under the License.`
`15`	`15`	`#`
`16`	`16`
`17`		`-import numpy as np`
	`17`	`+import math`
	`18`	`+`
`18`	`19`	`import pandas as pd`
`19`	`20`	`from hsfs.hopsworks_udf import udf`
`20`	`21`	`from hsfs.transformation_statistics import TransformationStatistics`
`@@ -49,7 +50,7 @@ def label_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Serie`
`49`	`50`	`# Unknown categories not present in training dataset are encoded as -1.`
`50`	`51`	`return pd.Series(`
`51`	`52`	`[`
`52`		`- value_to_index.get(data, -1) if not pd.isna(data) else np.nan`
	`53`	`+ value_to_index.get(data, -1) if not pd.isna(data) else math.nan`
`53`	`54`	`for data in feature`
`54`	`55`	`]`
`55`	`56`	`)`