
Commit 7280c44

Address Manu's review
1 parent bfbe7aa commit 7280c44

File tree: 3 files changed, +21 -12 lines changed

python/hsfs/core/vector_server.py (+6 -5)

@@ -23,8 +23,6 @@
 from io import BytesIO
 from typing import Any, Callable, Dict, List, Literal, Optional, Set, Tuple, Union

-import avro.io
-import avro.schema
 import pandas as pd
 from hopsworks_common import client
 from hopsworks_common.core.constants import (
@@ -60,7 +58,9 @@

 if HAS_FAST_AVRO:
     from fastavro import schemaless_reader
-elif HAS_AVRO:
+if HAS_AVRO:
+    import avro.io
+    import avro.schema
     from avro.io import BinaryDecoder

 if HAS_POLARS:
@@ -1072,6 +1072,9 @@ def build_complex_feature_decoders(self) -> Dict[str, Callable]:
         - deserialization of complex features from the online feature store
         - conversion of string or int timestamps to datetime objects
         """
+        if not HAS_AVRO:
+            raise ModuleNotFoundError(avro_not_installed_message)
+
         complex_feature_schemas = {
             f.name: avro.io.DatumReader(
                 avro.schema.parse(
@@ -1114,8 +1117,6 @@ def build_complex_feature_decoders(self) -> Dict[str, Callable]:
                 for (f_name, schema) in complex_feature_schemas.items()
             }
         else:
-            if not HAS_AVRO:
-                raise ModuleNotFoundError(avro_not_installed_message)
             _logger.debug("Fast Avro not found, using avro for deserialization.")
             return {
                 f_name: (
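
The net effect in vector_server.py is the usual optional-dependency pattern: import avro only when it is available, and raise at the point of use rather than at module import time. Hoisting the check above the complex_feature_schemas comprehension matters because that comprehension touches avro.io and avro.schema regardless of whether fastavro is used, so guarding only the else branch no longer suffices once the top-level imports are conditional. A minimal, self-contained sketch of the pattern, assuming only the standard library (HAS_AVRO and avro_not_installed_message mirror the names in the diff; the decoder function is illustrative, not the file's actual code):

import importlib.util

HAS_AVRO = importlib.util.find_spec("avro") is not None
avro_not_installed_message = (
    "avro is not installed; install it to decode complex features"
)

if HAS_AVRO:
    import avro.io
    import avro.schema


def build_decoder(schema_json: str):
    # Guard at the call site: a clear ModuleNotFoundError beats the
    # NameError an unguarded reference to the avro module would raise.
    if not HAS_AVRO:
        raise ModuleNotFoundError(avro_not_installed_message)
    return avro.io.DatumReader(avro.schema.parse(schema_json))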

python/hsfs/engine/python.py (-3)

@@ -101,9 +101,6 @@
 if HAS_NUMPY:
     import numpy as np

-if HAS_PYARROW:
-    from hsfs.core.type_systems import PYARROW_HOPSWORKS_DTYPE_MAPPING
-
 if HAS_AIOMYSQL and HAS_SQLALCHEMY:
     from hsfs.core import util_sql
python/hsfs/engine/spark.py (+15 -4)

@@ -317,6 +317,14 @@ def convert_to_default_dataframe(self, dataframe):
             )
         )

+    @staticmethod
+    def utc_disguised_as_local(dt):
+        local_tz = tzlocal.get_localzone()
+        utc = timezone.utc
+        if not dt.tzinfo:
+            dt = dt.replace(tzinfo=utc)
+        return dt.astimezone(utc).replace(tzinfo=local_tz)
+
     def convert_list_to_spark_dataframe(self, dataframe):
         if HAS_NUMPY:
             return self.convert_numpy_to_spark_dataframe(np.array(dataframe))
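
To make the new helper's behavior concrete, here is a self-contained sketch assuming tzlocal is installed (the example zone is illustrative): it keeps the UTC wall-clock digits of a timestamp, treating naive values as UTC, and relabels them with the client's local zone, which per the comment further down is the form Spark expects.

from datetime import datetime, timezone

import tzlocal


def utc_disguised_as_local(dt: datetime) -> datetime:
    # Keep the UTC wall-clock digits, but tag them with the local zone.
    local_tz = tzlocal.get_localzone()
    utc = timezone.utc
    if not dt.tzinfo:
        dt = dt.replace(tzinfo=utc)  # naive inputs are assumed to be UTC
    return dt.astimezone(utc).replace(tzinfo=local_tz)


# On a client in, say, Europe/Stockholm (UTC+1 in winter):
utc_disguised_as_local(datetime(2024, 1, 1, 12, 0))
# -> 2024-01-01 12:00:00+01:00  (same digits, now labelled as local time)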
@@ -342,7 +350,11 @@ def convert_list_to_spark_dataframe(self, dataframe):
                 c = "col_" + str(n_col)
                 dataframe_dict[c] = [dataframe[i][n_col] for i in range(len(dataframe))]
             return self.convert_pandas_to_spark_dataframe(pd.DataFrame(dataframe_dict))
-        # We have neither numpy nor pandas, so there is no need to transform timestamps
+        for i in range(len(dataframe)):
+            dataframe[i] = [
+                self.utc_disguised_as_local(d) if isinstance(d, datetime) else d
+                for d in dataframe[i]
+            ]
         return self._spark_session.createDataFrame(
             dataframe, ["col_" + str(n) for n in range(num_cols)]
         )
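
This hunk also corrects the removed comment's assumption: even without numpy or pandas, datetime values in a plain list of lists still need the same UTC-as-local normalization before being handed to createDataFrame, so the list path now routes them through the shared helper.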
@@ -361,13 +373,12 @@ def convert_numpy_to_spark_dataframe(self, dataframe):
                 dataframe_dict[c] = dataframe[:, n_col]
             return self.convert_pandas_to_spark_dataframe(pd.DataFrame(dataframe_dict))
         # convert timestamps to current timezone
-        local_tz = tzlocal.get_localzone()
         for n_col in range(num_cols):
             if dataframe[:, n_col].dtype == np.dtype("datetime64[ns]"):
                 # set the timezone to the client's timezone because that is
                 # what spark expects.
-                dataframe[:, n_col] = dataframe[:, n_col].map(
-                    lambda d: local_tz.fromutc(d.item().astimezone(local_tz))
+                dataframe[:, n_col] = np.array(
+                    [self.utc_disguised_as_local(d.item()) for d in dataframe[:, n_col]]
                 )
         return self._spark_session.createDataFrame(
             dataframe.tolist(), ["col_" + str(n) for n in range(num_cols)]
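
The last hunk looks like a latent bug fix as well as a refactor: dataframe[:, n_col] is a plain numpy ndarray, and ndarrays have no .map method (that is a pandas Series method), so the removed lambda would have raised AttributeError if this branch ever ran. A small sketch of the replacement shape, reusing the helper above; microsecond precision is chosen here so that .item() returns datetime objects (values are illustrative):

import numpy as np
from datetime import datetime

col = np.array([datetime(2024, 1, 1, 12, 0)], dtype="datetime64[us]")
# col.map(...)  # AttributeError: 'numpy.ndarray' object has no attribute 'map'
converted = np.array(
    [utc_disguised_as_local(d.item()) for d in col]  # .item() -> datetime
)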
