Skip to content

Commit 77ad194

Browse files
committed
[FSTORE-1633] Fix engine choice in case of connection to serverless
1 parent 2515818 commit 77ad194

File tree

1 file changed

+23
-19
lines changed

1 file changed

+23
-19
lines changed

python/hopsworks_common/connection.py

+23-19
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
import weakref
2525
from typing import Any, Optional
2626

27-
from hopsworks_common import client, usage, util, version
27+
from hopsworks_common import client, constants, usage, util, version
2828
from hopsworks_common.core import (
2929
hosts_api,
3030
project_api,
@@ -98,13 +98,12 @@ class Connection:
9898
project: The name of the project to connect to. When running on Hopsworks, this
9999
defaults to the project from where the client is run from.
100100
Defaults to `None`.
101-
engine: Which engine to use, `"spark"`, `"python"` or `"training"`. Defaults to `None`,
102-
which initializes the engine to Spark if the environment provides Spark, for
103-
example on Hopsworks and Databricks, or falls back to Python if Spark is not
104-
available, e.g. on local Python environments or AWS SageMaker. This option
105-
allows you to override this behaviour. `"training"` engine is useful when only
106-
feature store metadata is needed, for example training dataset location and label
107-
information when Hopsworks training experiment is conducted.
101+
engine: Specifies the engine to use. Possible options are "spark", "python", "training", "spark-no-metastore", or "spark-delta". The default value is None, which automatically selects the engine based on the environment:
102+
"spark": Used if Spark is available, such as in Hopsworks or Databricks environments, unless the connection is made to serverless Hopsworks from an external environment.
103+
"python": Used when Spark is not available, such as in local Python environments or AWS SageMaker, or when the connection is made to serverless Hopsworks from an external environment.
104+
"training": Used when only feature store metadata is needed, such as for obtaining training dataset locations and label information during Hopsworks training experiments.
105+
"spark-no-metastore": Functions like "spark" but does not rely on the Hive metastore.
106+
"spark-delta": Minimizes dependencies further by avoiding both Hive metastore and HopsFS.
108107
hostname_verification: Whether or not to verify Hopsworks' certificate, defaults
109108
to `True`.
110109
trust_store_path: Path on the file system containing the Hopsworks certificates,
@@ -338,30 +337,35 @@ def connect(self) -> None:
338337
self._connected = True
339338
finalizer = weakref.finalize(self, self.close)
340339
try:
340+
external = client.base.Client.REST_ENDPOINT not in os.environ
341+
serverless = self._host == constants.HOSTS.APP_HOST
341342
# determine engine, needed to init client
342-
if (self._engine is not None and self._engine.lower() == "spark") or (
343-
self._engine is None and importlib.util.find_spec("pyspark")
343+
if (
344+
self._engine is None
345+
and importlib.util.find_spec("pyspark")
346+
and (not external or not serverless)
344347
):
345348
self._engine = "spark"
346-
elif (self._engine is not None and self._engine.lower() == "python") or (
347-
self._engine is None and not importlib.util.find_spec("pyspark")
348-
):
349+
elif self._engine is None:
350+
self._engine = "python"
351+
elif self._engine.lower() == "spark":
352+
self._engine = "spark"
353+
elif self._engine.lower() == "python":
349354
self._engine = "python"
350-
elif self._engine is not None and self._engine.lower() == "training":
355+
elif self._engine.lower() == "training":
351356
self._engine = "training"
352-
elif (
353-
self._engine is not None
354-
and self._engine.lower() == "spark-no-metastore"
355-
):
357+
elif self._engine.lower() == "spark-no-metastore":
356358
self._engine = "spark-no-metastore"
359+
elif self._engine.lower() == "spark-delta":
360+
self._engine = "spark-delta"
357361
else:
358362
raise ConnectionError(
359363
"Engine you are trying to initialize is unknown. "
360364
"Supported engines are `'spark'`, `'python'` and `'training'`."
361365
)
362366

363367
# init client
364-
if client.base.Client.REST_ENDPOINT not in os.environ:
368+
if external:
365369
client.init(
366370
"external",
367371
self._host,

0 commit comments

Comments
 (0)