Skip to content

Commit d692024

Browse files
bubriksAlexandru OrmenisanAlexandru Ormenisan
authored
[HWORKS-936] Explicit provenance - model (logicalclocks#220)
* [HWORKS-936] Explicit model provenance * model provenance * fix * fix __parse_training_datasets * add temp print * td use from_response_json_single --------- Co-authored-by: Alexandru Ormenisan <alex@Alexandrus-MBP.localdomain> Co-authored-by: Alexandru Ormenisan <alex@Alexandrus-MacBook-Pro.local>
1 parent 34de8ac commit d692024

File tree

15 files changed

+578
-0
lines changed

15 files changed

+578
-0
lines changed

Diff for: auto_doc.py

+8
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,14 @@
8484
"model_schema": ["hsml.model_schema.ModelSchema"],
8585
"model_schema_dict": ["hsml.model_schema.ModelSchema.to_dict"],
8686
},
87+
"model-registry/links.md": {
88+
"links_properties": keras_autodoc.get_properties(
89+
"hsml.core.explicit_provenance.Links"
90+
),
91+
"artifact_properties": keras_autodoc.get_properties(
92+
"hsml.core.explicit_provenance.Artifact"
93+
),
94+
},
8795
# Model Serving
8896
"model-serving/model_serving_api.md": {
8997
"ms_get": ["hsml.connection.Connection.get_model_serving"],

Diff for: docs/templates/model-registry/links.md

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Provenance Links
2+
3+
Provenance Links are objects returned by methods such as [get_feature_view_provenance](../model_api/#get_feature_view_provenance) and [get_training_dataset_provenance](../model_api/#get_training_dataset_provenance). These methods use the provenance graph to return the parent feature view/training dataset of a model. They return the actual instances of the feature view/training dataset if available. If the instance was deleted, or it belongs to a feature store that the current project no longer has access to, an Artifact object is returned instead.
4+
5+
There is an additional method that uses the provenance graph: [get_feature_view](../model_api/#get_feature_view). This method wraps `get_feature_view_provenance` and either returns a usable Feature View object or throws an exception if the returned object is an Artifact. Thus, an exception is thrown if the feature view was deleted or the feature store it belongs to was unshared.
6+
## Properties
7+
8+
{{links_properties}}
9+
10+
# Artifact
11+
12+
Artifact objects are part of the provenance graph and contain a minimal set of information regarding the entities (feature views, training datasets) they represent.
13+
The provenance graph contains Artifact objects when the underlying entities have been deleted, are corrupted, or are no longer accessible to the current project.
14+
15+
{{artifact_properties}}

Diff for: python/hsml/core/explicit_provenance.py

+328
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,328 @@
1+
#
2+
# Copyright 2024 Hopsworks AB
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
17+
import json
18+
from enum import Enum
19+
from typing import Set
20+
import humps
21+
22+
23+
class Artifact:
    """Minimal description of an entity referenced by the provenance graph.

    An `Artifact` carries only identifying information (registry id, name,
    version, type) together with a `MetaType` describing why a full object
    is not provided, and optionally an href or the cause of a failure.
    """

    class MetaType(Enum):
        """Reason the artifact is represented by this placeholder object."""

        DELETED = 1
        INACCESSIBLE = 2
        FAULTY = 3
        NOT_SUPPORTED = 4

    def __init__(
        self,
        model_registry_id,
        name,
        version,
        type,
        meta_type,
        href=None,
        exception_cause=None,
        **kwargs,
    ):
        """Create an artifact description.

        # Arguments
            model_registry_id: id of the model registry the artifact belongs to.
            name: name of the artifact.
            version: version of the artifact.
            type: artifact type as reported by the backend.
            meta_type: `Artifact.MetaType` value for this artifact.
            href: optional link to the artifact.
            exception_cause: optional cause reported for a faulty artifact.
            **kwargs: accepted and ignored.
        """
        self._model_registry_id = model_registry_id
        self._name = name
        self._version = version
        self._type = type
        self._meta_type = meta_type
        self._href = href
        self._exception_cause = exception_cause

    @property
    def model_registry_id(self):
        """Id of the model registry in which the artifact is located."""
        return self._model_registry_id

    @property
    def name(self):
        """Name of the artifact."""
        return self._name

    @property
    def version(self):
        """Version of the artifact"""
        return self._version

    def __str__(self):
        # __str__ must return a str: returning the dict itself makes
        # str(artifact) / print(artifact) raise TypeError.
        return json.dumps(
            {
                "model_registry_id": self._model_registry_id,
                "name": self._name,
                "version": self._version,
            }
        )

    def __repr__(self):
        return (
            f"Artifact({self._model_registry_id!r}, {self._name!r}, "
            f"{self._version!r}, {self._type!r}, {self._meta_type!r}, "
            f"{self._href!r}, {self._exception_cause!r})"
        )

    @staticmethod
    def from_response_json(json_dict: dict):
        """Build an `Artifact` from a provenance node of a json response.

        The meta type is chosen in this order: FAULTY when the node carries
        an `exception_cause`, DELETED when it is flagged as deleted,
        INACCESSIBLE when it is not flagged as accessible, otherwise
        NOT_SUPPORTED. The href is only kept for the last two cases.

        # Arguments
            json_dict: provenance node as returned by the backend.

        # Returns
            `Artifact`: the parsed artifact description.
        """
        link_json = humps.decamelize(json_dict)
        href = None
        exception_cause = None
        if link_json.get("exception_cause") is not None:
            meta_type = Artifact.MetaType.FAULTY
            exception_cause = link_json.get("exception_cause")
        elif bool(link_json["deleted"]):
            meta_type = Artifact.MetaType.DELETED
        elif not bool(link_json["accessible"]):
            meta_type = Artifact.MetaType.INACCESSIBLE
            href = link_json["artifact"]["href"]
        else:
            meta_type = Artifact.MetaType.NOT_SUPPORTED
            href = link_json["artifact"]["href"]
        return Artifact(
            link_json["artifact"]["project"],
            link_json["artifact"]["name"],
            link_json["artifact"]["version"],
            link_json["artifact_type"],
            meta_type,
            href=href,
            exception_cause=exception_cause,
        )
103+
104+
105+
class Links:
    """Feature store entities found in a provenance graph, grouped by state.

    Each parsed node ends up in exactly one of four lists: `accessible`,
    `deleted`, `inaccessible` or `faulty`.
    """

    class Direction(Enum):
        """Direction in which the provenance graph is traversed."""

        UPSTREAM = 1
        DOWNSTREAM = 2

    class Type(Enum):
        """Kind of feature store artifact to extract from the graph."""

        FEATURE_VIEW = 1
        TRAINING_DATASET = 2

    def __init__(self):
        self._accessible = []
        self._deleted = []
        self._inaccessible = []
        self._faulty = []

    @property
    def deleted(self):
        """List of [Artifact objects] which contain
        minimal information (name, version) about the entities
        (feature views, training datasets) they represent.
        These entities have been removed from the feature store.
        """
        return self._deleted

    @property
    def inaccessible(self):
        """List of [Artifact objects] which contain
        minimal information (name, version) about the entities
        (feature views, training datasets) they represent.
        These entities exist in the feature store, however the user
        does not have access to them anymore.
        """
        return self._inaccessible

    @property
    def accessible(self):
        """List of [FeatureView|TrainingDataset objects]
        which are part of the provenance graph requested. These entities
        exist in the feature store and the user has access to them.
        """
        return self._accessible

    @property
    def faulty(self):
        """List of [Artifact objects] which contain
        minimal information (name, version) about the entities
        (feature views, training datasets) they represent.
        These entities exist in the feature store, however they are corrupted.
        """
        return self._faulty

    def __str__(self, indent=None):
        return json.dumps(self, cls=ProvenanceEncoder, indent=indent)

    def __repr__(self):
        return (
            f"Links({self._accessible!r}, {self._deleted!r}"
            f", {self._inaccessible!r}, {self._faulty!r})"
        )

    @staticmethod
    def __append_node(links, node, to_accessible, to_artifact):
        """Classify one provenance node and append it to the matching list.

        The checks run in a fixed order: faulty (an `exception_cause` is
        present), accessible, deleted, and finally inaccessible.

        # Arguments
            links: `Links` instance being populated.
            node: decamelized provenance node.
            to_accessible: callable building the full object from
                `node["artifact"]`.
            to_artifact: callable building the placeholder artifact from
                the node itself.
        """
        if node.get("exception_cause") is not None:
            links.faulty.append(to_artifact(node))
        elif bool(node["accessible"]):
            links.accessible.append(to_accessible(node["artifact"]))
        elif bool(node["deleted"]):
            links.deleted.append(to_artifact(node))
        else:
            links.inaccessible.append(to_artifact(node))

    @staticmethod
    def __parse_feature_views(links_json: dict, artifacts: Set[str]):
        """Collect feature views from a list of provenance links.

        Nodes whose `artifact_type` is not in `artifacts` are not collected
        themselves; their own `upstream` links are searched recursively.
        """
        from hsfs import feature_view
        from hsfs.core import explicit_provenance as hsfs_explicit_provenance

        links = Links()
        for link_json in links_json:
            node = link_json["node"]
            if node["artifact_type"] in artifacts:
                Links.__append_node(
                    links,
                    node,
                    feature_view.FeatureView.from_response_json,
                    hsfs_explicit_provenance.Artifact.from_response_json,
                )
            else:
                # Not a feature view: look one level further up the graph.
                new_links = Links.__parse_feature_views(
                    link_json["upstream"], artifacts
                )
                links.faulty.extend(new_links.faulty)
                links.accessible.extend(new_links.accessible)
                links.inaccessible.extend(new_links.inaccessible)
                links.deleted.extend(new_links.deleted)
        return links

    @staticmethod
    def __parse_training_datasets(links_json: dict, artifacts: Set[str]):
        """Collect training datasets from a list of provenance links.

        Only the given links are inspected; nodes of other artifact types
        are ignored and their upstream links are not followed.
        """
        from hsfs import training_dataset
        from hsfs.core import explicit_provenance as hsfs_explicit_provenance

        links = Links()
        for link_json in links_json:
            node = link_json["node"]
            if node["artifact_type"] in artifacts:
                Links.__append_node(
                    links,
                    node,
                    training_dataset.TrainingDataset.from_response_json_single,
                    hsfs_explicit_provenance.Artifact.from_response_json,
                )
        return links

    @staticmethod
    def from_response_json(json_dict: dict, direction: Direction, artifact: Type):
        """Parse explicit provenance links from a json response.

        # Arguments
            json_dict: json response from the explicit provenance endpoint
            direction: subset of links to parse - UPSTREAM/DOWNSTREAM
            artifact: subset of links to parse - FEATURE_VIEW/TRAINING_DATASET

        # Returns
            `Links`: the links of the selected direction and artifact type.

        # Raises
            `ValueError`: if `hsfs` or `hopsworks` is not installed.
            `Exception`: if there is no active hopsworks connection.
        """

        import importlib.util

        if not importlib.util.find_spec("hsfs"):
            raise ValueError(
                "hsfs is not installed in the environment - cannot parse feature store artifacts"
            )
        if not importlib.util.find_spec("hopsworks"):
            raise ValueError(
                "hopsworks is not installed in the environment - cannot switch from hsml connection to hsfs connection"
            )

        # make sure the hsfs connection is initialized so that the feature view/training dataset can actually be used after being returned
        import hopsworks

        if not hopsworks._connected_project:
            raise Exception(
                "hopsworks connection is not initialized - use hopsworks.login to connect if you want the ability to use provenance with connections between hsfs and hsml"
            )

        hopsworks._connected_project.get_feature_store()

        return Links.__from_response_json_feature_store_artifacts(
            json_dict, direction, artifact
        )

    @staticmethod
    def __from_response_json_feature_store_artifacts(
        json_dict: dict, direction: Direction, artifact: Type
    ):
        """Dispatch parsing on direction and artifact type.

        Only UPSTREAM links of FEATURE_VIEW or TRAINING_DATASET type are
        parsed. Every other combination yields an empty `Links` object, so
        callers always receive a `Links` instance and never `None`.
        """
        if direction != Links.Direction.UPSTREAM:
            return Links()

        if artifact == Links.Type.FEATURE_VIEW:
            parse, artifact_types = Links.__parse_feature_views, {"FEATURE_VIEW"}
        elif artifact == Links.Type.TRAINING_DATASET:
            parse, artifact_types = (
                Links.__parse_training_datasets,
                {"TRAINING_DATASET"},
            )
        else:
            return Links()

        links_json = humps.decamelize(json_dict)
        return parse(links_json["upstream"], artifact_types)
298+
299+
300+
class ProvenanceEncoder(json.JSONEncoder):
    """JSON encoder for `Links` objects and the entities they contain."""

    def default(self, obj):
        """Return a JSON-serializable representation of `obj`.

        `Links` objects are encoded as a dict of their four lists. When
        `hsfs` is installed, feature views, training datasets and hsfs
        provenance artifacts are encoded by feature store name, name and
        version. Anything else is delegated to the base encoder, which
        raises `TypeError`.
        """
        if isinstance(obj, Links):
            return {
                "accessible": obj.accessible,
                "inaccessible": obj.inaccessible,
                "deleted": obj.deleted,
                "faulty": obj.faulty,
            }

        import importlib.util

        if importlib.util.find_spec("hsfs"):
            from hsfs import feature_view, training_dataset
            from hsfs.core import explicit_provenance as hsfs_explicit_provenance

            # Training datasets are stored in Links.accessible as well, so
            # they must be encodable or str(links) fails for them.
            if isinstance(
                obj,
                (
                    feature_view.FeatureView,
                    training_dataset.TrainingDataset,
                    hsfs_explicit_provenance.Artifact,
                ),
            ):
                return {
                    "feature_store_name": obj.feature_store_name,
                    "name": obj.name,
                    "version": obj.version,
                }
        return super().default(obj)

0 commit comments

Comments
 (0)