From be22083143edb797ee8e4e6fbf2698627b425dca Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 4 Feb 2025 22:48:23 +0800 Subject: [PATCH] added objid functions --- pyproject.toml | 7 +- src/h5json/__init__.py | 8 + src/h5json/hdf5db.py | 21 +- src/h5json/objid.py | 485 ++++++++++++++++++++++++++++++++++++++++ test/unit/objid_test.py | 199 +++++++++++++++++ 5 files changed, 707 insertions(+), 13 deletions(-) create mode 100644 src/h5json/objid.py create mode 100755 test/unit/objid_test.py diff --git a/pyproject.toml b/pyproject.toml index bcba820..5ddb024 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,17 +19,18 @@ authors = [{ "name" = "The HDF Group", "email" = "help@hdfgroup.org" }] keywords = ["json", "hdf5", "multidimensional array", "data", "datacube"] requires-python = ">=3.8" dependencies = [ - "h5py >=3.10", + "h5py >= 3.10", "numpy >= 2.0; python_version>='3.9'", "jsonschema >=4.4.0", "tomli; python_version<'3.11'", "numpy >=1.20,<2.0.0; python_version=='3.8'", ] + dynamic = ["version"] [project.urls] -Homepage = "https://hdf5-json.readthedocs.io" -Documentation = "https://hdf5-json.readthedocs.io" +Homepage = "https://support.hdfgroup.org/documentation/hdf5-json/latest/" +Documentation = "https://support.hdfgroup.org/documentation/hdf5-json/latest/" Source = "https://github.com/HDFGroup/hdf5-json" "Bug Reports" = "https://github.com/HDFGroup/hdf5-json/issues" Social = "https://twitter.com/hdf5" diff --git a/src/h5json/__init__.py b/src/h5json/__init__.py index 704d241..d4a7f78 100644 --- a/src/h5json/__init__.py +++ b/src/h5json/__init__.py @@ -21,6 +21,14 @@ from .hdf5dtype import getTypeResponse from .hdf5dtype import getItemSize from .hdf5dtype import createDataType +from .objid import createObjId +from .objid import getCollectionForId +from .objid import isObjId +from .objid import isS3ObjKey +from .objid import getS3Key +from .objid import getObjId +from .objid import isSchema2Id +from .objid import isRootObjId from .hdf5db import Hdf5db from . 
import _version diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 27f2094..676dbef 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -19,6 +19,7 @@ import json import logging from .hdf5dtype import getTypeItem, createDataType, getItemSize +from .objid import createObjId from .apiversion import _apiver @@ -561,7 +562,7 @@ def initFile(self): self.log.info("initializing file") if not self.root_uuid: - self.root_uuid = str(uuid.uuid1()) + self.root_uuid = createObjId() self.dbGrp.attrs["rootUUID"] = self.root_uuid self.dbGrp.create_group("{groups}") self.dbGrp.create_group("{datasets}") @@ -593,21 +594,21 @@ def visit(self, path, obj): msg = "Unknown object type: " + __name__ + " found during scan of HDF5 file" self.log.error(msg) raise IOError(errno.EIO, msg) - uuid1 = uuid.uuid1() # create uuid - id = str(uuid1) + obj_id = createObjId() # create uuid + addrGrp = self.dbGrp["{addr}"] if not self.readonly: # storing db in the file itself, so we can link to the object directly - col[id] = obj.ref # save attribute ref to object + col[obj_id] = obj.ref # save attribute ref to object else: # store path to object - col[id] = obj.name + col[obj_id] = obj.name addr = h5py.h5o.get_info(obj.id).addr # store reverse map as an attribute - addrGrp.attrs[str(addr)] = id + addrGrp.attrs[str(addr)] = obj_id # - # Get Datset creation properties + # Get Dataset creation properties # def getDatasetCreationProps(self, dset_uuid): prop_list = {} @@ -1087,7 +1088,7 @@ def createCommittedType(self, datatype, obj_uuid=None): raise IOError(errno.EPERM, msg) datatypes = self.dbGrp["{datatypes}"] if not obj_uuid: - obj_uuid = str(uuid.uuid1()) + obj_uuid = createObjId() dt = self.createTypeFromItem(datatype) datatypes[obj_uuid] = dt @@ -2715,7 +2716,7 @@ def createDataset( raise IOError(errno.EPERM, msg) datasets = self.dbGrp["{datasets}"] if not obj_uuid: - obj_uuid = str(uuid.uuid1()) + obj_uuid = createObjId() dt = None item = {} fillvalue = None @@ -3490,7 +3491,7 @@ def createGroup(self, obj_uuid=None): raise IOError(errno.EPERM, msg) groups = self.dbGrp["{groups}"] if not obj_uuid: - obj_uuid = str(uuid.uuid1()) + obj_uuid = createObjId() newGroup = groups.create_group(obj_uuid) # store reverse map as an attribute addr = h5py.h5o.get_info(newGroup.id).addr diff --git a/src/h5json/objid.py b/src/h5json/objid.py new file mode 100644 index 0000000..7a98a5b --- /dev/null +++ b/src/h5json/objid.py @@ -0,0 +1,485 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HDF (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +# +# objID: +# id (uuid) related functions +# + + +import hashlib +import uuid + +S3_URI = "s3://" +FILE_URI = "file://" +AZURE_URI = "blob.core.windows.net/" # preceded with "https://" +UUID_LEN = 36 # length for uuid strings + + + +def _getStorageProtocol(uri): + """ returns 's3://', 'file://', or 'https://...net/' prefix if present. 
    If the prefix is in the form: https://myaccount.blob.core.windows.net/mycontainer
    (references Azure blob storage), return: https://myaccount.blob.core.windows.net/
    otherwise return None """

    if not uri:
        protocol = None
    elif uri.startswith(S3_URI):
        protocol = S3_URI
    elif uri.startswith(FILE_URI):
        protocol = FILE_URI
    elif uri.startswith("https://") and uri.find(AZURE_URI) > 0:
        n = uri.find(AZURE_URI) + len(AZURE_URI)
        protocol = uri[:n]
    elif uri.find("://") >= 0:
        raise ValueError(f"storage uri: {uri} not supported")
    else:
        protocol = None
    return protocol


def _getBaseName(uri):
    """ Return the part of the URI after the storage protocol (if any) """

    protocol = _getStorageProtocol(uri)
    if not protocol:
        return uri
    else:
        return uri[len(protocol):]


def _getPrefixForCollection(collection):
    """ Return prefix character for given collection type """
    collection = collection.lower()

    if collection in ("group", "groups"):
        return 'g'
    elif collection in ("dataset", "datasets"):
        return 'd'
    elif collection in ("datatype", "datatypes"):
        return 't'
    elif collection in ("chunk", "chunks"):
        return 'c'
    else:
        raise ValueError(f"unexpected collection type: {collection}")


def getIdHash(id):
    """Return md5 prefix based on id value"""
    m = hashlib.new("md5")
    m.update(id.encode("utf8"))
    hexdigest = m.hexdigest()
    return hexdigest[:5]


def isSchema2Id(id):
    """return true if this is a v2 id"""
    # v1 ids are in the standard UUID format: 8-4-4-4-12
    # v2 ids are in the non-standard: 8-8-4-6-6
    parts = id.split("-")
    if len(parts) != 6:
        raise ValueError(f"Unexpected id format for uuid: {id}")
    if len(parts[2]) == 8:
        return True
    else:
        return False


def getIdHexChars(id):
    """get the hex chars of the given id"""
    if id[0] == "c":
        # don't include chunk index
        index = id.index("_")
        parts = id[0:index].split("-")
    else:
        parts = id.split("-")
    if len(parts) != 6:
        raise ValueError(f"Unexpected id format for uuid: {id}")
    return "".join(parts[1:])


def hexRot(ch):
    """rotate hex character by 8"""
    return format((int(ch, base=16) + 8) % 16, "x")


def isRootObjId(id):
    """returns true if this is a root id (only for v2 schema)"""
    if not isSchema2Id(id):
        raise ValueError("isRootObjId can only be used with v2 ids")
    validateUuid(id)  # will throw ValueError exception if not an objid
    if id[0] != "g":
        return False  # not a group
    token = getIdHexChars(id)
    # root ids will have last 16 chars rotated version of the first 16
    is_root = True
    for i in range(16):
        if token[i] != hexRot(token[i + 16]):
            is_root = False
            break
    return is_root


def getRootObjId(id):
    """returns root id for this objid if this is a root id
    (only for v2 schema)
    """
    if isRootObjId(id):
        return id  # this is the root id
    token = list(getIdHexChars(id))
    # root ids will have last 16 chars rotated version of the first 16
    for i in range(16):
        token[i + 16] = hexRot(token[i])
    token = "".join(token)
    root_id = "g-" + token[0:8] + "-" + token[8:16] + "-" + token[16:20]
    root_id += "-" + token[20:26] + "-" + token[26:32]

    return root_id


def createObjId(obj_type=None, root_id=None):
    """ create a new objid

    If obj_type is None, return just a bare uuid.
    Otherwise an HSDS v2 schema obj_id will be created.
    In this case obj_type should be one of "groups",
    "datasets", "datatypes", "chunks".  If root_id is
    None, a root group obj_id will be created.
    Otherwise the obj_id will be an id that has root_id as its root. """

    prefix = None
    if obj_type is None:
        # just return a regular uuid
        objid = str(uuid.uuid4())
    else:
        prefix = _getPrefixForCollection(obj_type)
        # schema v2
        salt = uuid.uuid4().hex
        # take a hash to randomize the uuid
        token = list(hashlib.sha256(salt.encode()).hexdigest())

        if root_id:
            # replace first 16 chars of token with first 16 chars of root id
            root_hex = getIdHexChars(root_id)
            token[0:16] = root_hex[0:16]
        else:
            if obj_type != "groups":
                raise ValueError("expected 'groups' obj_type for root group id")
            # use only 16 chars, but make it look like a 32 char id
            for i in range(16):
                token[16 + i] = hexRot(token[i])
        # format as a string
        token = "".join(token)
        objid = prefix + "-" + token[0:8] + "-" + token[8:16] + "-"
        objid += token[16:20] + "-" + token[20:26] + "-" + token[26:32]

    return objid


def getS3Key(id):
    """Return s3 key for given id.

    For schema v1:
        An md5 prefix is added to the front of the returned key to better
        distribute S3 objects.
    For schema v2:
        The id is converted to the pattern: "db/{rootid[0:16]}" for root ids and
        "db/id[0:16]/{prefix}/id[16:32]" for other ids.
        Chunk ids have the chunk index added after the slash:
        "db/id[0:16]/d/id[16:32]/x_y_z"

    For domain ids:
        Return a key with the .domain suffix and no preceding slash.
        For non-default buckets, use the format: <bucket_name>/s3_key
        If the id has a storage specifier ("s3://", "file://", etc.)
        include that along with the bucket name.  e.g.: "s3://mybucket/a_folder/a_file.h5"
    """

    base_id = _getBaseName(id)  # strip any s3://, etc.
    if base_id.find("/") > 0:
        # a domain id
        domain_suffix = ".domain.json"
        index = base_id.find("/") + 1
        key = base_id[index:]
        if not key.endswith(domain_suffix):
            if key[-1] != "/":
                key += "/"
            key += domain_suffix
    else:
        if isSchema2Id(id):
            # schema v2 id
            hexid = getIdHexChars(id)
            prefix = id[0]  # one of g, d, t, c
            if prefix not in ("g", "d", "t", "c"):
                raise ValueError(f"Unexpected id: {id}")

            if isRootObjId(id):
                key = f"db/{hexid[0:8]}-{hexid[8:16]}"
            else:
                partition = ""
                if prefix == "c":
                    # use 'd' so that chunks will show up under their dataset
                    s3col = "d"
                    n = id.find("-")
                    if n > 1:
                        # extract the partition index if present
                        partition = "p" + id[1:n]
                else:
                    s3col = prefix
                key = f"db/{hexid[0:8]}-{hexid[8:16]}/{s3col}/{hexid[16:20]}"
                key += f"-{hexid[20:26]}-{hexid[26:32]}"
            if prefix == "c":
                if partition:
                    key += "/"
                    key += partition
                # add the chunk coordinate
                index = id.index("_")  # will raise ValueError if not found
                n = index + 1
                coord = id[n:]
                key += "/"
                key += coord
            elif prefix == "g":
                # add key suffix for group
                key += "/.group.json"
            elif prefix == "d":
                # add key suffix for dataset
                key += "/.dataset.json"
            else:
                # add key suffix for datatype
                key += "/.datatype.json"
        else:
            # schema v1 id
            # prepend an md5 hash so keys distribute across S3 partitions
            idhash = getIdHash(id)
            key = f"{idhash}-{id}"

    return key


def getObjId(s3key):
    """Return object id given valid s3key"""
    if (
        len(s3key) >= 44
        and s3key[0:5].isalnum()
        and s3key[5] == "-"
        and s3key[6] in ("g", "d", "c", "t")
    ):
        # v1 obj keys
        objid = s3key[6:]
    elif s3key.endswith("/.domain.json"):
        objid = "/" + s3key[: -(len("/.domain.json"))]
    elif s3key.startswith("db/"):
        # schema v2 object key
        parts = s3key.split("/")
        chunk_coord = ""  # used only for chunk ids
        partition = ""  # used only for partitioned chunk ids
        token = []
        for ch in parts[1]:
            if ch != "-":
                token.append(ch)

        if len(parts) == 3:
            # root id
            # last part should be ".group.json"
            if parts[2] != ".group.json":
                raise ValueError(f"unexpected S3Key: {s3key}")
            # add 16 more chars using rotated version of first 16
            for i in range(16):
                token.append(hexRot(token[i]))
            prefix = "g"
        elif len(parts) == 5:
            # group, dataset, datatype, or chunk
            for ch in parts[3]:
                if ch != "-":
                    token.append(ch)

            if parts[2] == "g" and parts[4] == ".group.json":
                prefix = "g"  # group json
            elif parts[2] == "t" and parts[4] == ".datatype.json":
                prefix = "t"  # datatype json
            elif parts[2] == "d":
                if parts[4] == ".dataset.json":
                    prefix = "d"  # dataset json
                else:
                    # chunk object
                    prefix = "c"
                    chunk_coord = "_" + parts[4]
            else:
                raise ValueError(f"unexpected S3Key: {s3key}")
        elif len(parts) == 6:
            # chunk key with partitioning
            for ch in parts[3]:
                if ch != "-":
                    token.append(ch)
            if parts[2][0] != "d":
                raise ValueError(f"unexpected S3Key: {s3key}")
            prefix = "c"
            partition = parts[4]
            if partition[0] != "p":
                raise ValueError(f"unexpected S3Key: {s3key}")
            partition = partition[1:]  # strip off the p
            chunk_coord = "_" + parts[5]
        else:
            raise ValueError(f"unexpected S3Key: {s3key}")

        token = "".join(token)
        objid = prefix + partition + "-" + token[0:8] + "-" + token[8:16]
        objid += "-" + token[16:20] + "-" + token[20:26] + "-"
        objid += token[26:32] + chunk_coord
    else:
        msg = f"unexpected S3Key: {s3key}"
        raise ValueError(msg)
    return objid


def isS3ObjKey(s3key):
    """ return True if this is a storage key """
    valid = False
    try:
        objid = getObjId(s3key)
        if objid:
            valid = True
    except KeyError:
        pass  # ignore
    except ValueError:
        pass  # ignore
    return valid


def getCollectionForId(obj_id):
    """return groups/datasets/datatypes based on id"""
    if not isinstance(obj_id, str):
        raise ValueError("invalid object id")
    collection = None
    if obj_id.startswith("g-"):
        collection = "groups"
    elif obj_id.startswith("d-"):
        collection = "datasets"
    elif obj_id.startswith("t-"):
        collection = "datatypes"
    else:
        raise ValueError("not a collection id")
    return collection


def validateUuid(id, obj_class=None):
    """ Verify the given id is a well-formed object identifier.
    The id may be:
        a bare uuid,
        an HSDS v1 schema id, or
        an HSDS v2 schema id.
    If set, obj_class can be one of "groups", "datasets", "datatypes"
    """
    if not isinstance(id, str):
        raise ValueError("Expected string type")
    if len(id) < UUID_LEN:
        raise ValueError("id is too short to be an object identifier")
    if len(id) == UUID_LEN:
        if obj_class:
            # expected a prefix
            raise ValueError(f"obj_id: {id} not valid for collection: {obj_class}")
    else:
        # does this have a v1 schema hash tag?
        # e.g.: "a49be-g-314d61b8-9954-11e6-a733-3c15c2da029e"
        if id[:5].isalnum() and id[5] == '-':
            id = id[6:]  # trim off the hash tag
        # validate prefix
        if id[0] not in ("g", "d", "t", "c"):
            raise ValueError("Unexpected prefix")
        if id[0] != "c" and id[1] != "-":
            # chunk ids may have a partition index following the c
            raise ValueError("Unexpected prefix")
        if obj_class is not None:
            obj_class = obj_class.lower()
            if id[0] != _getPrefixForCollection(obj_class):
                raise ValueError(f"unexpected object id {id} for collection: {obj_class}")
        if id[0] == "c":
            # trim the type char and any partition id
            n = id.find("-")
            if n == -1:
                raise ValueError("Invalid chunk id")

            # trim the chunk index for chunk ids
            m = id.find("_")
            if m == -1:
                raise ValueError("Invalid chunk id")
            n += 1
            id = "c-" + id[n:m]
        id = id[2:]
        if len(id) != UUID_LEN:
            # id should be 36 now
            raise ValueError("Unexpected id length")

    for ch in id:
        if ch.isalnum():
            continue
        if ch == "-":
            continue
        raise ValueError(f"Unexpected character in uuid: {ch}")


def isValidUuid(id, obj_class=None):
    try:
        validateUuid(id, obj_class)
        return True
    except ValueError:
        return False


def isValidChunkId(id):
    if not isValidUuid(id):
        return False
    if id[0] != "c":
        return False
    return True


def getClassForObjId(id):
    """return domains/chunks/groups/datasets/datatypes based on id"""
    if not isinstance(id, str):
        raise ValueError("Expected string type")
    if len(id) == 0:
        raise ValueError("Empty string")
    if id[0] == "/":
        return "domains"
    if isValidChunkId(id):
        return "chunks"
    else:
        return getCollectionForId(id)


def isObjId(id):
    """return true if uuid or domain"""
    if not isinstance(id, str) or len(id) == 0:
        return False
    if id.find("/") > 0:
        # domain id is any string in the form <bucket_name>/<domain_path>
        return True
    return isValidUuid(id)


def getUuidFromId(id):
    """strip off the type prefix ('g-', 'd-', or 't-')
    and return the uuid part"""
    if len(id) == UUID_LEN:
        # just a uuid
        return id
    elif len(id) == UUID_LEN + 2:
        # 'g-', 'd-', or 't-' prefix
        return id[2:]
    else:
        raise ValueError(f"Unexpected obj_id: {id}")
\ No newline at end of file
diff --git a/test/unit/objid_test.py b/test/unit/objid_test.py
new file mode 100755
index 0000000..7c02482
--- /dev/null
+++ b/test/unit/objid_test.py
@@ -0,0 +1,199 @@
##############################################################################
# Copyright by The HDF Group.                                                #
# All rights reserved.                                                       #
#                                                                            #
# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and     #
# Utilities.  The full HSDS copyright notice, including                     #
# terms governing use, modification, and redistribution, is contained in    #
# the file COPYING, which can be found at the root of the source code       #
# distribution tree.  If you do not have access to this file, you may       #
# request a copy from help@hdfgroup.org.                                    
# +############################################################################## +import unittest +import sys + +from h5json.objid import isRootObjId, isValidUuid, validateUuid +from h5json.objid import createObjId, getCollectionForId +from h5json.objid import isObjId, isS3ObjKey, getS3Key, getObjId, isSchema2Id + + +class IdUtilTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(IdUtilTest, self).__init__(*args, **kwargs) + # main + + def testCreateObjId(self): + id_len = 38 # 36 for uuid plus two for prefix ("g-", "d-") + ids = set() # we'll use this to verify we always get a unique id + # create just a plain uuid... + id = createObjId() + self.assertEqual(len(id) + 2, id_len) + # create a v2 root_id + root_id = createObjId(obj_type="groups") + self.assertEqual(len(root_id), id_len) + for obj_type in ("groups", "datasets", "datatypes", "chunks"): + for i in range(100): + id = createObjId(obj_type=obj_type, root_id=root_id) + self.assertEqual(len(id), id_len) + self.assertTrue(id[0] in ("g", "d", "t", "c")) + self.assertEqual(id[1], "-") + ids.add(id) + + self.assertEqual(len(ids), 400) + try: + createObjId(obj_type="bad_class") + self.assertTrue(False) # should throw exception + except ValueError: + pass # expected + + def testIsValidUuid(self): + group1_id = "g-314d61b8-9954-11e6-a733-3c15c2da029e" # orig schema + group2_id = "g-314d61b8-995411e6-a733-3c15c2-da029e" + root_id = "g-f9aaa28e-d42e10e5-7122-2a065c-a6986d" + dataset1_id = "d-4c48f3ae-9954-11e6-a3cd-3c15c2da029e" # orig schema + dataset2_id = "d-4c48f3ae-995411e6-a3cd-3c15c2-da029e" + ctype1_id = "t-8c785f1c-9953-11e6-9bc2-0242ac110005" # orig schema + ctype2_id = "t-8c785f1c-995311e6-9bc2-0242ac-110005" + chunk1_id = "c-8c785f1c-9953-11e6-9bc2-0242ac110005_7_2" # orig schema + chunk2_id = "c-8c785f1c-995311e6-9bc2-0242ac-110005_7_2" + domain_id = "mybucket/bob/mydata.h5" + s3_domain_id = "s3://mybucket/bob/mydata.h5" + file_domain_id = "file://mybucket/bob/mydata.h5" + azure_domain_id = "https://myaccount.blob.core.windows.net/mybucket/bob/mydata.h5" + valid_id_map = { + group1_id: "a49be-g-314d61b8-9954-11e6-a733-3c15c2da029e", + group2_id: "db/314d61b8-995411e6/g/a733-3c15c2-da029e/.group.json", + dataset1_id: "26928-d-4c48f3ae-9954-11e6-a3cd-3c15c2da029e", + dataset2_id: "db/4c48f3ae-995411e6/d/a3cd-3c15c2-da029e/.dataset.json", + ctype1_id: "5a9cf-t-8c785f1c-9953-11e6-9bc2-0242ac110005", + ctype2_id: "db/8c785f1c-995311e6/t/9bc2-0242ac-110005/.datatype.json", + chunk1_id: "dc4ce-c-8c785f1c-9953-11e6-9bc2-0242ac110005_7_2", + chunk2_id: "db/8c785f1c-995311e6/d/9bc2-0242ac-110005/7_2", + domain_id: "bob/mydata.h5/.domain.json", + s3_domain_id: "bob/mydata.h5/.domain.json", + file_domain_id: "bob/mydata.h5/.domain.json", + azure_domain_id: "bob/mydata.h5/.domain.json", } + + bad_ids = ("g-1e76d862", "/bob/mydata.h5") + + self.assertTrue(isValidUuid(group1_id)) + self.assertFalse(isSchema2Id(group1_id)) + self.assertTrue(isValidUuid(group1_id, obj_class="Group")) + self.assertTrue(isValidUuid(group1_id, obj_class="group")) + self.assertTrue(isValidUuid(group1_id, obj_class="groups")) + self.assertTrue(isSchema2Id(root_id)) + self.assertTrue(isValidUuid(root_id, obj_class="Group")) + self.assertTrue(isValidUuid(root_id, obj_class="group")) + self.assertTrue(isValidUuid(root_id, obj_class="groups")) + self.assertTrue(isRootObjId(root_id)) + self.assertTrue(isValidUuid(dataset1_id, obj_class="datasets")) + self.assertFalse(isSchema2Id(dataset1_id)) + self.assertTrue(isValidUuid(ctype1_id, 
obj_class="datatypes")) + self.assertFalse(isSchema2Id(ctype1_id)) + self.assertTrue(isValidUuid(chunk1_id, obj_class="chunks")) + self.assertFalse(isSchema2Id(chunk1_id)) + self.assertTrue(isValidUuid(group2_id)) + self.assertTrue(isSchema2Id(group2_id)) + self.assertTrue(isValidUuid(group2_id, obj_class="Group")) + self.assertTrue(isValidUuid(group2_id, obj_class="group")) + self.assertTrue(isValidUuid(group2_id, obj_class="groups")) + self.assertFalse(isRootObjId(group2_id)) + self.assertTrue(isValidUuid(dataset2_id, obj_class="datasets")) + self.assertTrue(isSchema2Id(dataset2_id)) + self.assertTrue(isValidUuid(ctype2_id, obj_class="datatypes")) + self.assertTrue(isSchema2Id(ctype2_id)) + self.assertTrue(isValidUuid(chunk2_id, obj_class="chunks")) + self.assertTrue(isSchema2Id(chunk2_id)) + validateUuid(group1_id) + try: + isRootObjId(group1_id) + self.assertTrue(False) + except ValueError: + # only works for v2 schema + pass # expected + + for item in valid_id_map: + self.assertTrue(isObjId(item)) + s3key = getS3Key(item) + self.assertTrue(s3key[0] != "/") + self.assertTrue(isS3ObjKey(s3key)) + expected = valid_id_map[item] + self.assertEqual(s3key, expected) + if item.find("/") > 0: + continue # bucket name gets lost when domain ids get converted to s3keys + objid = getObjId(s3key) + self.assertEqual(objid, item) + for item in bad_ids: + self.assertFalse(isValidUuid(item)) + self.assertFalse(isObjId(item)) + + def testGetCollection(self): + group_id = "g-314d61b8-9954-11e6-a733-3c15c2da029e" + dataset_id = "d-4c48f3ae-9954-11e6-a3cd-3c15c2da029e" + ctype_id = "t-8c785f1c-9953-11e6-9bc2-0242ac110005" + bad_id = "x-59647858-9954-11e6-95d2-3c15c2da029e" + self.assertEqual(getCollectionForId(group_id), "groups") + self.assertEqual(getCollectionForId(dataset_id), "datasets") + self.assertEqual(getCollectionForId(ctype_id), "datatypes") + try: + getCollectionForId(bad_id) + self.assertTrue(False) + except ValueError: + pass # expected + try: + getCollectionForId(None) + self.assertTrue(False) + except ValueError: + pass # expected + + def testSchema2Id(self): + root_id = createObjId("groups") + group_id = createObjId("groups", root_id=root_id) + dataset_id = createObjId("datasets", root_id=root_id) + ctype_id = createObjId("datatypes", root_id=root_id) + + self.assertEqual(getCollectionForId(root_id), "groups") + self.assertEqual(getCollectionForId(group_id), "groups") + self.assertEqual(getCollectionForId(dataset_id), "datasets") + self.assertEqual(getCollectionForId(ctype_id), "datatypes") + chunk_id = "c" + dataset_id[1:] + "_1_2" + chunk_partition_id = "c42-" + dataset_id[2:] + "_1_2" + + for id in (chunk_id, chunk_partition_id): + try: + getCollectionForId(id) + self.assertTrue(False) + except ValueError: + pass # expected + valid_ids = ( + group_id, + dataset_id, + ctype_id, + chunk_id, + chunk_partition_id, + root_id, + ) + s3prefix = getS3Key(root_id) + self.assertTrue(s3prefix.endswith("/.group.json")) + s3prefix = s3prefix[: -(len(".group.json"))] + for oid in valid_ids: + self.assertTrue(len(oid) >= 38) + parts = oid.split("-") + self.assertEqual(len(parts), 6) + self.assertTrue(oid[0] in ("g", "d", "t", "c")) + self.assertTrue(isSchema2Id(oid)) + if oid == root_id: + self.assertTrue(isRootObjId(oid)) + else: + self.assertFalse(isRootObjId(oid)) + + s3key = getS3Key(oid) + self.assertTrue(s3key.startswith(s3prefix)) + self.assertEqual(getObjId(s3key), oid) + self.assertTrue(isS3ObjKey(s3key)) + + +if __name__ == "__main__": + # setup test files + + unittest.main()
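For reviewers, a quick usage sketch of the new objid API (illustrative only; the variable names are arbitrary, and the top-level imports rely on the re-exports added to src/h5json/__init__.py above):

    from h5json import createObjId, getCollectionForId, getS3Key, getObjId, isObjId

    # with no obj_type, createObjId() returns a bare uuid4 string --
    # this is what hdf5db.py now uses in place of str(uuid.uuid1())
    plain_id = createObjId()

    # v2 schema ids: a root group id, then a dataset id parented under it
    root_id = createObjId(obj_type="groups")
    dset_id = createObjId(obj_type="datasets", root_id=root_id)
    assert getCollectionForId(dset_id) == "datasets"
    assert isObjId(dset_id)

    # v2 ids round-trip through their storage keys,
    # e.g. "db/<root hex>/d/<obj hex>/.dataset.json"
    s3key = getS3Key(dset_id)
    assert getObjId(s3key) == dset_id

The round-trip assertion mirrors what testSchema2Id checks: getS3Key and getObjId are inverses for v2 object ids (for domain ids the bucket name is dropped from the key, as noted in testIsValidUuid).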