From be22083143edb797ee8e4e6fbf2698627b425dca Mon Sep 17 00:00:00 2001 From: John Readey Date: Tue, 4 Feb 2025 22:48:23 +0800 Subject: [PATCH] added objid functions --- pyproject.toml | 7 +- src/h5json/__init__.py | 8 + src/h5json/hdf5db.py | 21 +- src/h5json/objid.py | 485 ++++++++++++++++++++++++++++++++++++++++ test/unit/objid_test.py | 199 +++++++++++++++++ 5 files changed, 707 insertions(+), 13 deletions(-) create mode 100644 src/h5json/objid.py create mode 100755 test/unit/objid_test.py diff --git a/pyproject.toml b/pyproject.toml index bcba820..5ddb024 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,17 +19,18 @@ authors = [{ "name" = "The HDF Group", "email" = "help@hdfgroup.org" }] keywords = ["json", "hdf5", "multidimensional array", "data", "datacube"] requires-python = ">=3.8" dependencies = [ - "h5py >=3.10", + "h5py >= 3.10", "numpy >= 2.0; python_version>='3.9'", "jsonschema >=4.4.0", "tomli; python_version<'3.11'", "numpy >=1.20,<2.0.0; python_version=='3.8'", ] + dynamic = ["version"] [project.urls] -Homepage = "https://hdf5-json.readthedocs.io" -Documentation = "https://hdf5-json.readthedocs.io" +Homepage = "https://support.hdfgroup.org/documentation/hdf5-json/latest/" +Documentation = "https://support.hdfgroup.org/documentation/hdf5-json/latest/" Source = "https://github.com/HDFGroup/hdf5-json" "Bug Reports" = "https://github.com/HDFGroup/hdf5-json/issues" Social = "https://twitter.com/hdf5" diff --git a/src/h5json/__init__.py b/src/h5json/__init__.py index 704d241..d4a7f78 100644 --- a/src/h5json/__init__.py +++ b/src/h5json/__init__.py @@ -21,6 +21,14 @@ from .hdf5dtype import getTypeResponse from .hdf5dtype import getItemSize from .hdf5dtype import createDataType +from .objid import createObjId +from .objid import getCollectionForId +from .objid import isObjId +from .objid import isS3ObjKey +from .objid import getS3Key +from .objid import getObjId +from .objid import isSchema2Id +from .objid import isRootObjId from .hdf5db import Hdf5db from . 
import _version diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 27f2094..676dbef 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -19,6 +19,7 @@ import json import logging from .hdf5dtype import getTypeItem, createDataType, getItemSize +from .objid import createObjId from .apiversion import _apiver @@ -561,7 +562,7 @@ def initFile(self): self.log.info("initializing file") if not self.root_uuid: - self.root_uuid = str(uuid.uuid1()) + self.root_uuid = createObjId() self.dbGrp.attrs["rootUUID"] = self.root_uuid self.dbGrp.create_group("{groups}") self.dbGrp.create_group("{datasets}") @@ -593,21 +594,21 @@ def visit(self, path, obj): msg = "Unknown object type: " + __name__ + " found during scan of HDF5 file" self.log.error(msg) raise IOError(errno.EIO, msg) - uuid1 = uuid.uuid1() # create uuid - id = str(uuid1) + obj_id = createObjId() # create uuid + addrGrp = self.dbGrp["{addr}"] if not self.readonly: # storing db in the file itself, so we can link to the object directly - col[id] = obj.ref # save attribute ref to object + col[obj_id] = obj.ref # save attribute ref to object else: # store path to object - col[id] = obj.name + col[obj_id] = obj.name addr = h5py.h5o.get_info(obj.id).addr # store reverse map as an attribute - addrGrp.attrs[str(addr)] = id + addrGrp.attrs[str(addr)] = obj_id # - # Get Datset creation properties + # Get Dataset creation properties # def getDatasetCreationProps(self, dset_uuid): prop_list = {} @@ -1087,7 +1088,7 @@ def createCommittedType(self, datatype, obj_uuid=None): raise IOError(errno.EPERM, msg) datatypes = self.dbGrp["{datatypes}"] if not obj_uuid: - obj_uuid = str(uuid.uuid1()) + obj_uuid = createObjId() dt = self.createTypeFromItem(datatype) datatypes[obj_uuid] = dt @@ -2715,7 +2716,7 @@ def createDataset( raise IOError(errno.EPERM, msg) datasets = self.dbGrp["{datasets}"] if not obj_uuid: - obj_uuid = str(uuid.uuid1()) + obj_uuid = createObjId() dt = None item = {} fillvalue = None @@ -3490,7 +3491,7 @@ def createGroup(self, obj_uuid=None): raise IOError(errno.EPERM, msg) groups = self.dbGrp["{groups}"] if not obj_uuid: - obj_uuid = str(uuid.uuid1()) + obj_uuid = createObjId() newGroup = groups.create_group(obj_uuid) # store reverse map as an attribute addr = h5py.h5o.get_info(newGroup.id).addr diff --git a/src/h5json/objid.py b/src/h5json/objid.py new file mode 100644 index 0000000..7a98a5b --- /dev/null +++ b/src/h5json/objid.py @@ -0,0 +1,485 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HDF (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +# +# objID: +# id (uuid) related functions +# + + +import hashlib +import uuid + +S3_URI = "s3://" +FILE_URI = "file://" +AZURE_URI = "blob.core.windows.net/" # preceded with "https://" +UUID_LEN = 36 # length for uuid strings + + + +def _getStorageProtocol(uri): + """ returns 's3://', 'file://', or 'https://...net/' prefix if present. 
    If the prefix is in the form: https://myaccount.blob.core.windows.net/mycontainer
    (references Azure blob storage), return: https://myaccount.blob.core.windows.net/
    otherwise return None """

    if not uri:
        protocol = None
    elif uri.startswith(S3_URI):
        protocol = S3_URI
    elif uri.startswith(FILE_URI):
        protocol = FILE_URI
    elif uri.startswith("https://") and uri.find(AZURE_URI) > 0:
        n = uri.find(AZURE_URI) + len(AZURE_URI)
        protocol = uri[:n]
    elif uri.find("://") >= 0:
        raise ValueError(f"storage uri: {uri} not supported")
    else:
        protocol = None
    return protocol


def _getBaseName(uri):
    """ Return the part of the URI after the storage protocol (if any) """

    protocol = _getStorageProtocol(uri)
    if not protocol:
        return uri
    else:
        return uri[len(protocol):]


def _getPrefixForCollection(collection):
    """ Return prefix character for given collection type """
    collection = collection.lower()

    if collection in ("group", "groups"):
        return 'g'
    elif collection in ("dataset", "datasets"):
        return 'd'
    elif collection in ("datatype", "datatypes"):
        return 't'
    elif collection in ("chunk", "chunks"):
        return 'c'
    else:
        raise ValueError(f"unexpected collection type: {collection}")


def getIdHash(id):
    """Return md5 prefix based on id value"""
    m = hashlib.new("md5")
    m.update(id.encode("utf8"))
    hexdigest = m.hexdigest()
    return hexdigest[:5]


def isSchema2Id(id):
    """return true if this is a v2 id"""
    # v1 ids are in the standard UUID format: 8-4-4-4-12
    # v2 ids are in the non-standard: 8-8-4-6-6
    parts = id.split("-")
    if len(parts) != 6:
        raise ValueError(f"Unexpected id format for uuid: {id}")
    if len(parts[2]) == 8:
        return True
    else:
        return False


def getIdHexChars(id):
    """get the hex chars of the given id"""
    if id[0] == "c":
        # don't include chunk index
        index = id.index("_")
        parts = id[0:index].split("-")
    else:
        parts = id.split("-")
    if len(parts) != 6:
        raise ValueError(f"Unexpected id format for uuid: {id}")
    return "".join(parts[1:])


def hexRot(ch):
    """rotate hex character by 8"""
    return format((int(ch, base=16) + 8) % 16, "x")


def isRootObjId(id):
    """returns true if this is a root id (only for v2 schema)"""
    if not isSchema2Id(id):
        raise ValueError("isRootObjId can only be used with v2 ids")
    validateUuid(id)  # will throw ValueError exception if not an objid
    if id[0] != "g":
        return False  # not a group
    token = getIdHexChars(id)
    # root ids will have last 16 chars rotated version of the first 16
    is_root = True
    for i in range(16):
        if token[i] != hexRot(token[i + 16]):
            is_root = False
            break
    return is_root


def getRootObjId(id):
    """returns root id for this objid if this is a root id
    (only for v2 schema)
    """
    if isRootObjId(id):
        return id  # this is the root id
    token = list(getIdHexChars(id))
    # root ids will have last 16 chars rotated version of the first 16
    for i in range(16):
        token[i + 16] = hexRot(token[i])
    token = "".join(token)
    root_id = "g-" + token[0:8] + "-" + token[8:16] + "-" + token[16:20]
    root_id += "-" + token[20:26] + "-" + token[26:32]

    return root_id


def createObjId(obj_type=None, root_id=None):
    """ create a new objid

    If obj_type is None, return just a bare uuid.
    Otherwise an HSDS v2 schema obj_id will be created.
    In this case obj_type should be one of "groups",
    "datasets", "datatypes", "chunks".  If root_id is
    None, a root group obj_id will be created.
    Otherwise the obj_id will be an id that has root_id as its root. """

    prefix = None
    if obj_type is None:
        # just return a regular uuid
        objid = str(uuid.uuid4())
    else:
        prefix = _getPrefixForCollection(obj_type)
        # schema v2
        salt = uuid.uuid4().hex
        # take a hash to randomize the uuid
        token = list(hashlib.sha256(salt.encode()).hexdigest())

        if root_id:
            # replace first 16 chars of token with first 16 chars of root id
            root_hex = getIdHexChars(root_id)
            token[0:16] = root_hex[0:16]
        else:
            if obj_type != "groups":
                raise ValueError("expected 'groups' obj_type for root group id")
            # use only 16 chars, but make it look like a 32 char id
            for i in range(16):
                token[16 + i] = hexRot(token[i])
        # format as a string
        token = "".join(token)
        objid = prefix + "-" + token[0:8] + "-" + token[8:16] + "-"
        objid += token[16:20] + "-" + token[20:26] + "-" + token[26:32]

    return objid


def getS3Key(id):
    """Return s3 key for given id.

    For schema v1:
        An md5 prefix is added to the front of the returned key to better
        distribute S3 objects.
    For schema v2:
        The id is converted to the pattern: "db/{rootid[0:16]}" for root ids and
        "db/id[0:16]/{prefix}/id[16:32]" for other ids.
        Chunk ids have the chunk index added after the slash:
        "db/id[0:16]/d/id[16:32]/x_y_z"

    For domain ids:
        Return a key with the .domain suffix and no preceding slash.
        For non-default buckets, use the format: <bucket_name>/s3_key
        If the id has a storage specifier ("s3://", "file://", etc.)
        include that along with the bucket name.  e.g.: "s3://mybucket/a_folder/a_file.h5"
    """

    base_id = _getBaseName(id)  # strip any s3://, etc.
    if base_id.find("/") > 0:
        # a domain id
        domain_suffix = ".domain.json"
        index = base_id.find("/") + 1
        key = base_id[index:]
        if not key.endswith(domain_suffix):
            if key[-1] != "/":
                key += "/"
            key += domain_suffix
    else:
        if isSchema2Id(id):
            # schema v2 id
            hexid = getIdHexChars(id)
            prefix = id[0]  # one of g, d, t, c
            if prefix not in ("g", "d", "t", "c"):
                raise ValueError(f"Unexpected id: {id}")

            if isRootObjId(id):
                key = f"db/{hexid[0:8]}-{hexid[8:16]}"
            else:
                partition = ""
                if prefix == "c":
                    # use 'd' so that chunks will show up under their dataset
                    s3col = "d"
                    n = id.find("-")
                    if n > 1:
                        # extract the partition index if present
                        partition = "p" + id[1:n]
                else:
                    s3col = prefix
                key = f"db/{hexid[0:8]}-{hexid[8:16]}/{s3col}/{hexid[16:20]}"
                key += f"-{hexid[20:26]}-{hexid[26:32]}"
            if prefix == "c":
                if partition:
                    key += "/"
                    key += partition
                # add the chunk coordinate
                index = id.index("_")  # will raise ValueError if not found
                n = index + 1
                coord = id[n:]
                key += "/"
                key += coord
            elif prefix == "g":
                # add key suffix for group
                key += "/.group.json"
            elif prefix == "d":
                # add key suffix for dataset
                key += "/.dataset.json"
            else:
                # add key suffix for datatype
                key += "/.datatype.json"
        else:
            # schema v1 id
            # prepend an md5 hash so keys distribute across S3 partitions
            idhash = getIdHash(id)
            key = f"{idhash}-{id}"

    return key


def getObjId(s3key):
    """Return object id given valid s3key"""
    if (
        len(s3key) >= 44
        and s3key[0:5].isalnum()
        and s3key[5] == "-"
        and s3key[6] in ("g", "d", "c", "t")
    ):
        # v1 obj keys
        objid = s3key[6:]
    elif s3key.endswith("/.domain.json"):
        objid = "/" + s3key[: -(len("/.domain.json"))]
    elif s3key.startswith("db/"):
        # schema v2 object key
        parts = s3key.split("/")
        chunk_coord = ""  # used only for chunk ids
        partition = ""  # used only for partitioned chunk ids
        token = []
        for ch in parts[1]:
            if ch != "-":
                token.append(ch)

        if len(parts) == 3:
            # root id
            # last part should be ".group.json"
            if parts[2] != ".group.json":
                raise ValueError(f"unexpected S3Key: {s3key}")
            # add 16 more chars using rotated version of first 16
            for i in range(16):
                token.append(hexRot(token[i]))
            prefix = "g"
        elif len(parts) == 5:
            # group, dataset, datatype, or chunk
            for ch in parts[3]:
                if ch != "-":
                    token.append(ch)

            if parts[2] == "g" and parts[4] == ".group.json":
                prefix = "g"  # group json
            elif parts[2] == "t" and parts[4] == ".datatype.json":
                prefix = "t"  # datatype json
            elif parts[2] == "d":
                if parts[4] == ".dataset.json":
                    prefix = "d"  # dataset json
                else:
                    # chunk object
                    prefix = "c"
                    chunk_coord = "_" + parts[4]
            else:
                raise ValueError(f"unexpected S3Key: {s3key}")
        elif len(parts) == 6:
            # chunk key with partitioning
            for ch in parts[3]:
                if ch != "-":
                    token.append(ch)
            if parts[2][0] != "d":
                raise ValueError(f"unexpected S3Key: {s3key}")
            prefix = "c"
            partition = parts[4]
            if partition[0] != "p":
                raise ValueError(f"unexpected S3Key: {s3key}")
            partition = partition[1:]  # strip off the p
            chunk_coord = "_" + parts[5]
        else:
            raise ValueError(f"unexpected S3Key: {s3key}")

        token = "".join(token)
        objid = prefix + partition + "-" + token[0:8] + "-" + token[8:16]
        objid += "-" + token[16:20] + "-" + token[20:26] + "-"
        objid += token[26:32] + chunk_coord
    else:
        msg = f"unexpected S3Key: {s3key}"
        raise ValueError(msg)
    return objid


def isS3ObjKey(s3key):
    """ return True if this is a storage key """
    valid = False
    try:
        objid = getObjId(s3key)
        if objid:
            valid = True
    except KeyError:
        pass  # ignore
    except ValueError:
        pass  # ignore
    return valid


def getCollectionForId(obj_id):
    """return groups/datasets/datatypes based on id"""
    if not isinstance(obj_id, str):
        raise ValueError("invalid object id")
    collection = None
    if obj_id.startswith("g-"):
        collection = "groups"
    elif obj_id.startswith("d-"):
        collection = "datasets"
    elif obj_id.startswith("t-"):
        collection = "datatypes"
    else:
        raise ValueError("not a collection id")
    return collection


def validateUuid(id, obj_class=None):
    """ Verify the given id is a well-formed object identifier.
    The id may be:
        a bare uuid,
        an HSDS v1 schema id, or
        an HSDS v2 schema id.
    If set, obj_class can be one of "groups", "datasets", "datatypes"
    """
    if not isinstance(id, str):
        raise ValueError("Expected string type")
    if len(id) < UUID_LEN:
        raise ValueError("id is too short to be an object identifier")
    if len(id) == UUID_LEN:
        if obj_class:
            # expected a prefix
            raise ValueError(f"obj_id: {id} not valid for collection: {obj_class}")
    else:
        # does this have a v1 schema hash tag?
        # e.g.: "a49be-g-314d61b8-9954-11e6-a733-3c15c2da029e"
        if id[:5].isalnum() and id[5] == '-':
            id = id[6:]  # trim off the hash tag
        # validate prefix
        if id[0] not in ("g", "d", "t", "c"):
            raise ValueError("Unexpected prefix")
        if id[0] != "c" and id[1] != "-":
            # chunk ids may have a partition index following the c
            raise ValueError("Unexpected prefix")
        if obj_class is not None:
            obj_class = obj_class.lower()
            if id[0] != _getPrefixForCollection(obj_class):
                raise ValueError(f"unexpected object id {id} for collection: {obj_class}")
        if id[0] == "c":
            # trim the type char and any partition id
            n = id.find("-")
            if n == -1:
                raise ValueError("Invalid chunk id")

            # trim the chunk index for chunk ids
            m = id.find("_")
            if m == -1:
                raise ValueError("Invalid chunk id")
            n += 1
            id = "c-" + id[n:m]
        id = id[2:]
        if len(id) != UUID_LEN:
            # id should be 36 now
            raise ValueError("Unexpected id length")

    for ch in id:
        if ch.isalnum():
            continue
        if ch == "-":
            continue
        raise ValueError(f"Unexpected character in uuid: {ch}")


def isValidUuid(id, obj_class=None):
    try:
        validateUuid(id, obj_class)
        return True
    except ValueError:
        return False


def isValidChunkId(id):
    if not isValidUuid(id):
        return False
    if id[0] != "c":
        return False
    return True


def getClassForObjId(id):
    """return domains/chunks/groups/datasets/datatypes based on id"""
    if not isinstance(id, str):
        raise ValueError("Expected string type")
    if len(id) == 0:
        raise ValueError("Empty string")
    if id[0] == "/":
        return "domains"
    if isValidChunkId(id):
        return "chunks"
    else:
        return getCollectionForId(id)


def isObjId(id):
    """return true if uuid or domain"""
    if not isinstance(id, str) or len(id) == 0:
        return False
    if id.find("/") > 0:
        # domain id is any string in the form <bucket_name>/<domain_path>
        return True
    return isValidUuid(id)


def getUuidFromId(id):
    """strip off the type prefix ('g-', 'd-', or 't-')
    and return the uuid part"""
    if len(id) == UUID_LEN:
        # just a uuid
        return id
    elif len(id) == UUID_LEN + 2:
        # 'g-', 'd-', or 't-' prefix
        return id[2:]
    else:
        raise ValueError(f"Unexpected obj_id: {id}")
\ No newline at end of file
diff --git a/test/unit/objid_test.py b/test/unit/objid_test.py
new file mode 100755
index 0000000..7c02482
--- /dev/null
+++ b/test/unit/objid_test.py
@@ -0,0 +1,199 @@
##############################################################################
# Copyright by The HDF Group.                                                #
# All rights reserved.                                                       #
#                                                                            #
# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and     #
# Utilities.  The full HSDS copyright notice, including                     #
# terms governing use, modification, and redistribution, is contained in    #
# the file COPYING, which can be found at the root of the source code       #
# distribution tree.  If you do not have access to this file, you may       #
# request a copy from help@hdfgroup.org.                                    
# +############################################################################## +import unittest +import sys + +from h5json.objid import isRootObjId, isValidUuid, validateUuid +from h5json.objid import createObjId, getCollectionForId +from h5json.objid import isObjId, isS3ObjKey, getS3Key, getObjId, isSchema2Id + + +class IdUtilTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(IdUtilTest, self).__init__(*args, **kwargs) + # main + + def testCreateObjId(self): + id_len = 38 # 36 for uuid plus two for prefix ("g-", "d-") + ids = set() # we'll use this to verify we always get a unique id + # create just a plain uuid... + id = createObjId() + self.assertEqual(len(id) + 2, id_len) + # create a v2 root_id + root_id = createObjId(obj_type="groups") + self.assertEqual(len(root_id), id_len) + for obj_type in ("groups", "datasets", "datatypes", "chunks"): + for i in range(100): + id = createObjId(obj_type=obj_type, root_id=root_id) + self.assertEqual(len(id), id_len) + self.assertTrue(id[0] in ("g", "d", "t", "c")) + self.assertEqual(id[1], "-") + ids.add(id) + + self.assertEqual(len(ids), 400) + try: + createObjId(obj_type="bad_class") + self.assertTrue(False) # should throw exception + except ValueError: + pass # expected + + def testIsValidUuid(self): + group1_id = "g-314d61b8-9954-11e6-a733-3c15c2da029e" # orig schema + group2_id = "g-314d61b8-995411e6-a733-3c15c2-da029e" + root_id = "g-f9aaa28e-d42e10e5-7122-2a065c-a6986d" + dataset1_id = "d-4c48f3ae-9954-11e6-a3cd-3c15c2da029e" # orig schema + dataset2_id = "d-4c48f3ae-995411e6-a3cd-3c15c2-da029e" + ctype1_id = "t-8c785f1c-9953-11e6-9bc2-0242ac110005" # orig schema + ctype2_id = "t-8c785f1c-995311e6-9bc2-0242ac-110005" + chunk1_id = "c-8c785f1c-9953-11e6-9bc2-0242ac110005_7_2" # orig schema + chunk2_id = "c-8c785f1c-995311e6-9bc2-0242ac-110005_7_2" + domain_id = "mybucket/bob/mydata.h5" + s3_domain_id = "s3://mybucket/bob/mydata.h5" + file_domain_id = "file://mybucket/bob/mydata.h5" + azure_domain_id = "https://myaccount.blob.core.windows.net/mybucket/bob/mydata.h5" + valid_id_map = { + group1_id: "a49be-g-314d61b8-9954-11e6-a733-3c15c2da029e", + group2_id: "db/314d61b8-995411e6/g/a733-3c15c2-da029e/.group.json", + dataset1_id: "26928-d-4c48f3ae-9954-11e6-a3cd-3c15c2da029e", + dataset2_id: "db/4c48f3ae-995411e6/d/a3cd-3c15c2-da029e/.dataset.json", + ctype1_id: "5a9cf-t-8c785f1c-9953-11e6-9bc2-0242ac110005", + ctype2_id: "db/8c785f1c-995311e6/t/9bc2-0242ac-110005/.datatype.json", + chunk1_id: "dc4ce-c-8c785f1c-9953-11e6-9bc2-0242ac110005_7_2", + chunk2_id: "db/8c785f1c-995311e6/d/9bc2-0242ac-110005/7_2", + domain_id: "bob/mydata.h5/.domain.json", + s3_domain_id: "bob/mydata.h5/.domain.json", + file_domain_id: "bob/mydata.h5/.domain.json", + azure_domain_id: "bob/mydata.h5/.domain.json", } + + bad_ids = ("g-1e76d862", "/bob/mydata.h5") + + self.assertTrue(isValidUuid(group1_id)) + self.assertFalse(isSchema2Id(group1_id)) + self.assertTrue(isValidUuid(group1_id, obj_class="Group")) + self.assertTrue(isValidUuid(group1_id, obj_class="group")) + self.assertTrue(isValidUuid(group1_id, obj_class="groups")) + self.assertTrue(isSchema2Id(root_id)) + self.assertTrue(isValidUuid(root_id, obj_class="Group")) + self.assertTrue(isValidUuid(root_id, obj_class="group")) + self.assertTrue(isValidUuid(root_id, obj_class="groups")) + self.assertTrue(isRootObjId(root_id)) + self.assertTrue(isValidUuid(dataset1_id, obj_class="datasets")) + self.assertFalse(isSchema2Id(dataset1_id)) + self.assertTrue(isValidUuid(ctype1_id, 
obj_class="datatypes")) + self.assertFalse(isSchema2Id(ctype1_id)) + self.assertTrue(isValidUuid(chunk1_id, obj_class="chunks")) + self.assertFalse(isSchema2Id(chunk1_id)) + self.assertTrue(isValidUuid(group2_id)) + self.assertTrue(isSchema2Id(group2_id)) + self.assertTrue(isValidUuid(group2_id, obj_class="Group")) + self.assertTrue(isValidUuid(group2_id, obj_class="group")) + self.assertTrue(isValidUuid(group2_id, obj_class="groups")) + self.assertFalse(isRootObjId(group2_id)) + self.assertTrue(isValidUuid(dataset2_id, obj_class="datasets")) + self.assertTrue(isSchema2Id(dataset2_id)) + self.assertTrue(isValidUuid(ctype2_id, obj_class="datatypes")) + self.assertTrue(isSchema2Id(ctype2_id)) + self.assertTrue(isValidUuid(chunk2_id, obj_class="chunks")) + self.assertTrue(isSchema2Id(chunk2_id)) + validateUuid(group1_id) + try: + isRootObjId(group1_id) + self.assertTrue(False) + except ValueError: + # only works for v2 schema + pass # expected + + for item in valid_id_map: + self.assertTrue(isObjId(item)) + s3key = getS3Key(item) + self.assertTrue(s3key[0] != "/") + self.assertTrue(isS3ObjKey(s3key)) + expected = valid_id_map[item] + self.assertEqual(s3key, expected) + if item.find("/") > 0: + continue # bucket name gets lost when domain ids get converted to s3keys + objid = getObjId(s3key) + self.assertEqual(objid, item) + for item in bad_ids: + self.assertFalse(isValidUuid(item)) + self.assertFalse(isObjId(item)) + + def testGetCollection(self): + group_id = "g-314d61b8-9954-11e6-a733-3c15c2da029e" + dataset_id = "d-4c48f3ae-9954-11e6-a3cd-3c15c2da029e" + ctype_id = "t-8c785f1c-9953-11e6-9bc2-0242ac110005" + bad_id = "x-59647858-9954-11e6-95d2-3c15c2da029e" + self.assertEqual(getCollectionForId(group_id), "groups") + self.assertEqual(getCollectionForId(dataset_id), "datasets") + self.assertEqual(getCollectionForId(ctype_id), "datatypes") + try: + getCollectionForId(bad_id) + self.assertTrue(False) + except ValueError: + pass # expected + try: + getCollectionForId(None) + self.assertTrue(False) + except ValueError: + pass # expected + + def testSchema2Id(self): + root_id = createObjId("groups") + group_id = createObjId("groups", root_id=root_id) + dataset_id = createObjId("datasets", root_id=root_id) + ctype_id = createObjId("datatypes", root_id=root_id) + + self.assertEqual(getCollectionForId(root_id), "groups") + self.assertEqual(getCollectionForId(group_id), "groups") + self.assertEqual(getCollectionForId(dataset_id), "datasets") + self.assertEqual(getCollectionForId(ctype_id), "datatypes") + chunk_id = "c" + dataset_id[1:] + "_1_2" + chunk_partition_id = "c42-" + dataset_id[2:] + "_1_2" + + for id in (chunk_id, chunk_partition_id): + try: + getCollectionForId(id) + self.assertTrue(False) + except ValueError: + pass # expected + valid_ids = ( + group_id, + dataset_id, + ctype_id, + chunk_id, + chunk_partition_id, + root_id, + ) + s3prefix = getS3Key(root_id) + self.assertTrue(s3prefix.endswith("/.group.json")) + s3prefix = s3prefix[: -(len(".group.json"))] + for oid in valid_ids: + self.assertTrue(len(oid) >= 38) + parts = oid.split("-") + self.assertEqual(len(parts), 6) + self.assertTrue(oid[0] in ("g", "d", "t", "c")) + self.assertTrue(isSchema2Id(oid)) + if oid == root_id: + self.assertTrue(isRootObjId(oid)) + else: + self.assertFalse(isRootObjId(oid)) + + s3key = getS3Key(oid) + self.assertTrue(s3key.startswith(s3prefix)) + self.assertEqual(getObjId(s3key), oid) + self.assertTrue(isS3ObjKey(s3key)) + + +if __name__ == "__main__": + # setup test files + + unittest.main()
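For reviewers, a quick usage sketch of the new objid API (illustrative only; the variable names are arbitrary, and the top-level imports rely on the re-exports added to src/h5json/__init__.py above):

    from h5json import createObjId, getCollectionForId, getS3Key, getObjId, isObjId

    # with no obj_type, createObjId() returns a bare uuid4 string --
    # this is what hdf5db.py now uses in place of str(uuid.uuid1())
    plain_id = createObjId()

    # v2 schema ids: a root group id, then a dataset id parented under it
    root_id = createObjId(obj_type="groups")
    dset_id = createObjId(obj_type="datasets", root_id=root_id)
    assert getCollectionForId(dset_id) == "datasets"
    assert isObjId(dset_id)

    # v2 ids round-trip through their storage keys,
    # e.g. "db/<root hex>/d/<obj hex>/.dataset.json"
    s3key = getS3Key(dset_id)
    assert getObjId(s3key) == dset_id

The round-trip assertion mirrors what testSchema2Id checks: getS3Key and getObjId are inverses for v2 object ids (for domain ids the bucket name is dropped from the key, as noted in testIsValidUuid).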