diff --git a/docs/source/cuda/memory.rst b/docs/source/cuda/memory.rst
index 7b134fe5d9e..1b434b962d0 100644
--- a/docs/source/cuda/memory.rst
+++ b/docs/source/cuda/memory.rst
@@ -162,11 +162,15 @@ unlike traditional dynamic memory management.
 
    Allocate a local array of the given *shape* and *type* on the device.
    *shape* is either an integer or a tuple of integers representing the array's
-   dimensions and must be a simple constant expression. *type* is a
-   :ref:`Numba type <numba-types>` of the elements needing to be stored in the
-   array. The array is private to the current thread. An array-like object is
-   returned which can be read and written to like any standard array
-   (e.g. through indexing).
+   dimensions and must be a simple constant expression. *type* is a :ref:`Numba
+   type <numba-types>` of the elements needing to be stored in the array. The
+   array is private to the current thread. An array-like object is returned
+   which can be read and written to like any standard array (e.g. through
+   indexing).
+
+   .. seealso:: The Local Memory section of `Device Memory Accesses
+      <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses>`_
+      in the CUDA programming guide.
 
 Constant memory
 ===============
diff --git a/numba/core/errors.py b/numba/core/errors.py
index 7f52311ec2a..b69321cd332 100644
--- a/numba/core/errors.py
+++ b/numba/core/errors.py
@@ -349,7 +349,7 @@ def termcolor():
 Unsupported functionality was found in the code Numba was trying to compile.
 
 If this functionality is important to you please file a feature request at:
-https://github.com/numba/numba/issues/new
+https://github.com/numba/numba/issues/new?template=feature_request.md
 """
 
 interpreter_error_info = """
@@ -359,7 +359,8 @@ def termcolor():
 environment variable to non-zero, and then rerun the code).
 
 If the code is valid and the unsupported functionality is important to you
-please file a feature request at: https://github.com/numba/numba/issues/new
+please file a feature request at:
+https://github.com/numba/numba/issues/new?template=feature_request.md
 
 To see Python/NumPy features supported by the latest release of Numba visit:
 https://numba.pydata.org/numba-doc/latest/reference/pysupported.html
@@ -376,7 +377,8 @@ def termcolor():
 https://numba.pydata.org/numba-doc/latest/reference/pysupported.html?highlight=exceptions#constructs
 
 If the code is valid and the unsupported functionality is important to you
-please file a feature request at: https://github.com/numba/numba/issues/new
+please file a feature request at:
+https://github.com/numba/numba/issues/new?template=feature_request.md
 
 If you think your code should work with Numba. %s
 """ % feedback_details
@@ -395,7 +397,7 @@ def termcolor():
 
 If you think your code should work with Numba, please report the error message
 and traceback, along with a minimal reproducer at:
-https://github.com/numba/numba/issues/new
+https://github.com/numba/numba/issues/new?template=bug_report.md
 """
 
 reportable_issue_info = """
diff --git a/numba/cuda/kernels/device/helper.py b/numba/cuda/kernels/device/helper.py
new file mode 100644
index 00000000000..3ff059b3d7e
--- /dev/null
+++ b/numba/cuda/kernels/device/helper.py
@@ -0,0 +1,355 @@
+"""Helper functions for distance matrix computations."""
+import os
+from math import sqrt
+from numba import jit
+from time import time
+
+inline = os.environ.get("INLINE", "never")
+
+
+class Timer(object):
+    """
+    Simple timer class.
+
+    https://stackoverflow.com/a/5849861/13697228
+    Usage
+    -----
+    with Timer("description"):
+        # do stuff
+    """
+
+    def __init__(self, name=None):
+        self.name = name
+
+    def __enter__(self):
+        """Enter the timer."""
+        self.tstart = time()
+
+    def __exit__(self, type, value, traceback):
+        """Exit the timer."""
+        if self.name:
+            print("[%s]" % self.name,)
+        print(("Elapsed: {}\n").format(round((time() - self.tstart), 5)))
+
+
+@jit(inline=inline)
+def copy(a, out):
+    """
+    Copy a into out.
+
+    Parameters
+    ----------
+    a : vector
+        values to copy.
+    out : vector
+        values to copy into.
+
+    Returns
+    -------
+    None.
+
+    """
+    for i in range(len(a)):
+        i = int(i)
+        out[i] = a[i]
+
+
+@jit(inline=inline)
+def insertionSort(arr):
+    """
+    Perform insertion sorting on a vector.
+
+    Source: https://www.geeksforgeeks.org/insertion-sort/
+
+    Parameters
+    ----------
+    arr : 1D array
+        Array to be sorted in-place.
+
+    Returns
+    -------
+    None.
+
+    """
+    for i in range(1, len(arr)):
+        key = arr[i]
+
+        # Move elements of arr[0..i-1], that are greater than key, to one
+        # position ahead of their current position
+        j = i - 1
+        while j >= 0 and key < arr[j]:
+            arr[j + 1] = arr[j]
+            j -= 1
+        arr[j + 1] = key
+
+
+@jit(inline=inline)
+def insertionArgSort(arr, ids):
+    """
+    Perform insertion sorting on a vector and track the sorting indices.
+
+    Source: https://www.geeksforgeeks.org/insertion-sort/
+
+    Parameters
+    ----------
+    arr : 1D array
+        Array to be sorted in-place.
+
+    ids : 1D array
+        Initialized array to fill with sorting indices.
+
+    Returns
+    -------
+    None.
+
+    """
+    # fill ids
+    for i in range(len(arr)):
+        ids[i] = i
+
+    for i in range(1, len(arr)):
+        key = arr[i]
+        id_key = ids[i]
+
+        # Move elements of arr[0..i-1], that are greater than key, to one
+        # position ahead of their current position
+        j = i - 1
+        while j >= 0 and key < arr[j]:
+            arr[j + 1] = arr[j]
+            ids[j + 1] = ids[j]
+            j -= 1
+        arr[j + 1] = key
+        ids[j + 1] = id_key
+
+
+@jit(inline=inline)
+def concatenate(vec, vec2, out):
+    """
+    Concatenate two vectors.
+
+    It also reassigns the values of out back into vec and vec2 to prevent odd
+    roundoff issues.
+
+    Parameters
+    ----------
+    vec : vector
+        First vector to concatenate.
+    vec2 : vector
+        Second vector to concatenate.
+    out : vector
+        Initialization of concatenated vector.
+
+    Returns
+    -------
+    None.
+
+    """
+    # number of elements
+    n = len(vec)
+    n2 = len(vec2)
+    # populate out
+    for i in range(n):
+        out[i] = vec[i]
+    for i in range(n2):
+        out[i + n] = vec2[i]
+
+
+@jit(inline=inline)
+def diff(vec, out):
+    """
+    Compute gradient.
+
+    Parameters
+    ----------
+    vec : vector of floats
+        The vector for which to calculate the gradient.
+    out : vector of floats
+        Initialization of output vector (one less element than vec) which
+        contains the gradient.
+
+    Returns
+    -------
+    None.
+
+    """
+    # number of elements
+    n = len(vec) - 1
+    # progressively add next elements
+    for i in range(n):
+        out[i] = vec[i + 1] - vec[i]
+
+# maybe there's a faster implementation somewhere
+
+
+@jit(inline=inline)
+def bisect_right(a, v, ids):
+    """
+    Return indices where to insert items in v in list a, assuming a is sorted.
+
+    Parameters
+    ----------
+    arr : 1D array
+        Array for which to perform a binary search.
+    v : 1D array
+        Values to perform the binary search for.
+    ids : 1D array
+        Initialized list of indices, same size as v.
+
+    Returns
+    -------
+    None.
+
+    The return value i is such that all e in a[:i] have e <= x, and all e in
+    a[i:] have e > x.  So if x already appears in the list, a.insert(x) will
+    insert just after the rightmost x already there. Optional args lo
+    (default 0) and hi (default len(a)) bound the slice of a to be searched.
+
+    Source: modified from
+    https://github.com/python/cpython/blob/43bab0537ceb6e2ca3597f8f3a3c79733b897434/Lib/bisect.py#L15-L35 # noqa
+    """
+    n = len(a)
+    n2 = len(v)
+    for i in range(n2):
+        lo = 0
+        mid = 0
+        hi = n
+        x = v[i]
+        while lo < hi:
+            mid = (lo + hi) // 2
+            # Use __lt__ to match the logic in list.sort() and in heapq
+            if x < a[mid]:
+                hi = mid
+            else:
+                lo = mid + 1
+        ids[i] = lo
+
+
+@jit(inline=inline)
+def sort_by_indices(v, ids, out):
+    """
+    Sort v by ids and assign to out, as in out = v[ids].
+
+    Parameters
+    ----------
+    v : vector
+        values to sort.
+    ids : vector
+        indices to use for sorting.
+    out : vector
+        preallocated vector to put sorted values into.
+
+    Returns
+    -------
+    None.
+
+    """
+    for i in range(len(ids)):
+        idx = ids[i]
+        out[i] = v[idx]
+
+
+@jit(inline=inline)
+def cumsum(vec, out):
+    """
+    Return the cumulative sum of the elements in a vector.
+
+    Referenced https://stackoverflow.com/a/15889203/13697228
+
+    Parameters
+    ----------
+    vec : 1D array
+        Vector for which to compute the cumulative sum.
+
+    out: 1D array
+        Initialized vector which stores the cumulative sum.
+
+    Returns
+    -------
+    out : 1D array
+        The cumulative sum of elements in vec (same shape as vec).
+
+    """
+    # number of elements
+    n = len(vec)
+    # initialize
+    total = 0.0
+    # progressively add next elements
+    for i in range(n):
+        total += vec[i]
+        out[i] = total
+
+
+@jit(inline=inline)
+def divide(v, b, out):
+    """
+    Divide a vector by a scalar.
+
+    Parameters
+    ----------
+    v : vector
+        vector numerator.
+    b : float
+        scalar denominator.
+    out : vector
+        initialized vector to populate with divided values.
+
+    Returns
+    -------
+    None.
+
+    """
+    for i in range(len(v)):
+        out[i] = v[i] / b
+
+
+@jit(inline=inline)
+def integrate(u_cdf, v_cdf, deltas, p):
+    """
+    Integrate between two vectors using a p-norm.
+
+    Parameters
+    ----------
+    u_cdf : numeric vector
+        First CDF.
+    v_cdf : numeric vector
+        Second CDF of same length as a.
+    deltas : numeric vector
+        The differences between the original vectors a and b.
+    p : int, optional
+        The finite power for which to compute the p-norm. The default is 2.
+
+    If p == 1 or p == 2, use of power is avoided, which introduces an overhead
+    # of about 15%
+    Source: https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L8404-L8488 # noqa
+    However, without support for math.square or np.square (2021-08-19),
+    math.pow is required for p == 2.
+
+    Returns
+    -------
+    out : numeric scalar
+        The p-norm of a and b.
+
+    """
+    n = len(u_cdf)
+    out = 0.0
+    if p == 1:
+        for i in range(n):
+            out += abs(u_cdf[i] - v_cdf[i]) * deltas[i]
+    elif p == 2:
+        for i in range(n):
+            out += ((u_cdf[i] - v_cdf[i]) ** p) * deltas[i]
+            out = sqrt(out)
+    elif p > 2:
+        for i in range(n):
+            out += (u_cdf[i] - v_cdf[i]) ** p * deltas[i]
+            out = out ** (1 / p)
+    return out
+
+
+# %% CODE GRAVEYARD
+# from numba.core import config
+# config_keys = dir(config)
+# if "INLINE" in config_keys:
+#     inline = config.INLINE
+# else:
+#     inline = "never"
diff --git a/numba/cuda/kernels/dist_matrix.py b/numba/cuda/kernels/dist_matrix.py
new file mode 100644
index 00000000000..df96846c7f6
--- /dev/null
+++ b/numba/cuda/kernels/dist_matrix.py
@@ -0,0 +1,777 @@
+"""
+Compute a distance matrix for various cases.
+
+Cases:
+    within a single set of vectors (like pdist)
+    between two sets of vectors (like cdist)
+    between prespecified pairs (i.e. sparse) for either one or two sets of
+        vectors.
+
+Various distance metrics are available.
+"""
+import numba.cuda.kernels.device.helper as hp
+import os
+import numpy as np
+from math import sqrt, ceil
+
+# CUDA Simulator not working
+# os.environ["NUMBA_ENABLE_CUDASIM"] = "1"
+# os.environ["NUMBA_CUDA_DEBUGINFO"] = "1"
+
+from numba import cuda, jit, prange  # noqa
+from numba.cuda.testing import unittest, CUDATestCase  # noqa
+from numba.types import int32, float32, int64, float64  # noqa
+# HACK from numba.cuda.kernels.device.myjit import get_myjit
+# from numba.cuda.cudadrv.driver import driver
+# TPB = driver.get_device().MAX_THREADS_PER_BLOCK # max TPB causes crash..
+
+inline = os.environ.get("INLINE", "never")
+fastmath = bool(os.environ.get("FASTMATH", "1"))
+cols = os.environ.get("COLUMNS")
+USE_64 = bool(os.environ.get("USE_64", "0"))
+target = os.environ.get("TARGET", "cuda")
+
+if USE_64 is None:
+    USE_64 = False
+if USE_64:
+    bits = 64
+    nb_float = float64
+    nb_int = int64
+    np_float = np.float64
+    np_int = np.int64
+else:
+    bits = 32
+    nb_float = float32
+    nb_int = int32
+    np_float = np.float32
+    np_int = np.int32
+os.environ["MACHINE_BITS"] = str(bits)
+
+if cols is not None:
+    cols = int(cols)
+    cols_plus_1 = cols + 1
+    tot_cols = cols * 2
+    tot_cols_minus_1 = tot_cols - 1
+else:
+    raise KeyError("For performance reasons and architecture constraints "
+                   "the number of columns of U (which is the same as V) "
+                   "must be defined as the environment variable, COLUMNS, "
+                   "via e.g. `os.environ[\"COLUMNS\"] = \"100\"`.")
+
+# override types
+if target == "cpu":
+    nb_float = np_float
+    nb_int = np_int
+
+# HACK
+# a "hacky" way to get compatibility between @njit and @cuda.jit
+# myjit = get_myjit(target=target, inline=inline)
+# mycudajit = get_myjit(device=False, target=target, inline=inline)
+
+# local array shapes needs to be defined globally due to lack of dynamic
+# array allocation support. Don't wrap with np.int32, etc. see
+# https://github.com/numba/numba/issues/7314
+TPB = 16
+
+# np.savetxt('100-zeros.csv',
+#           np.zeros(tmp, dtype=np.int32),
+#           delimiter=",")
+# OK to take cols from .shape
+# cols = np.genfromtxt('100-zeros.csv',
+#                      dtype=np.int32,
+#                      delimiter=",").shape[0]
+
+
+@ jit(inline=inline)
+def cdf_distance(u, v, u_weights, v_weights, p, presorted, cumweighted, prepended):  # noqa
+    r""" # noqa
+    Compute distance between two 1D distributions :math:`u` and :math:`v`.
+
+    The respective CDFs are :math:`U` and :math:`V`, and the
+    statistical distance is defined as:
+    .. math::
+        l_p(u, v) = \left( \int_{-\infty}^{+\infty} |U-V|^p \right)^{1/p}
+    p is a positive parameter; p = 1 gives the Wasserstein distance,
+    p = 2 gives the energy distance.
+
+    Parameters
+    ----------
+    u_values, v_values : array_like
+        Values observed in the (empirical) distribution.
+    u_weights, v_weights : array_like
+        Weight for each value.
+        `u_weights` (resp. `v_weights`) must have the same length as
+        `u_values` (resp. `v_values`). If the weight sum differs from
+        1, it must still be positive and finite so that the weights can
+        be normalized to sum to 1.
+    p : scalar
+        positive parameter that determines the type of cdf distance.
+    presorted : bool
+        Whether u and v have been sorted already *and* u_weights and
+        v_weights have been sorted using the same indices used to sort
+        u and v, respectively.
+    cumweighted : bool
+        Whether u_weights and v_weights have been converted to their
+        cumulative weights via e.g. np.cumsum().
+    prepended : bool
+        Whether a zero has been prepended to accumated, sorted
+        u_weights and v_weights.
+
+    By setting presorted, cumweighted, *and* prepended to False, the
+    computationproceeds proceeds in the same fashion as _cdf_distance
+    from scipy.stats.
+
+    Returns
+    -------
+    distance : float
+        The computed distance between the distributions.
+
+    Notes
+    -----
+    The input distributions can be empirical, therefore coming from
+    samples whose values are effectively inputs of the function, or
+    they can be seen as generalized functions, in which case they are
+    weighted sums of Dirac delta functions located at the specified
+    values.
+
+    References
+    ----------
+    .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan,
+            Hoyer, Munos "The Cramer Distance as a Solution to Biased
+            Wasserstein Gradients" (2017). :arXiv:`1705.10743`.
+    """
+    # allocate local float arrays
+    # combined vector
+    uv = cuda.local.array(tot_cols, nb_float)
+    uv_deltas = cuda.local.array(tot_cols_minus_1, nb_float)
+
+    # CDFs
+    u_cdf = cuda.local.array(tot_cols_minus_1, nb_float)
+    v_cdf = cuda.local.array(tot_cols_minus_1, nb_float)
+
+    # allocate local int arrays
+    # CDF indices via binary search
+    u_cdf_indices = cuda.local.array(tot_cols_minus_1, nb_int)
+    v_cdf_indices = cuda.local.array(tot_cols_minus_1, nb_int)
+
+    u_cdf_sorted_cumweights = cuda.local.array(
+        tot_cols_minus_1, nb_float)
+    v_cdf_sorted_cumweights = cuda.local.array(
+        tot_cols_minus_1, nb_float)
+
+    # short-circuit
+    if presorted and cumweighted and prepended:
+        u_sorted = u
+        v_sorted = v
+
+        u_0_cumweights = u_weights
+        v_0_cumweights = v_weights
+
+    # sorting, accumulating, and prepending (for compatibility)
+    else:
+        # check arguments
+        if not presorted and (cumweighted or prepended):
+            raise ValueError(
+                "if cumweighted or prepended are True, then presorted cannot be False")  # noqa
+
+        if (not presorted or not cumweighted) and prepended:
+            raise ValueError(
+                "if prepended is True, then presorted and cumweighted must both be True")  # noqa
+
+        # sorting
+        if not presorted:
+            # local arrays
+            u_sorted = cuda.local.array(cols, np_float)
+            v_sorted = cuda.local.array(cols, np_float)
+
+            u_sorter = cuda.local.array(cols, nb_int)
+            v_sorter = cuda.local.array(cols, nb_int)
+
+            u_sorted_weights = cuda.local.array(cols, nb_float)
+            v_sorted_weights = cuda.local.array(cols, nb_float)
+
+            # local copy since quickArgSortIterative sorts in-place
+            hp.copy(u, u_sorted)
+            hp.copy(v, v_sorted)
+
+            # sorting
+            hp.insertionArgSort(u_sorted, u_sorter)
+            hp.insertionArgSort(v_sorted, v_sorter)
+
+            # inplace to avoid extra cuda local array
+            hp.sort_by_indices(u_weights, u_sorter, u_sorted_weights)
+            hp.sort_by_indices(u_weights, u_sorter, v_sorted_weights)
+
+        # cumulative weights
+        if not cumweighted:
+            # local arrays
+            u_cumweights = cuda.local.array(cols, nb_float)
+            v_cumweights = cuda.local.array(cols, nb_float)
+            # accumulate
+            hp.cumsum(u_sorted_weights, u_cumweights)
+            hp.cumsum(v_sorted_weights, v_cumweights)
+
+        # prepend weights with zero
+        if not prepended:
+            zero = cuda.local.array(1, nb_float)
+
+            u_0_cumweights = cuda.local.array(
+                cols_plus_1, nb_float)
+            v_0_cumweights = cuda.local.array(
+                cols_plus_1, nb_float)
+
+            hp.concatenate(zero, u_cumweights, u_0_cumweights)
+            hp.concatenate(zero, v_cumweights, v_0_cumweights)
+
+    # concatenate u and v into uv
+    hp.concatenate(u_sorted, v_sorted, uv)
+
+    # sorting
+    # quickSortIterative(uv, uv_stack)
+    hp.insertionSort(uv)
+
+    # Get the respective positions of the values of u and v among the
+    # values of both distributions. See also np.searchsorted
+    hp.bisect_right(u_sorted, uv[:-1], u_cdf_indices)
+    hp.bisect_right(v_sorted, uv[:-1], v_cdf_indices)
+
+    # empirical CDFs
+    hp.sort_by_indices(u_0_cumweights, u_cdf_indices,
+                       u_cdf_sorted_cumweights)
+    hp.divide(u_cdf_sorted_cumweights, u_0_cumweights[-1], u_cdf)
+
+    hp.sort_by_indices(v_0_cumweights, v_cdf_indices,
+                       v_cdf_sorted_cumweights)
+    hp.divide(v_cdf_sorted_cumweights, v_0_cumweights[-1], v_cdf)
+
+    # # Integration
+    hp.diff(uv, uv_deltas)  # See also np.diff
+
+    out = hp.integrate(u_cdf, v_cdf, uv_deltas, p)
+
+    return out
+
+
+@ jit(inline=inline)
+def wasserstein_distance(u, v, u_weights, v_weights, presorted, cumweighted, prepended):  # noqa
+    r"""
+    Compute the first Wasserstein distance between two 1D distributions.
+
+    This distance is also known as the earth mover's distance, since it can be
+    seen as the minimum amount of "work" required to transform :math:`u` into
+    :math:`v`, where "work" is measured as the amount of distribution weight
+    that must be moved, multiplied by the distance it has to be moved.
+
+    Source
+    ------
+    https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L8245-L8319 # noqa
+
+    Parameters
+    ----------
+    u_values, v_values : array_like
+        Values observed in the (empirical) distribution.
+    u_weights, v_weights : array_like, optional
+        Weight for each value. If unspecified, each value is assigned the same
+        weight.
+        `u_weights` (resp. `v_weights`) must have the same length as
+        `u_values` (resp. `v_values`). If the weight sum differs from 1, it
+        must still be positive and finite so that the weights can be normalized
+        to sum to 1.
+
+    Returns
+    -------
+    distance : float
+        The computed distance between the distributions.
+
+    Notes
+    -----
+    The input distributions can be empirical, therefore coming from samples
+    whose values are effectively inputs of the function, or they can be seen as
+    generalized functions, in which case they are weighted sums of Dirac delta
+    functions located at the specified values.
+
+    References
+    ----------
+    .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer,
+           Munos "The Cramer Distance as a Solution to Biased Wasserstein
+           Gradients" (2017). :arXiv:`1705.10743`.
+    """
+    return cdf_distance(u, v, u_weights, v_weights, np_int(1), presorted, cumweighted, prepended)  # noqa
+
+
+@ jit(inline=inline)
+def euclidean_distance(a, b):
+    """
+    Calculate Euclidean distance between vectors a and b.
+
+    Parameters
+    ----------
+    a : 1D array
+        First vector.
+    b : 1D array
+        Second vector.
+
+    Returns
+    -------
+    d : numeric scalar
+        Euclidean distance between vectors a and b.
+    """
+    d = 0
+    for i in range(len(a)):
+        d += (b[i] - a[i]) ** 2
+    d = sqrt(d)
+    return d
+
+
+@ jit(inline=inline)
+def compute_distance(u, v, u_weights, v_weights, metric_num):
+    """
+    Calculate weighted distance between two vectors, u and v.
+
+    Parameters
+    ----------
+    u : 1D array of float
+        First vector.
+    v : 1D array of float
+        Second vector.
+    u_weights : 1D array of float
+        Weights for u.
+    v_weights : 1D array of float
+        Weights for v.
+    metric_num : int
+        Which metric to use (0 == "euclidean", 1=="wasserstein").
+
+    Raises
+    ------
+    NotImplementedError
+        "Specified metric is mispelled or has not been implemented yet.
+        If not implemented, consider submitting a pull request."
+
+    Returns
+    -------
+    d : float
+        Weighted distance between u and v.
+
+    """
+    if metric_num == 0:
+        d = euclidean_distance(u, v)
+    elif metric_num == 1:
+        # assume u and v are presorted, and weights are sorted by u and v
+        d = wasserstein_distance(
+            u, v, u_weights, v_weights, True, True, True)
+    else:
+        raise NotImplementedError(
+            "Specified metric is mispelled or has not been implemented yet. "
+            "If not implemented, consider submitting a pull request."
+        )
+    return d
+
+
+@cuda.jit("void(float{0}[:,:], float{0}[:,:], float{0}[:,:], float{0}[:,:], "
+          "int{0}[:,:], float{0}[:], int{0})".format(bits), fastmath=fastmath)
+def sparse_distance_matrix(U, V, U_weights, V_weights, pairs, out, metric_num):
+    """
+    Calculate sparse pairwise distances between two sets of vectors for pairs.
+
+    Parameters
+    ----------
+    mat : numeric cuda array
+        First set of vectors for which to compute a single pairwise distance.
+    mat2 : numeric cuda array
+        Second set of vectors for which to compute a single pairwise distance.
+    pairs : cuda array of 2-tuples
+        All pairs for which distances are to be computed.
+    out : numeric cuda array
+        The initialized array which will be populated with distances.
+
+    Raises
+    ------
+    ValueError
+        Both matrices should have the same number of columns.
+
+    Returns
+    -------
+    None.
+
+    """
+    k = cuda.grid(1)
+
+    npairs = pairs.shape[0]
+
+    if k < npairs:
+        pair = pairs[k]
+        # extract indices of the pair for which the distance will be computed
+        i, j = pair
+
+        u = U[i]
+        v = V[j]
+        uw = U_weights[i]
+        vw = V_weights[j]
+
+        out[k] = compute_distance(u, v, uw, vw, metric_num)
+
+
+@cuda.jit(
+    "void(float{0}[:,:], float{0}[:,:], float{0}[:,:], int{0})".format(bits),
+    fastmath=fastmath,
+)
+def one_set_distance_matrix(U, U_weights, out, metric_num):
+    """
+    Calculate pairwise distances within single set of vectors.
+
+    Parameters
+    ----------
+    U : 2D array of float
+        Vertically stacked vectors.
+    U_weights : 2D array of float
+        Vertically stacked weight vectors.
+    out : 2D array of float
+        Initialized matrix to populate with pairwise distances.
+    metric_num : int
+        Which metric to use (0 == "euclidean", 1=="wasserstein").
+
+    Returns
+    -------
+    None.
+
+    """
+    i, j = cuda.grid(2)
+
+    dm_rows = U.shape[0]
+    dm_cols = U.shape[0]
+
+    if i < j and i < dm_rows and j < dm_cols and i != j:
+        u = U[i]
+        v = U[j]
+        uw = U_weights[i]
+        vw = U_weights[j]
+        d = compute_distance(u, v, uw, vw, metric_num)
+        out[i, j] = d
+        out[j, i] = d
+
+
+# faster compilation *and* runtimes with explicit signature
+@cuda.jit("void(float{0}[:,:], float{0}[:,:], float{0}[:,:], float{0}[:,:], "
+          "float{0}[:,:], int{0})".format(bits), fastmath=fastmath)
+def two_set_distance_matrix(U, V, U_weights, V_weights, out, metric_num):
+    """
+    Calculate pairwise distances between two sets of vectors.
+
+    Parameters
+    ----------
+    U : 2D array of float
+        Vertically stacked vectors.
+    V : 2D array of float
+        Vertically stacked vectors.
+    U_weights : 2D array of float
+        Vertically stacked weight vectors.
+    V_weights : 2D array of float
+        Vertically stacked weight vectors.
+    out : 2D array of float
+        Pairwise distance matrix between two sets of vectors.
+    metric_num : int
+        Distance metric number {"euclidean": 0, "wasserstein": 1}.
+
+    Returns
+    -------
+    None.
+
+    """
+    i, j = cuda.grid(2)
+
+    # distance matrix shape
+    dm_rows = U.shape[0]
+    dm_cols = V.shape[0]
+
+    if i < dm_rows and j < dm_cols:
+        u = U[i]
+        v = V[j]
+        uw = U_weights[i]
+        vw = V_weights[j]
+        d = compute_distance(u, v, uw, vw, metric_num)
+        out[i, j] = d
+
+
+def dist_matrix(U,
+                V=None,
+                U_weights=None,
+                V_weights=None,
+                pairs=None,
+                metric="euclidean"):  # noqa
+    """
+    Compute distance matrices using Numba/CUDA.
+
+    Parameters
+    ----------
+    mat : array
+        First set of vectors for which to compute pairwise distances.
+
+    mat2 : array, optional
+        Second set of vectors for which to compute pairwise distances.
+        If not specified, then mat2 is a copy of mat.
+
+    pairs : array, optional
+        List of 2-tuples which contain the indices for which to compute
+        distances. If mat2 was specified, then the second index accesses
+        mat2 instead of mat. If not specified, then the pairs are
+        auto-generated. If mat2 was specified, all combinations of the two
+        vector sets are used. If mat2 isn't specified, then only the upper
+        triangle (minus diagonal) pairs are computed.
+
+    metric : str, optional
+        Possible options are 'euclidean', 'wasserstein'.
+        Defaults to Euclidean distance. These are converted to integers
+        internally due to Numba's lack of support for string arguments
+        (2021-08-14). See compute_distance() for other keys.
+
+        For example:
+            0 - 'euclidean'
+            1 - 'wasserstein'
+    target : str, optional
+        Which target to use: "cuda" or "cpu". Default is "cuda".
+    inline : str, optional
+        Whether to inline functions: "always" or "never". Default is "never".
+    fastmath : bool, optional
+        Whether to use fastmath or not. The default is True.
+    USE_64 : bool, optional
+        Whether to use 64 bit or 34 bit types. The default is False.
+
+    Raises
+    ------
+    ValueError
+        DESCRIPTION.
+    NotImplementedError
+        DESCRIPTION.
+
+    Returns
+    -------
+    out : array
+        A pairwise distance matrix, or if pairs are specified, then a
+        vector of distances corresponding to the pairs.
+
+    """
+    rows, cols_check = U.shape
+
+    if cols_check != cols:
+        raise KeyError("For performance reasons and architecture constraints "
+                       "the number of columns of U (which is the same as V) "
+                       "must be defined as the environment variable: COLUMNS. "
+                       "However, os.environ[\"COLUMNS\"] does not match the "
+                       "number of columns in U. Reset the environment variable "
+                       "via e.g. `os.environ[\"COLUMNS\"] = str(U.shape[0])` "
+                       "defined at the top of your script and make sure that "
+                       "dist_matrix is reloaded. You may also need to restart "
+                       "Python.")
+
+    if V is not None and cols is not V.shape[1]:
+        raise ValueError("The number of columns for U and V should match")
+
+    # is it a distance matrix between two sets of vectors?
+    # (rather than within a single set)
+    isXY = V is not None
+
+    # were pairs specified? (useful for sparse matrix generation)
+    pairQ = pairs is not None
+
+    # block, grid, and out shape
+    if pairQ:
+        block_dim = TPB
+        npairs = pairs.shape[0]
+        grid_dim = ceil(npairs / block_dim)
+    else:
+        block_dim = (TPB, TPB)
+        blockspergrid_x = ceil(rows / block_dim[0])
+        if isXY:
+            blockspergrid_y = ceil(V.shape[0] / block_dim[1])
+        else:
+            blockspergrid_y = ceil(rows / block_dim[1])
+        grid_dim = (blockspergrid_x, blockspergrid_y)
+
+    # CUDA setup
+    stream = cuda.stream()
+
+    # sorting and cumulative weights
+    if metric == "wasserstein":
+        # presort values (and weights by sorted value indices)
+        U_sorter = np.argsort(U)
+        U = np.take_along_axis(U, U_sorter, axis=-1)
+        U_weights = np.take_along_axis(U_weights, U_sorter, axis=-1)
+
+        # calculate cumulative weights
+        U_weights = np.cumsum(U_weights, axis=1)
+
+        # prepend a column of zeros
+        zero = np.zeros((U_weights.shape[0], 1))
+        U_weights = np.column_stack((zero, U_weights))
+
+        # do the same for V and V_weights
+        if isXY:
+            V_sorter = np.argsort(V)
+            V = np.take_along_axis(V, V_sorter, axis=-1)
+            V_weights = np.take_along_axis(V_weights, V_sorter, axis=-1)
+            V_weights = np.cumsum(V_weights, axis=1)
+            V_weights = np.column_stack((zero, V_weights))
+
+    # assign dummy arrays
+    if V is None:
+        V = np.zeros((2, 2))
+
+    if U_weights is None:
+        U_weights = np.zeros((2, 2))
+
+    if V_weights is None:
+        V_weights = np.zeros((2, 2))
+
+    if pairs is None:
+        pairs = np.zeros((2, 2))
+
+    # assign metric_num based on specified metric (no string support)
+    metric_dict = {"euclidean": 0, "wasserstein": 1}
+    metric_num = metric_dict[metric]
+
+    # # same for target_num
+    # target_dict = {"cuda": 0, "cpu": 1}
+    # target_num = target_dict[target]
+
+    m = U.shape[0]
+
+    if isXY:
+        m2 = V.shape[0]
+    else:
+        m2 = m
+
+    if pairQ:
+        shape = (npairs,)
+    else:
+        shape = (m, m2)
+
+    # values should be initialized instead of using cuda.device_array
+    out = np.zeros(shape)
+
+    if target == "cuda":
+        # copying/allocating to GPU
+        cU = cuda.to_device(np.asarray(
+            U, dtype=np_float), stream=stream)
+
+        cU_weights = cuda.to_device(np.asarray(
+            U_weights, dtype=np_float), stream=stream)
+
+        if isXY or pairQ:
+            cV = cuda.to_device(np.asarray(
+                V, dtype=np_float), stream=stream)
+            cV_weights = cuda.to_device(
+                np.asarray(V_weights, dtype=np_float), stream=stream
+            )
+
+        if pairQ:
+            cpairs = cuda.to_device(np.asarray(
+                pairs, dtype=np_int), stream=stream)
+
+        cuda_out = cuda.to_device(np.asarray(
+            out, dtype=np_float), stream=stream)
+
+    elif target == "cpu":
+        cU = U
+        cV = V
+        cU_weights = U_weights
+        cV_weights = V_weights
+        cpairs = pairs
+        cuda_out = out
+
+    # distance matrix between two sets of vectors
+    if isXY and not pairQ:
+        fn = two_set_distance_matrix
+        if target == "cuda":
+            fn = fn[grid_dim, block_dim]
+        fn(cU, cV, cU_weights, cV_weights, cuda_out, metric_num)
+
+    # specified pairwise distances within single set of vectors
+    elif not isXY and pairQ:
+        fn = sparse_distance_matrix
+        if target == "cuda":
+            fn = fn[grid_dim, block_dim]
+        fn(cU, cU, cU_weights, cU_weights, cpairs,
+           cuda_out, metric_num)
+
+    # distance matrix within single set of vectors
+    elif not isXY and not pairQ:
+        fn = one_set_distance_matrix
+        if target == "cuda":
+            fn = fn[grid_dim, block_dim]
+        fn(cU, cU_weights, cuda_out, metric_num)
+
+    # specified pairwise distances within single set of vectors
+    elif isXY and pairQ:
+        fn = sparse_distance_matrix
+        if target == "cuda":
+            fn = fn[grid_dim, block_dim]
+        fn(cU, cV, cU_weights, cV_weights, cpairs,
+           cuda_out, metric_num)
+
+    out = cuda_out.copy_to_host(stream=stream)
+
+    return out
+
+# %% Code Graveyard
+
+# u_stack = cuda.local.array(cols, nb_int)
+# v_stack = cuda.local.array(cols, nb_int)
+
+# copy(u_sorted, utmp)
+# copy(v_sorted, vtmp)
+
+# copy(u_weights, u_sorted_weights)
+# copy(v_weights, v_sorted_weights)
+
+# stack = [0] * size
+# ids = list(range(len(arr)))
+
+# u_sorted_weights = cuda.local.array(cols, nb_float)
+# v_sorted_weights = cuda.local.array(cols, nb_float)
+
+# sorting stack
+# uv_stack = cuda.local.array(tot_cols, nb_int)
+
+# def sort_cum_prepend(X, X_weights):
+#     # presort values (and weights by sorted value indices)
+#     X_sorter = np.argsort(X)
+#     X = np.take_along_axis(X, X_sorter, axis=-1)
+#     X_weights = np.take_along_axis(X_weights, X_sorter, axis=-1)
+#     # calculate cumulative weights
+#     X_weights = np.cumsum(X_weights)
+#     # prepend a column of zeros
+#     zero = np.zeros((X_weights.shape[0], 1))
+#     X_weights = np.column_stack((zero, X_weights))
+
+# def mean_squared_error(y, y_pred, squared=False):
+#     """
+#     Return mean squared error (MSE) without using sklearn.
+
+#     If squared == True, then return root mean squared error (RMSE).
+
+#     Parameters
+#     ----------
+#     y : 1D numeric array
+#         "True" or first set of values.
+#     y_pred : 1D numeric array
+#         "Predicted" or second set of values.
+
+#     Returns
+#     -------
+#     rmse : numeric scalar
+#         DESCRIPTION.
+
+#     """
+#     mse = np.mean((y - y_pred)**2)
+#     if squared is True:
+#         rmse = np.sqrt(mse)
+#         return rmse
+#     else:
+#         return mse
+#         return mse
+
+# opt = False
+# os.environ["NUMBA_CUDA_DEBUGINFO"] = "0"
+# opt = True
+# os.environ["NUMBA_BOUNDSCHECK"] = "0"
+# os.environ["OPT"] = "0"
diff --git a/numba/cuda/tests/cudapy/test_dist_matrix.py b/numba/cuda/tests/cudapy/test_dist_matrix.py
new file mode 100644
index 00000000000..4ad8f948a16
--- /dev/null
+++ b/numba/cuda/tests/cudapy/test_dist_matrix.py
@@ -0,0 +1,253 @@
+# -*- coding: utf-8 -*-
+"""Test distance matrix calculations using CUDA/Numba."""
+from scipy.spatial.distance import euclidean, cdist
+from scipy.stats import wasserstein_distance as scipy_wasserstein_distance
+from numpy.testing import assert_allclose
+import numpy as np
+import os
+from importlib import reload
+
+# os.environ["NUMBA_DISABLE_JIT"] = "1"
+
+# number of columns of U and V must be set as env var before import dist_matrix
+cols = 100
+os.environ["COLUMNS"] = str(cols)
+
+# other environment variables (set before importing dist_matrix)
+os.environ["USE_64"] = "0"
+os.environ["INLINE"] = "never"
+os.environ["FASTMATH"] = "1"
+os.environ["TARGET"] = "cuda"
+
+from numba.cuda.kernels.device.helper import Timer  # noqa
+from numba.cuda.testing import unittest, CUDATestCase  # noqa
+import numba.cuda.kernels.dist_matrix  # noqa
+
+# to overwrite env vars (source: https://stackoverflow.com/a/1254379/13697228)
+reload(numba.cuda.kernels.dist_matrix)
+dist_matrix = numba.cuda.kernels.dist_matrix.dist_matrix
+
+verbose_test = True
+
+
+class TestDistMat(CUDATestCase):
+    """Test distance matrix calculations on GPU for various metrics."""
+
+    def test_dist_matrix(self):
+        """
+        Loop through distance metrics and perform unit tests.
+
+        The four test cases are:
+            pairwise distances within a single set of vectors
+            pairwise distances between two sets of vectors
+            sparse pairwise distances within a single set of vectors
+            sparse pairwise distances between two sets of vectors
+
+        The ground truth for Euclidean comes from cdist.
+        The ground truth for Earth Mover's distance (1-Wasserstein) is via
+        a scipy.stats function.
+
+        Helper functions are used to generate test data and support the use of
+        Wasserstein distances in cdist.
+
+        Returns
+        -------
+        None.
+
+        """
+
+        def test_data(rows, cols):
+            """
+            Generate seeded test values and weights for two distributions.
+
+            Returns
+            -------
+            U : 2D array
+                Values of first distribution.
+            V : 2D array
+                Values of second distribution.
+            U_weights : 2D array
+                Weights of first distribution.
+            V_weights : 2D array
+                Weights of second distribution.
+
+            """
+            np.random.seed(42)
+            [U, V, U_weights, V_weights] = [
+                np.random.rand(rows, cols) for i in range(4)]
+            return U, V, U_weights, V_weights
+
+        def my_wasserstein_distance(u_uw, v_vw):
+            """
+            Return Earth Mover's distance using concatenated values and weights.
+
+            Parameters
+            ----------
+            u_uw : 1D numeric array
+                Horizontally stacked values and weights of first distribution.
+            v_vw : TYPE
+                Horizontally stacked values and weights of second distribution.
+
+            Returns
+            -------
+            distance : numeric scalar
+                Earth Mover's distance given two distributions.
+
+            """
+            # split into values and weights
+            n = len(u_uw)
+            i = n // 2
+            u = u_uw[0: i]
+            uw = u_uw[i: n]
+            v = v_vw[0: i]
+            vw = v_vw[i: n]
+            # calculate distance
+            distance = scipy_wasserstein_distance(
+                u, v, u_weights=uw, v_weights=vw)
+            return distance
+
+        def join_wasserstein(U, V, Uw, Vw):
+            """
+            Horizontally stack values and weights for each distribution.
+
+            Weights are added as additional columns to values.
+
+            Example:
+                u_uw, v_vw = join_wasserstein(u, v, uw, vw)
+                d = my_wasserstein_distance(u_uw, v_vw)
+                cdist(u_uw, v_vw, metric=my_wasserstein_distance)
+
+            Parameters
+            ----------
+            u : 1D or 2D numeric array
+                First set of distribution values.
+            v : 1D or 2D numeric array
+                Second set of values of distribution values.
+            uw : 1D or 2D numeric array
+                Weights for first distribution.
+            vw : 1D or 2D numeric array
+                Weights for second distribution.
+
+            Returns
+            -------
+            u_uw : 1D or 2D numeric array
+                Horizontally stacked values and weights of first distribution.
+            v_vw : TYPE
+                Horizontally stacked values and weights of second distribution.
+
+            """
+            U_Uw = np.concatenate((U, Uw), axis=1)
+            V_Vw = np.concatenate((V, Vw), axis=1)
+            return U_Uw, V_Vw
+
+        tol = 1e-5
+
+        # test and production data
+        pairs = np.array([(0, 1), (1, 2), (2, 3)])
+        i, j = pairs[0]
+
+        for testQ in [True, False]:
+            # large # of rows with testQ == True is slow due to use of
+            # cdist and non-jitted scipy-wasserstein
+            if testQ:
+                rows = 6
+            else:
+                rows = 1000
+            print("====[PARTIAL TEST WITH {} ROWS]====".format(rows))
+            U, V, U_weights, V_weights = test_data(rows, cols)
+            Utest, Vtest, Uwtest, Vwtest = [x[0:6] for x in [U, V, U_weights, V_weights]]  # noqa
+
+            for target in ["cuda"]:  # "cpu" not implemented
+                for metric in ['euclidean', 'wasserstein']:
+                    print('[' + target.upper() + "_" + metric.upper() + ']')
+
+                    if testQ:
+                        # compile
+                        dist_matrix(Utest, U_weights=Uwtest, metric=metric)
+                        with Timer("one set"):
+                            one_set = dist_matrix(
+                                U, U_weights=U_weights, metric=metric)  # noqa
+                            if verbose_test:
+                                print(one_set, '\n')
+
+                    # compile
+                    dist_matrix(Utest, V=Vtest, U_weights=Uwtest,
+                                V_weights=Vwtest, metric=metric)
+                    with Timer("two set"):
+                        two_set = dist_matrix(U, V=V, U_weights=U_weights,
+                                              V_weights=V_weights, metric=metric)  # noqa
+                        if testQ and verbose_test:
+                            print(two_set, '\n')
+
+                        one_set_sparse = dist_matrix(
+                            U, U_weights=U_weights, pairs=pairs, metric=metric)  # noqa
+                        if testQ and verbose_test:
+                            print(one_set_sparse, '\n')
+
+                    two_set_sparse = dist_matrix(
+                        U,
+                        V=V,
+                        U_weights=U_weights,
+                        V_weights=V_weights,
+                        pairs=pairs,
+                        metric=metric,
+                    )
+                    if testQ and verbose_test:
+                        print(two_set_sparse, '\n')
+
+                    if testQ:
+                        if metric == "euclidean":
+                            with Timer("one set check (cdist)"):
+                                one_set_check = cdist(U, U)
+                            with Timer("two set check (cdist)"):
+                                two_set_check = cdist(U, V)
+
+                            one_sparse_check = [
+                                euclidean(U[i], U[j]) for i, j in pairs]
+                            two_sparse_check = [
+                                euclidean(U[i], V[j]) for i, j in pairs]
+
+                        elif metric == "wasserstein":
+                            U_Uw, V_Vw = join_wasserstein(
+                                U, V, U_weights, V_weights)
+
+                            with Timer("one set check (cdist)"):
+                                one_set_check = cdist(
+                                    U_Uw, U_Uw, metric=my_wasserstein_distance)
+                            with Timer("two set check (cdist)"):
+                                two_set_check = cdist(
+                                    U_Uw, V_Vw, metric=my_wasserstein_distance)
+
+                            one_sparse_check = [my_wasserstein_distance(U_Uw[i], U_Uw[j])  # noqa
+                                                for i, j in pairs]
+                            two_sparse_check = [my_wasserstein_distance(U_Uw[i], V_Vw[j])  # noqa
+                                                for i, j in pairs]
+
+                        # check results
+                        assert_allclose(
+                            one_set.ravel(), one_set_check.ravel(), rtol=tol,
+                            err_msg="one set {} {} distance matrix inaccurate".format(target, metric))  # noqa
+                        assert_allclose(
+                            two_set.ravel(), two_set_check.ravel(), rtol=tol,
+                            err_msg="two set {} {} distance matrix inaccurate".format(target, metric))  # noqa
+                        assert_allclose(
+                            one_set_sparse, one_sparse_check, rtol=tol,
+                            err_msg="one set {} {} sparse distance matrix inaccurate".format(target, metric))  # noqa
+                        assert_allclose(
+                            two_set_sparse, two_sparse_check, rtol=tol,
+                            err_msg="two set {} {} distance matrix inaccurate".format(target, metric))  # noqa
+                    elif metric == "euclidean":
+                        with Timer("two set check (cdist)"):
+                            two_set_check = cdist(U, V)
+
+
+if __name__ == '__main__':
+    unittest.main()
+
+
+# TF = [
+#     np.allclose(one_set, one_set_check),
+#     np.allclose(two_set, two_set_check),
+#     np.allclose(one_set_sparse, one_sparse_check),
+#     np.allclose(two_set_sparse, two_sparse_check)
+#     ]
diff --git a/numba/cuda/tests/cudapy/test_helper.py b/numba/cuda/tests/cudapy/test_helper.py
new file mode 100644
index 00000000000..67383ca9093
--- /dev/null
+++ b/numba/cuda/tests/cudapy/test_helper.py
@@ -0,0 +1,83 @@
+"""
+Test helper functions for distance matrix calculations.
+"""
+import os
+import numpy as np
+from numba.cuda.testing import unittest, CUDATestCase
+import numba.cuda.kernels.device.helper as hp
+from numpy.testing import assert_allclose, assert_equal
+
+bits = int(os.environ.get("MACHINE_BITS", "32"))
+
+if bits == 32:
+    np_float = np.float32
+    np_int = np.float32
+elif bits == 64:
+    np_float = np.float64
+    np_int = np.float64
+
+tol = 1e-5
+
+
+class TestHelperFunctions(CUDATestCase):
+    def test_concatenate(self):
+        vec = np.random.rand(5)
+        vec2 = np.random.rand(10)
+        check = np.concatenate((vec, vec2))
+        out = np.zeros(len(vec) + len(vec2), dtype=np_float)
+        hp.concatenate(vec, vec2, out)
+
+        assert_allclose(out, check)
+        vec[0] = 100.0
+        assert_allclose(out, check)
+
+    def test_diff(self):
+        vec = np.random.rand(10)
+        out = np.zeros(len(vec) - 1)
+        hp.diff(vec, out)
+        check = np.diff(vec)
+        assert_equal(out, check)
+
+    def test_bisect_right(self):
+        a = np.random.rand(10)
+        a.sort()
+        v = np.random.rand(5)
+        ids = np.zeros_like(v[:-1], dtype=np_int)
+        hp.bisect_right(a, v[:-1], ids)
+        check = np.searchsorted(a, v[:-1], side="right")
+        assert_equal(ids, check)
+
+    def test_sort_by_indices(self):
+        v = np.random.rand(10)
+        ids = np.arange(len(v))
+        np.random.shuffle(ids)
+        out = np.zeros_like(v)
+        hp.sort_by_indices(v, ids, out)
+        check = v[ids]
+        assert_equal(out, check)
+
+        v = np.array([0.0, 0.5528931, 1.1455898, 1.5933731])
+        ids = np.array([1, 2, 3, 3, 3])
+        out = np.zeros_like(ids, dtype=np_float)
+        hp.sort_by_indices(v, ids, out)
+        check = v[ids]
+        assert_allclose(out, check)
+
+    def test_cumsum(self):
+        vec = np.random.rand(10)
+        out = np.zeros_like(vec, dtype=np_float)
+        hp.cumsum(vec, out)
+        check = np.cumsum(vec)
+        assert_allclose(out, check)
+
+    def test_divide(self):
+        v = np.random.rand(10)
+        b = 4.62
+        out = np.zeros_like(v, dtype=np_float)
+        hp.divide(v, b, out)
+        check = v / b
+        assert_allclose(out, check)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/numba/tests/test_help.py b/numba/tests/test_help.py
index a78f05f0a3c..8199d4c93c4 100644
--- a/numba/tests/test_help.py
+++ b/numba/tests/test_help.py
@@ -89,4 +89,4 @@ def test_inspect_cli(self):
         with self.assertRaises(subprocess.CalledProcessError) as raises:
             subprocess.check_output(cmds, stderr=subprocess.STDOUT)
         self.assertIn("\'foo\' is not supported",
-                      raises.exception.stdout.decode('ascii'))
+                      raises.exception.stdout.decode())