diff --git a/docs/source/cuda/memory.rst b/docs/source/cuda/memory.rst index 7b134fe5d9e..1b434b962d0 100644 --- a/docs/source/cuda/memory.rst +++ b/docs/source/cuda/memory.rst @@ -162,11 +162,15 @@ unlike traditional dynamic memory management. Allocate a local array of the given *shape* and *type* on the device. *shape* is either an integer or a tuple of integers representing the array's - dimensions and must be a simple constant expression. *type* is a - :ref:`Numba type ` of the elements needing to be stored in the - array. The array is private to the current thread. An array-like object is - returned which can be read and written to like any standard array - (e.g. through indexing). + dimensions and must be a simple constant expression. *type* is a :ref:`Numba + type ` of the elements needing to be stored in the array. The + array is private to the current thread. An array-like object is returned + which can be read and written to like any standard array (e.g. through + indexing). + + .. seealso:: The Local Memory section of `Device Memory Accesses + `_ + in the CUDA programming guide. Constant memory =============== diff --git a/numba/core/errors.py b/numba/core/errors.py index 7f52311ec2a..b69321cd332 100644 --- a/numba/core/errors.py +++ b/numba/core/errors.py @@ -349,7 +349,7 @@ def termcolor(): Unsupported functionality was found in the code Numba was trying to compile. If this functionality is important to you please file a feature request at: -https://github.com/numba/numba/issues/new +https://github.com/numba/numba/issues/new?template=feature_request.md """ interpreter_error_info = """ @@ -359,7 +359,8 @@ def termcolor(): environment variable to non-zero, and then rerun the code). If the code is valid and the unsupported functionality is important to you -please file a feature request at: https://github.com/numba/numba/issues/new +please file a feature request at: +https://github.com/numba/numba/issues/new?template=feature_request.md To see Python/NumPy features supported by the latest release of Numba visit: https://numba.pydata.org/numba-doc/latest/reference/pysupported.html @@ -376,7 +377,8 @@ def termcolor(): https://numba.pydata.org/numba-doc/latest/reference/pysupported.html?highlight=exceptions#constructs If the code is valid and the unsupported functionality is important to you -please file a feature request at: https://github.com/numba/numba/issues/new +please file a feature request at: +https://github.com/numba/numba/issues/new?template=feature_request.md If you think your code should work with Numba. %s """ % feedback_details @@ -395,7 +397,7 @@ def termcolor(): If you think your code should work with Numba, please report the error message and traceback, along with a minimal reproducer at: -https://github.com/numba/numba/issues/new +https://github.com/numba/numba/issues/new?template=bug_report.md """ reportable_issue_info = """ diff --git a/numba/cuda/kernels/device/helper.py b/numba/cuda/kernels/device/helper.py new file mode 100644 index 00000000000..3ff059b3d7e --- /dev/null +++ b/numba/cuda/kernels/device/helper.py @@ -0,0 +1,355 @@ +"""Helper functions for distance matrix computations.""" +import os +from math import sqrt +from numba import jit +from time import time + +inline = os.environ.get("INLINE", "never") + + +class Timer(object): + """ + Simple timer class. + + https://stackoverflow.com/a/5849861/13697228 + Usage + ----- + with Timer("description"): + # do stuff + """ + + def __init__(self, name=None): + self.name = name + + def __enter__(self): + """Enter the timer.""" + self.tstart = time() + + def __exit__(self, type, value, traceback): + """Exit the timer.""" + if self.name: + print("[%s]" % self.name,) + print(("Elapsed: {}\n").format(round((time() - self.tstart), 5))) + + +@jit(inline=inline) +def copy(a, out): + """ + Copy a into out. + + Parameters + ---------- + a : vector + values to copy. + out : vector + values to copy into. + + Returns + ------- + None. + + """ + for i in range(len(a)): + i = int(i) + out[i] = a[i] + + +@jit(inline=inline) +def insertionSort(arr): + """ + Perform insertion sorting on a vector. + + Source: https://www.geeksforgeeks.org/insertion-sort/ + + Parameters + ---------- + arr : 1D array + Array to be sorted in-place. + + Returns + ------- + None. + + """ + for i in range(1, len(arr)): + key = arr[i] + + # Move elements of arr[0..i-1], that are greater than key, to one + # position ahead of their current position + j = i - 1 + while j >= 0 and key < arr[j]: + arr[j + 1] = arr[j] + j -= 1 + arr[j + 1] = key + + +@jit(inline=inline) +def insertionArgSort(arr, ids): + """ + Perform insertion sorting on a vector and track the sorting indices. + + Source: https://www.geeksforgeeks.org/insertion-sort/ + + Parameters + ---------- + arr : 1D array + Array to be sorted in-place. + + ids : 1D array + Initialized array to fill with sorting indices. + + Returns + ------- + None. + + """ + # fill ids + for i in range(len(arr)): + ids[i] = i + + for i in range(1, len(arr)): + key = arr[i] + id_key = ids[i] + + # Move elements of arr[0..i-1], that are greater than key, to one + # position ahead of their current position + j = i - 1 + while j >= 0 and key < arr[j]: + arr[j + 1] = arr[j] + ids[j + 1] = ids[j] + j -= 1 + arr[j + 1] = key + ids[j + 1] = id_key + + +@jit(inline=inline) +def concatenate(vec, vec2, out): + """ + Concatenate two vectors. + + It also reassigns the values of out back into vec and vec2 to prevent odd + roundoff issues. + + Parameters + ---------- + vec : vector + First vector to concatenate. + vec2 : vector + Second vector to concatenate. + out : vector + Initialization of concatenated vector. + + Returns + ------- + None. + + """ + # number of elements + n = len(vec) + n2 = len(vec2) + # populate out + for i in range(n): + out[i] = vec[i] + for i in range(n2): + out[i + n] = vec2[i] + + +@jit(inline=inline) +def diff(vec, out): + """ + Compute gradient. + + Parameters + ---------- + vec : vector of floats + The vector for which to calculate the gradient. + out : vector of floats + Initialization of output vector (one less element than vec) which + contains the gradient. + + Returns + ------- + None. + + """ + # number of elements + n = len(vec) - 1 + # progressively add next elements + for i in range(n): + out[i] = vec[i + 1] - vec[i] + +# maybe there's a faster implementation somewhere + + +@jit(inline=inline) +def bisect_right(a, v, ids): + """ + Return indices where to insert items in v in list a, assuming a is sorted. + + Parameters + ---------- + arr : 1D array + Array for which to perform a binary search. + v : 1D array + Values to perform the binary search for. + ids : 1D array + Initialized list of indices, same size as v. + + Returns + ------- + None. + + The return value i is such that all e in a[:i] have e <= x, and all e in + a[i:] have e > x. So if x already appears in the list, a.insert(x) will + insert just after the rightmost x already there. Optional args lo + (default 0) and hi (default len(a)) bound the slice of a to be searched. + + Source: modified from + https://github.com/python/cpython/blob/43bab0537ceb6e2ca3597f8f3a3c79733b897434/Lib/bisect.py#L15-L35 # noqa + """ + n = len(a) + n2 = len(v) + for i in range(n2): + lo = 0 + mid = 0 + hi = n + x = v[i] + while lo < hi: + mid = (lo + hi) // 2 + # Use __lt__ to match the logic in list.sort() and in heapq + if x < a[mid]: + hi = mid + else: + lo = mid + 1 + ids[i] = lo + + +@jit(inline=inline) +def sort_by_indices(v, ids, out): + """ + Sort v by ids and assign to out, as in out = v[ids]. + + Parameters + ---------- + v : vector + values to sort. + ids : vector + indices to use for sorting. + out : vector + preallocated vector to put sorted values into. + + Returns + ------- + None. + + """ + for i in range(len(ids)): + idx = ids[i] + out[i] = v[idx] + + +@jit(inline=inline) +def cumsum(vec, out): + """ + Return the cumulative sum of the elements in a vector. + + Referenced https://stackoverflow.com/a/15889203/13697228 + + Parameters + ---------- + vec : 1D array + Vector for which to compute the cumulative sum. + + out: 1D array + Initialized vector which stores the cumulative sum. + + Returns + ------- + out : 1D array + The cumulative sum of elements in vec (same shape as vec). + + """ + # number of elements + n = len(vec) + # initialize + total = 0.0 + # progressively add next elements + for i in range(n): + total += vec[i] + out[i] = total + + +@jit(inline=inline) +def divide(v, b, out): + """ + Divide a vector by a scalar. + + Parameters + ---------- + v : vector + vector numerator. + b : float + scalar denominator. + out : vector + initialized vector to populate with divided values. + + Returns + ------- + None. + + """ + for i in range(len(v)): + out[i] = v[i] / b + + +@jit(inline=inline) +def integrate(u_cdf, v_cdf, deltas, p): + """ + Integrate between two vectors using a p-norm. + + Parameters + ---------- + u_cdf : numeric vector + First CDF. + v_cdf : numeric vector + Second CDF of same length as a. + deltas : numeric vector + The differences between the original vectors a and b. + p : int, optional + The finite power for which to compute the p-norm. The default is 2. + + If p == 1 or p == 2, use of power is avoided, which introduces an overhead + # of about 15% + Source: https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L8404-L8488 # noqa + However, without support for math.square or np.square (2021-08-19), + math.pow is required for p == 2. + + Returns + ------- + out : numeric scalar + The p-norm of a and b. + + """ + n = len(u_cdf) + out = 0.0 + if p == 1: + for i in range(n): + out += abs(u_cdf[i] - v_cdf[i]) * deltas[i] + elif p == 2: + for i in range(n): + out += ((u_cdf[i] - v_cdf[i]) ** p) * deltas[i] + out = sqrt(out) + elif p > 2: + for i in range(n): + out += (u_cdf[i] - v_cdf[i]) ** p * deltas[i] + out = out ** (1 / p) + return out + + +# %% CODE GRAVEYARD +# from numba.core import config +# config_keys = dir(config) +# if "INLINE" in config_keys: +# inline = config.INLINE +# else: +# inline = "never" diff --git a/numba/cuda/kernels/dist_matrix.py b/numba/cuda/kernels/dist_matrix.py new file mode 100644 index 00000000000..df96846c7f6 --- /dev/null +++ b/numba/cuda/kernels/dist_matrix.py @@ -0,0 +1,777 @@ +""" +Compute a distance matrix for various cases. + +Cases: + within a single set of vectors (like pdist) + between two sets of vectors (like cdist) + between prespecified pairs (i.e. sparse) for either one or two sets of + vectors. + +Various distance metrics are available. +""" +import numba.cuda.kernels.device.helper as hp +import os +import numpy as np +from math import sqrt, ceil + +# CUDA Simulator not working +# os.environ["NUMBA_ENABLE_CUDASIM"] = "1" +# os.environ["NUMBA_CUDA_DEBUGINFO"] = "1" + +from numba import cuda, jit, prange # noqa +from numba.cuda.testing import unittest, CUDATestCase # noqa +from numba.types import int32, float32, int64, float64 # noqa +# HACK from numba.cuda.kernels.device.myjit import get_myjit +# from numba.cuda.cudadrv.driver import driver +# TPB = driver.get_device().MAX_THREADS_PER_BLOCK # max TPB causes crash.. + +inline = os.environ.get("INLINE", "never") +fastmath = bool(os.environ.get("FASTMATH", "1")) +cols = os.environ.get("COLUMNS") +USE_64 = bool(os.environ.get("USE_64", "0")) +target = os.environ.get("TARGET", "cuda") + +if USE_64 is None: + USE_64 = False +if USE_64: + bits = 64 + nb_float = float64 + nb_int = int64 + np_float = np.float64 + np_int = np.int64 +else: + bits = 32 + nb_float = float32 + nb_int = int32 + np_float = np.float32 + np_int = np.int32 +os.environ["MACHINE_BITS"] = str(bits) + +if cols is not None: + cols = int(cols) + cols_plus_1 = cols + 1 + tot_cols = cols * 2 + tot_cols_minus_1 = tot_cols - 1 +else: + raise KeyError("For performance reasons and architecture constraints " + "the number of columns of U (which is the same as V) " + "must be defined as the environment variable, COLUMNS, " + "via e.g. `os.environ[\"COLUMNS\"] = \"100\"`.") + +# override types +if target == "cpu": + nb_float = np_float + nb_int = np_int + +# HACK +# a "hacky" way to get compatibility between @njit and @cuda.jit +# myjit = get_myjit(target=target, inline=inline) +# mycudajit = get_myjit(device=False, target=target, inline=inline) + +# local array shapes needs to be defined globally due to lack of dynamic +# array allocation support. Don't wrap with np.int32, etc. see +# https://github.com/numba/numba/issues/7314 +TPB = 16 + +# np.savetxt('100-zeros.csv', +# np.zeros(tmp, dtype=np.int32), +# delimiter=",") +# OK to take cols from .shape +# cols = np.genfromtxt('100-zeros.csv', +# dtype=np.int32, +# delimiter=",").shape[0] + + +@ jit(inline=inline) +def cdf_distance(u, v, u_weights, v_weights, p, presorted, cumweighted, prepended): # noqa + r""" # noqa + Compute distance between two 1D distributions :math:`u` and :math:`v`. + + The respective CDFs are :math:`U` and :math:`V`, and the + statistical distance is defined as: + .. math:: + l_p(u, v) = \left( \int_{-\infty}^{+\infty} |U-V|^p \right)^{1/p} + p is a positive parameter; p = 1 gives the Wasserstein distance, + p = 2 gives the energy distance. + + Parameters + ---------- + u_values, v_values : array_like + Values observed in the (empirical) distribution. + u_weights, v_weights : array_like + Weight for each value. + `u_weights` (resp. `v_weights`) must have the same length as + `u_values` (resp. `v_values`). If the weight sum differs from + 1, it must still be positive and finite so that the weights can + be normalized to sum to 1. + p : scalar + positive parameter that determines the type of cdf distance. + presorted : bool + Whether u and v have been sorted already *and* u_weights and + v_weights have been sorted using the same indices used to sort + u and v, respectively. + cumweighted : bool + Whether u_weights and v_weights have been converted to their + cumulative weights via e.g. np.cumsum(). + prepended : bool + Whether a zero has been prepended to accumated, sorted + u_weights and v_weights. + + By setting presorted, cumweighted, *and* prepended to False, the + computationproceeds proceeds in the same fashion as _cdf_distance + from scipy.stats. + + Returns + ------- + distance : float + The computed distance between the distributions. + + Notes + ----- + The input distributions can be empirical, therefore coming from + samples whose values are effectively inputs of the function, or + they can be seen as generalized functions, in which case they are + weighted sums of Dirac delta functions located at the specified + values. + + References + ---------- + .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, + Hoyer, Munos "The Cramer Distance as a Solution to Biased + Wasserstein Gradients" (2017). :arXiv:`1705.10743`. + """ + # allocate local float arrays + # combined vector + uv = cuda.local.array(tot_cols, nb_float) + uv_deltas = cuda.local.array(tot_cols_minus_1, nb_float) + + # CDFs + u_cdf = cuda.local.array(tot_cols_minus_1, nb_float) + v_cdf = cuda.local.array(tot_cols_minus_1, nb_float) + + # allocate local int arrays + # CDF indices via binary search + u_cdf_indices = cuda.local.array(tot_cols_minus_1, nb_int) + v_cdf_indices = cuda.local.array(tot_cols_minus_1, nb_int) + + u_cdf_sorted_cumweights = cuda.local.array( + tot_cols_minus_1, nb_float) + v_cdf_sorted_cumweights = cuda.local.array( + tot_cols_minus_1, nb_float) + + # short-circuit + if presorted and cumweighted and prepended: + u_sorted = u + v_sorted = v + + u_0_cumweights = u_weights + v_0_cumweights = v_weights + + # sorting, accumulating, and prepending (for compatibility) + else: + # check arguments + if not presorted and (cumweighted or prepended): + raise ValueError( + "if cumweighted or prepended are True, then presorted cannot be False") # noqa + + if (not presorted or not cumweighted) and prepended: + raise ValueError( + "if prepended is True, then presorted and cumweighted must both be True") # noqa + + # sorting + if not presorted: + # local arrays + u_sorted = cuda.local.array(cols, np_float) + v_sorted = cuda.local.array(cols, np_float) + + u_sorter = cuda.local.array(cols, nb_int) + v_sorter = cuda.local.array(cols, nb_int) + + u_sorted_weights = cuda.local.array(cols, nb_float) + v_sorted_weights = cuda.local.array(cols, nb_float) + + # local copy since quickArgSortIterative sorts in-place + hp.copy(u, u_sorted) + hp.copy(v, v_sorted) + + # sorting + hp.insertionArgSort(u_sorted, u_sorter) + hp.insertionArgSort(v_sorted, v_sorter) + + # inplace to avoid extra cuda local array + hp.sort_by_indices(u_weights, u_sorter, u_sorted_weights) + hp.sort_by_indices(u_weights, u_sorter, v_sorted_weights) + + # cumulative weights + if not cumweighted: + # local arrays + u_cumweights = cuda.local.array(cols, nb_float) + v_cumweights = cuda.local.array(cols, nb_float) + # accumulate + hp.cumsum(u_sorted_weights, u_cumweights) + hp.cumsum(v_sorted_weights, v_cumweights) + + # prepend weights with zero + if not prepended: + zero = cuda.local.array(1, nb_float) + + u_0_cumweights = cuda.local.array( + cols_plus_1, nb_float) + v_0_cumweights = cuda.local.array( + cols_plus_1, nb_float) + + hp.concatenate(zero, u_cumweights, u_0_cumweights) + hp.concatenate(zero, v_cumweights, v_0_cumweights) + + # concatenate u and v into uv + hp.concatenate(u_sorted, v_sorted, uv) + + # sorting + # quickSortIterative(uv, uv_stack) + hp.insertionSort(uv) + + # Get the respective positions of the values of u and v among the + # values of both distributions. See also np.searchsorted + hp.bisect_right(u_sorted, uv[:-1], u_cdf_indices) + hp.bisect_right(v_sorted, uv[:-1], v_cdf_indices) + + # empirical CDFs + hp.sort_by_indices(u_0_cumweights, u_cdf_indices, + u_cdf_sorted_cumweights) + hp.divide(u_cdf_sorted_cumweights, u_0_cumweights[-1], u_cdf) + + hp.sort_by_indices(v_0_cumweights, v_cdf_indices, + v_cdf_sorted_cumweights) + hp.divide(v_cdf_sorted_cumweights, v_0_cumweights[-1], v_cdf) + + # # Integration + hp.diff(uv, uv_deltas) # See also np.diff + + out = hp.integrate(u_cdf, v_cdf, uv_deltas, p) + + return out + + +@ jit(inline=inline) +def wasserstein_distance(u, v, u_weights, v_weights, presorted, cumweighted, prepended): # noqa + r""" + Compute the first Wasserstein distance between two 1D distributions. + + This distance is also known as the earth mover's distance, since it can be + seen as the minimum amount of "work" required to transform :math:`u` into + :math:`v`, where "work" is measured as the amount of distribution weight + that must be moved, multiplied by the distance it has to be moved. + + Source + ------ + https://github.com/scipy/scipy/blob/47bb6febaa10658c72962b9615d5d5aa2513fa3a/scipy/stats/stats.py#L8245-L8319 # noqa + + Parameters + ---------- + u_values, v_values : array_like + Values observed in the (empirical) distribution. + u_weights, v_weights : array_like, optional + Weight for each value. If unspecified, each value is assigned the same + weight. + `u_weights` (resp. `v_weights`) must have the same length as + `u_values` (resp. `v_values`). If the weight sum differs from 1, it + must still be positive and finite so that the weights can be normalized + to sum to 1. + + Returns + ------- + distance : float + The computed distance between the distributions. + + Notes + ----- + The input distributions can be empirical, therefore coming from samples + whose values are effectively inputs of the function, or they can be seen as + generalized functions, in which case they are weighted sums of Dirac delta + functions located at the specified values. + + References + ---------- + .. [1] Bellemare, Danihelka, Dabney, Mohamed, Lakshminarayanan, Hoyer, + Munos "The Cramer Distance as a Solution to Biased Wasserstein + Gradients" (2017). :arXiv:`1705.10743`. + """ + return cdf_distance(u, v, u_weights, v_weights, np_int(1), presorted, cumweighted, prepended) # noqa + + +@ jit(inline=inline) +def euclidean_distance(a, b): + """ + Calculate Euclidean distance between vectors a and b. + + Parameters + ---------- + a : 1D array + First vector. + b : 1D array + Second vector. + + Returns + ------- + d : numeric scalar + Euclidean distance between vectors a and b. + """ + d = 0 + for i in range(len(a)): + d += (b[i] - a[i]) ** 2 + d = sqrt(d) + return d + + +@ jit(inline=inline) +def compute_distance(u, v, u_weights, v_weights, metric_num): + """ + Calculate weighted distance between two vectors, u and v. + + Parameters + ---------- + u : 1D array of float + First vector. + v : 1D array of float + Second vector. + u_weights : 1D array of float + Weights for u. + v_weights : 1D array of float + Weights for v. + metric_num : int + Which metric to use (0 == "euclidean", 1=="wasserstein"). + + Raises + ------ + NotImplementedError + "Specified metric is mispelled or has not been implemented yet. + If not implemented, consider submitting a pull request." + + Returns + ------- + d : float + Weighted distance between u and v. + + """ + if metric_num == 0: + d = euclidean_distance(u, v) + elif metric_num == 1: + # assume u and v are presorted, and weights are sorted by u and v + d = wasserstein_distance( + u, v, u_weights, v_weights, True, True, True) + else: + raise NotImplementedError( + "Specified metric is mispelled or has not been implemented yet. " + "If not implemented, consider submitting a pull request." + ) + return d + + +@cuda.jit("void(float{0}[:,:], float{0}[:,:], float{0}[:,:], float{0}[:,:], " + "int{0}[:,:], float{0}[:], int{0})".format(bits), fastmath=fastmath) +def sparse_distance_matrix(U, V, U_weights, V_weights, pairs, out, metric_num): + """ + Calculate sparse pairwise distances between two sets of vectors for pairs. + + Parameters + ---------- + mat : numeric cuda array + First set of vectors for which to compute a single pairwise distance. + mat2 : numeric cuda array + Second set of vectors for which to compute a single pairwise distance. + pairs : cuda array of 2-tuples + All pairs for which distances are to be computed. + out : numeric cuda array + The initialized array which will be populated with distances. + + Raises + ------ + ValueError + Both matrices should have the same number of columns. + + Returns + ------- + None. + + """ + k = cuda.grid(1) + + npairs = pairs.shape[0] + + if k < npairs: + pair = pairs[k] + # extract indices of the pair for which the distance will be computed + i, j = pair + + u = U[i] + v = V[j] + uw = U_weights[i] + vw = V_weights[j] + + out[k] = compute_distance(u, v, uw, vw, metric_num) + + +@cuda.jit( + "void(float{0}[:,:], float{0}[:,:], float{0}[:,:], int{0})".format(bits), + fastmath=fastmath, +) +def one_set_distance_matrix(U, U_weights, out, metric_num): + """ + Calculate pairwise distances within single set of vectors. + + Parameters + ---------- + U : 2D array of float + Vertically stacked vectors. + U_weights : 2D array of float + Vertically stacked weight vectors. + out : 2D array of float + Initialized matrix to populate with pairwise distances. + metric_num : int + Which metric to use (0 == "euclidean", 1=="wasserstein"). + + Returns + ------- + None. + + """ + i, j = cuda.grid(2) + + dm_rows = U.shape[0] + dm_cols = U.shape[0] + + if i < j and i < dm_rows and j < dm_cols and i != j: + u = U[i] + v = U[j] + uw = U_weights[i] + vw = U_weights[j] + d = compute_distance(u, v, uw, vw, metric_num) + out[i, j] = d + out[j, i] = d + + +# faster compilation *and* runtimes with explicit signature +@cuda.jit("void(float{0}[:,:], float{0}[:,:], float{0}[:,:], float{0}[:,:], " + "float{0}[:,:], int{0})".format(bits), fastmath=fastmath) +def two_set_distance_matrix(U, V, U_weights, V_weights, out, metric_num): + """ + Calculate pairwise distances between two sets of vectors. + + Parameters + ---------- + U : 2D array of float + Vertically stacked vectors. + V : 2D array of float + Vertically stacked vectors. + U_weights : 2D array of float + Vertically stacked weight vectors. + V_weights : 2D array of float + Vertically stacked weight vectors. + out : 2D array of float + Pairwise distance matrix between two sets of vectors. + metric_num : int + Distance metric number {"euclidean": 0, "wasserstein": 1}. + + Returns + ------- + None. + + """ + i, j = cuda.grid(2) + + # distance matrix shape + dm_rows = U.shape[0] + dm_cols = V.shape[0] + + if i < dm_rows and j < dm_cols: + u = U[i] + v = V[j] + uw = U_weights[i] + vw = V_weights[j] + d = compute_distance(u, v, uw, vw, metric_num) + out[i, j] = d + + +def dist_matrix(U, + V=None, + U_weights=None, + V_weights=None, + pairs=None, + metric="euclidean"): # noqa + """ + Compute distance matrices using Numba/CUDA. + + Parameters + ---------- + mat : array + First set of vectors for which to compute pairwise distances. + + mat2 : array, optional + Second set of vectors for which to compute pairwise distances. + If not specified, then mat2 is a copy of mat. + + pairs : array, optional + List of 2-tuples which contain the indices for which to compute + distances. If mat2 was specified, then the second index accesses + mat2 instead of mat. If not specified, then the pairs are + auto-generated. If mat2 was specified, all combinations of the two + vector sets are used. If mat2 isn't specified, then only the upper + triangle (minus diagonal) pairs are computed. + + metric : str, optional + Possible options are 'euclidean', 'wasserstein'. + Defaults to Euclidean distance. These are converted to integers + internally due to Numba's lack of support for string arguments + (2021-08-14). See compute_distance() for other keys. + + For example: + 0 - 'euclidean' + 1 - 'wasserstein' + target : str, optional + Which target to use: "cuda" or "cpu". Default is "cuda". + inline : str, optional + Whether to inline functions: "always" or "never". Default is "never". + fastmath : bool, optional + Whether to use fastmath or not. The default is True. + USE_64 : bool, optional + Whether to use 64 bit or 34 bit types. The default is False. + + Raises + ------ + ValueError + DESCRIPTION. + NotImplementedError + DESCRIPTION. + + Returns + ------- + out : array + A pairwise distance matrix, or if pairs are specified, then a + vector of distances corresponding to the pairs. + + """ + rows, cols_check = U.shape + + if cols_check != cols: + raise KeyError("For performance reasons and architecture constraints " + "the number of columns of U (which is the same as V) " + "must be defined as the environment variable: COLUMNS. " + "However, os.environ[\"COLUMNS\"] does not match the " + "number of columns in U. Reset the environment variable " + "via e.g. `os.environ[\"COLUMNS\"] = str(U.shape[0])` " + "defined at the top of your script and make sure that " + "dist_matrix is reloaded. You may also need to restart " + "Python.") + + if V is not None and cols is not V.shape[1]: + raise ValueError("The number of columns for U and V should match") + + # is it a distance matrix between two sets of vectors? + # (rather than within a single set) + isXY = V is not None + + # were pairs specified? (useful for sparse matrix generation) + pairQ = pairs is not None + + # block, grid, and out shape + if pairQ: + block_dim = TPB + npairs = pairs.shape[0] + grid_dim = ceil(npairs / block_dim) + else: + block_dim = (TPB, TPB) + blockspergrid_x = ceil(rows / block_dim[0]) + if isXY: + blockspergrid_y = ceil(V.shape[0] / block_dim[1]) + else: + blockspergrid_y = ceil(rows / block_dim[1]) + grid_dim = (blockspergrid_x, blockspergrid_y) + + # CUDA setup + stream = cuda.stream() + + # sorting and cumulative weights + if metric == "wasserstein": + # presort values (and weights by sorted value indices) + U_sorter = np.argsort(U) + U = np.take_along_axis(U, U_sorter, axis=-1) + U_weights = np.take_along_axis(U_weights, U_sorter, axis=-1) + + # calculate cumulative weights + U_weights = np.cumsum(U_weights, axis=1) + + # prepend a column of zeros + zero = np.zeros((U_weights.shape[0], 1)) + U_weights = np.column_stack((zero, U_weights)) + + # do the same for V and V_weights + if isXY: + V_sorter = np.argsort(V) + V = np.take_along_axis(V, V_sorter, axis=-1) + V_weights = np.take_along_axis(V_weights, V_sorter, axis=-1) + V_weights = np.cumsum(V_weights, axis=1) + V_weights = np.column_stack((zero, V_weights)) + + # assign dummy arrays + if V is None: + V = np.zeros((2, 2)) + + if U_weights is None: + U_weights = np.zeros((2, 2)) + + if V_weights is None: + V_weights = np.zeros((2, 2)) + + if pairs is None: + pairs = np.zeros((2, 2)) + + # assign metric_num based on specified metric (no string support) + metric_dict = {"euclidean": 0, "wasserstein": 1} + metric_num = metric_dict[metric] + + # # same for target_num + # target_dict = {"cuda": 0, "cpu": 1} + # target_num = target_dict[target] + + m = U.shape[0] + + if isXY: + m2 = V.shape[0] + else: + m2 = m + + if pairQ: + shape = (npairs,) + else: + shape = (m, m2) + + # values should be initialized instead of using cuda.device_array + out = np.zeros(shape) + + if target == "cuda": + # copying/allocating to GPU + cU = cuda.to_device(np.asarray( + U, dtype=np_float), stream=stream) + + cU_weights = cuda.to_device(np.asarray( + U_weights, dtype=np_float), stream=stream) + + if isXY or pairQ: + cV = cuda.to_device(np.asarray( + V, dtype=np_float), stream=stream) + cV_weights = cuda.to_device( + np.asarray(V_weights, dtype=np_float), stream=stream + ) + + if pairQ: + cpairs = cuda.to_device(np.asarray( + pairs, dtype=np_int), stream=stream) + + cuda_out = cuda.to_device(np.asarray( + out, dtype=np_float), stream=stream) + + elif target == "cpu": + cU = U + cV = V + cU_weights = U_weights + cV_weights = V_weights + cpairs = pairs + cuda_out = out + + # distance matrix between two sets of vectors + if isXY and not pairQ: + fn = two_set_distance_matrix + if target == "cuda": + fn = fn[grid_dim, block_dim] + fn(cU, cV, cU_weights, cV_weights, cuda_out, metric_num) + + # specified pairwise distances within single set of vectors + elif not isXY and pairQ: + fn = sparse_distance_matrix + if target == "cuda": + fn = fn[grid_dim, block_dim] + fn(cU, cU, cU_weights, cU_weights, cpairs, + cuda_out, metric_num) + + # distance matrix within single set of vectors + elif not isXY and not pairQ: + fn = one_set_distance_matrix + if target == "cuda": + fn = fn[grid_dim, block_dim] + fn(cU, cU_weights, cuda_out, metric_num) + + # specified pairwise distances within single set of vectors + elif isXY and pairQ: + fn = sparse_distance_matrix + if target == "cuda": + fn = fn[grid_dim, block_dim] + fn(cU, cV, cU_weights, cV_weights, cpairs, + cuda_out, metric_num) + + out = cuda_out.copy_to_host(stream=stream) + + return out + +# %% Code Graveyard + +# u_stack = cuda.local.array(cols, nb_int) +# v_stack = cuda.local.array(cols, nb_int) + +# copy(u_sorted, utmp) +# copy(v_sorted, vtmp) + +# copy(u_weights, u_sorted_weights) +# copy(v_weights, v_sorted_weights) + +# stack = [0] * size +# ids = list(range(len(arr))) + +# u_sorted_weights = cuda.local.array(cols, nb_float) +# v_sorted_weights = cuda.local.array(cols, nb_float) + +# sorting stack +# uv_stack = cuda.local.array(tot_cols, nb_int) + +# def sort_cum_prepend(X, X_weights): +# # presort values (and weights by sorted value indices) +# X_sorter = np.argsort(X) +# X = np.take_along_axis(X, X_sorter, axis=-1) +# X_weights = np.take_along_axis(X_weights, X_sorter, axis=-1) +# # calculate cumulative weights +# X_weights = np.cumsum(X_weights) +# # prepend a column of zeros +# zero = np.zeros((X_weights.shape[0], 1)) +# X_weights = np.column_stack((zero, X_weights)) + +# def mean_squared_error(y, y_pred, squared=False): +# """ +# Return mean squared error (MSE) without using sklearn. + +# If squared == True, then return root mean squared error (RMSE). + +# Parameters +# ---------- +# y : 1D numeric array +# "True" or first set of values. +# y_pred : 1D numeric array +# "Predicted" or second set of values. + +# Returns +# ------- +# rmse : numeric scalar +# DESCRIPTION. + +# """ +# mse = np.mean((y - y_pred)**2) +# if squared is True: +# rmse = np.sqrt(mse) +# return rmse +# else: +# return mse +# return mse + +# opt = False +# os.environ["NUMBA_CUDA_DEBUGINFO"] = "0" +# opt = True +# os.environ["NUMBA_BOUNDSCHECK"] = "0" +# os.environ["OPT"] = "0" diff --git a/numba/cuda/tests/cudapy/test_dist_matrix.py b/numba/cuda/tests/cudapy/test_dist_matrix.py new file mode 100644 index 00000000000..4ad8f948a16 --- /dev/null +++ b/numba/cuda/tests/cudapy/test_dist_matrix.py @@ -0,0 +1,253 @@ +# -*- coding: utf-8 -*- +"""Test distance matrix calculations using CUDA/Numba.""" +from scipy.spatial.distance import euclidean, cdist +from scipy.stats import wasserstein_distance as scipy_wasserstein_distance +from numpy.testing import assert_allclose +import numpy as np +import os +from importlib import reload + +# os.environ["NUMBA_DISABLE_JIT"] = "1" + +# number of columns of U and V must be set as env var before import dist_matrix +cols = 100 +os.environ["COLUMNS"] = str(cols) + +# other environment variables (set before importing dist_matrix) +os.environ["USE_64"] = "0" +os.environ["INLINE"] = "never" +os.environ["FASTMATH"] = "1" +os.environ["TARGET"] = "cuda" + +from numba.cuda.kernels.device.helper import Timer # noqa +from numba.cuda.testing import unittest, CUDATestCase # noqa +import numba.cuda.kernels.dist_matrix # noqa + +# to overwrite env vars (source: https://stackoverflow.com/a/1254379/13697228) +reload(numba.cuda.kernels.dist_matrix) +dist_matrix = numba.cuda.kernels.dist_matrix.dist_matrix + +verbose_test = True + + +class TestDistMat(CUDATestCase): + """Test distance matrix calculations on GPU for various metrics.""" + + def test_dist_matrix(self): + """ + Loop through distance metrics and perform unit tests. + + The four test cases are: + pairwise distances within a single set of vectors + pairwise distances between two sets of vectors + sparse pairwise distances within a single set of vectors + sparse pairwise distances between two sets of vectors + + The ground truth for Euclidean comes from cdist. + The ground truth for Earth Mover's distance (1-Wasserstein) is via + a scipy.stats function. + + Helper functions are used to generate test data and support the use of + Wasserstein distances in cdist. + + Returns + ------- + None. + + """ + + def test_data(rows, cols): + """ + Generate seeded test values and weights for two distributions. + + Returns + ------- + U : 2D array + Values of first distribution. + V : 2D array + Values of second distribution. + U_weights : 2D array + Weights of first distribution. + V_weights : 2D array + Weights of second distribution. + + """ + np.random.seed(42) + [U, V, U_weights, V_weights] = [ + np.random.rand(rows, cols) for i in range(4)] + return U, V, U_weights, V_weights + + def my_wasserstein_distance(u_uw, v_vw): + """ + Return Earth Mover's distance using concatenated values and weights. + + Parameters + ---------- + u_uw : 1D numeric array + Horizontally stacked values and weights of first distribution. + v_vw : TYPE + Horizontally stacked values and weights of second distribution. + + Returns + ------- + distance : numeric scalar + Earth Mover's distance given two distributions. + + """ + # split into values and weights + n = len(u_uw) + i = n // 2 + u = u_uw[0: i] + uw = u_uw[i: n] + v = v_vw[0: i] + vw = v_vw[i: n] + # calculate distance + distance = scipy_wasserstein_distance( + u, v, u_weights=uw, v_weights=vw) + return distance + + def join_wasserstein(U, V, Uw, Vw): + """ + Horizontally stack values and weights for each distribution. + + Weights are added as additional columns to values. + + Example: + u_uw, v_vw = join_wasserstein(u, v, uw, vw) + d = my_wasserstein_distance(u_uw, v_vw) + cdist(u_uw, v_vw, metric=my_wasserstein_distance) + + Parameters + ---------- + u : 1D or 2D numeric array + First set of distribution values. + v : 1D or 2D numeric array + Second set of values of distribution values. + uw : 1D or 2D numeric array + Weights for first distribution. + vw : 1D or 2D numeric array + Weights for second distribution. + + Returns + ------- + u_uw : 1D or 2D numeric array + Horizontally stacked values and weights of first distribution. + v_vw : TYPE + Horizontally stacked values and weights of second distribution. + + """ + U_Uw = np.concatenate((U, Uw), axis=1) + V_Vw = np.concatenate((V, Vw), axis=1) + return U_Uw, V_Vw + + tol = 1e-5 + + # test and production data + pairs = np.array([(0, 1), (1, 2), (2, 3)]) + i, j = pairs[0] + + for testQ in [True, False]: + # large # of rows with testQ == True is slow due to use of + # cdist and non-jitted scipy-wasserstein + if testQ: + rows = 6 + else: + rows = 1000 + print("====[PARTIAL TEST WITH {} ROWS]====".format(rows)) + U, V, U_weights, V_weights = test_data(rows, cols) + Utest, Vtest, Uwtest, Vwtest = [x[0:6] for x in [U, V, U_weights, V_weights]] # noqa + + for target in ["cuda"]: # "cpu" not implemented + for metric in ['euclidean', 'wasserstein']: + print('[' + target.upper() + "_" + metric.upper() + ']') + + if testQ: + # compile + dist_matrix(Utest, U_weights=Uwtest, metric=metric) + with Timer("one set"): + one_set = dist_matrix( + U, U_weights=U_weights, metric=metric) # noqa + if verbose_test: + print(one_set, '\n') + + # compile + dist_matrix(Utest, V=Vtest, U_weights=Uwtest, + V_weights=Vwtest, metric=metric) + with Timer("two set"): + two_set = dist_matrix(U, V=V, U_weights=U_weights, + V_weights=V_weights, metric=metric) # noqa + if testQ and verbose_test: + print(two_set, '\n') + + one_set_sparse = dist_matrix( + U, U_weights=U_weights, pairs=pairs, metric=metric) # noqa + if testQ and verbose_test: + print(one_set_sparse, '\n') + + two_set_sparse = dist_matrix( + U, + V=V, + U_weights=U_weights, + V_weights=V_weights, + pairs=pairs, + metric=metric, + ) + if testQ and verbose_test: + print(two_set_sparse, '\n') + + if testQ: + if metric == "euclidean": + with Timer("one set check (cdist)"): + one_set_check = cdist(U, U) + with Timer("two set check (cdist)"): + two_set_check = cdist(U, V) + + one_sparse_check = [ + euclidean(U[i], U[j]) for i, j in pairs] + two_sparse_check = [ + euclidean(U[i], V[j]) for i, j in pairs] + + elif metric == "wasserstein": + U_Uw, V_Vw = join_wasserstein( + U, V, U_weights, V_weights) + + with Timer("one set check (cdist)"): + one_set_check = cdist( + U_Uw, U_Uw, metric=my_wasserstein_distance) + with Timer("two set check (cdist)"): + two_set_check = cdist( + U_Uw, V_Vw, metric=my_wasserstein_distance) + + one_sparse_check = [my_wasserstein_distance(U_Uw[i], U_Uw[j]) # noqa + for i, j in pairs] + two_sparse_check = [my_wasserstein_distance(U_Uw[i], V_Vw[j]) # noqa + for i, j in pairs] + + # check results + assert_allclose( + one_set.ravel(), one_set_check.ravel(), rtol=tol, + err_msg="one set {} {} distance matrix inaccurate".format(target, metric)) # noqa + assert_allclose( + two_set.ravel(), two_set_check.ravel(), rtol=tol, + err_msg="two set {} {} distance matrix inaccurate".format(target, metric)) # noqa + assert_allclose( + one_set_sparse, one_sparse_check, rtol=tol, + err_msg="one set {} {} sparse distance matrix inaccurate".format(target, metric)) # noqa + assert_allclose( + two_set_sparse, two_sparse_check, rtol=tol, + err_msg="two set {} {} distance matrix inaccurate".format(target, metric)) # noqa + elif metric == "euclidean": + with Timer("two set check (cdist)"): + two_set_check = cdist(U, V) + + +if __name__ == '__main__': + unittest.main() + + +# TF = [ +# np.allclose(one_set, one_set_check), +# np.allclose(two_set, two_set_check), +# np.allclose(one_set_sparse, one_sparse_check), +# np.allclose(two_set_sparse, two_sparse_check) +# ] diff --git a/numba/cuda/tests/cudapy/test_helper.py b/numba/cuda/tests/cudapy/test_helper.py new file mode 100644 index 00000000000..67383ca9093 --- /dev/null +++ b/numba/cuda/tests/cudapy/test_helper.py @@ -0,0 +1,83 @@ +""" +Test helper functions for distance matrix calculations. +""" +import os +import numpy as np +from numba.cuda.testing import unittest, CUDATestCase +import numba.cuda.kernels.device.helper as hp +from numpy.testing import assert_allclose, assert_equal + +bits = int(os.environ.get("MACHINE_BITS", "32")) + +if bits == 32: + np_float = np.float32 + np_int = np.float32 +elif bits == 64: + np_float = np.float64 + np_int = np.float64 + +tol = 1e-5 + + +class TestHelperFunctions(CUDATestCase): + def test_concatenate(self): + vec = np.random.rand(5) + vec2 = np.random.rand(10) + check = np.concatenate((vec, vec2)) + out = np.zeros(len(vec) + len(vec2), dtype=np_float) + hp.concatenate(vec, vec2, out) + + assert_allclose(out, check) + vec[0] = 100.0 + assert_allclose(out, check) + + def test_diff(self): + vec = np.random.rand(10) + out = np.zeros(len(vec) - 1) + hp.diff(vec, out) + check = np.diff(vec) + assert_equal(out, check) + + def test_bisect_right(self): + a = np.random.rand(10) + a.sort() + v = np.random.rand(5) + ids = np.zeros_like(v[:-1], dtype=np_int) + hp.bisect_right(a, v[:-1], ids) + check = np.searchsorted(a, v[:-1], side="right") + assert_equal(ids, check) + + def test_sort_by_indices(self): + v = np.random.rand(10) + ids = np.arange(len(v)) + np.random.shuffle(ids) + out = np.zeros_like(v) + hp.sort_by_indices(v, ids, out) + check = v[ids] + assert_equal(out, check) + + v = np.array([0.0, 0.5528931, 1.1455898, 1.5933731]) + ids = np.array([1, 2, 3, 3, 3]) + out = np.zeros_like(ids, dtype=np_float) + hp.sort_by_indices(v, ids, out) + check = v[ids] + assert_allclose(out, check) + + def test_cumsum(self): + vec = np.random.rand(10) + out = np.zeros_like(vec, dtype=np_float) + hp.cumsum(vec, out) + check = np.cumsum(vec) + assert_allclose(out, check) + + def test_divide(self): + v = np.random.rand(10) + b = 4.62 + out = np.zeros_like(v, dtype=np_float) + hp.divide(v, b, out) + check = v / b + assert_allclose(out, check) + + +if __name__ == '__main__': + unittest.main() diff --git a/numba/tests/test_help.py b/numba/tests/test_help.py index a78f05f0a3c..8199d4c93c4 100644 --- a/numba/tests/test_help.py +++ b/numba/tests/test_help.py @@ -89,4 +89,4 @@ def test_inspect_cli(self): with self.assertRaises(subprocess.CalledProcessError) as raises: subprocess.check_output(cmds, stderr=subprocess.STDOUT) self.assertIn("\'foo\' is not supported", - raises.exception.stdout.decode('ascii')) + raises.exception.stdout.decode())