Skip to content

use rapidhash #57509

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 48 commits into from
May 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
a4b0d2a
first implementation
adienes Feb 23, 2025
2677576
minor fix
adienes Feb 23, 2025
75c0de9
typo fix
adienes Feb 23, 2025
9273153
fix a few more tests
adienes Feb 23, 2025
f08ad3b
more `+` to `\xor` and `- 3h` to `, h`
adienes Feb 24, 2025
7b3a1bf
try feistel for h64
adienes Feb 25, 2025
17322a2
change name back
adienes Feb 27, 2025
948a56e
apply some review comments
adienes Feb 27, 2025
19dbcb4
add :terminates_globally effect
adienes Feb 27, 2025
8ab91a2
simpler loop condition
adienes Feb 28, 2025
197c734
Merge branch 'master' into rapid_hash
adienes Feb 28, 2025
c9ba0a9
clean doctest
adienes Mar 4, 2025
46e0d7a
Merge branch 'master' into rapid_hash
adienes Mar 4, 2025
240d84b
change message
adienes Mar 4, 2025
9e955e1
off by one
adienes Mar 4, 2025
06106ec
Merge branch 'master' into rapid_hash
adienes Mar 4, 2025
5ace2c5
off byones
adienes Mar 5, 2025
5410e09
clean up arith
adienes Mar 5, 2025
62f5904
fix show test again
adienes Mar 5, 2025
eca7d4b
Merge branch 'master' into rapid_hash
adienes Mar 7, 2025
b41e6b9
friendlier to 32bit systems
adienes Mar 7, 2025
8c8ac45
uint32 for substring hash
adienes Mar 7, 2025
30a10ce
Merge branch 'master' into rapid_hash
adienes Mar 15, 2025
b871832
Merge branch 'master' into rapid_hash
adienes Mar 19, 2025
15eb0f9
re-change float
adienes Mar 19, 2025
2fa6112
Merge branch 'master' into rapid_hash
adienes Mar 28, 2025
ae7bee8
slightly narrow scope of PR; restore x-3h mixing
adienes Mar 28, 2025
c9cd088
fix merge with doctests
adienes Mar 28, 2025
8cd46ce
fix 32bit, again, hopefully?
adienes Mar 28, 2025
f55bb35
ugh
adienes Mar 28, 2025
66734fd
Merge branch 'master' into rapid_hash
adienes Apr 5, 2025
f1ed02d
udpate comment
adienes Apr 5, 2025
bfb1138
try a simpler finalizer and remove Base qualifier
adienes Apr 6, 2025
dd0e1bc
last update to finalizer
adienes Apr 9, 2025
26934c3
Merge branch 'master' into rapid_hash
adienes Apr 9, 2025
b23d89e
Merge branch 'master' into rapid_hash
adienes Apr 11, 2025
848b444
Update base/gmp.jl
adienes Apr 24, 2025
84ababc
Merge branch 'master' into rapid_hash
adienes Apr 27, 2025
9b00ba2
Merge branch 'rapid_hash' of https://github.com/adienes/julia into ra…
adienes Apr 27, 2025
ce88854
apply review
adienes Apr 27, 2025
d6b75a8
add NEWS
adienes Apr 27, 2025
b51561f
Merge branch 'master' into rapid_hash
oscardssmith Apr 28, 2025
574cb15
re-fix test
adienes Apr 29, 2025
19b1eb1
Merge branch 'master' into rapid_hash
giordano May 5, 2025
fc6c22c
Merge branch 'master' into rapid_hash
adienes May 7, 2025
a5f44c2
32-bit showstr(::Set) test
adienes May 7, 2025
42076d9
TOML tests...
adienes May 7, 2025
98512ca
Merge branch 'master' into rapid_hash
IanButterworth May 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ Language changes
* `mod(x::AbstractFloat, -Inf)` now returns `x` (as long as `x` is finite), this aligns with C standard and
is considered a bug fix ([#47102])

- The `hash` algorithm and its values have changed. Most `hash` specializations will remain correct and require no action. Types that reimplement the core hashing logic independently, such as some third-party string packages do, may require a migration to the new algorithm. ([#57509])

Compiler/Runtime improvements
-----------------------------

Expand Down
2 changes: 1 addition & 1 deletion base/abstractarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3563,7 +3563,7 @@ sizehint!(a::AbstractVector, _) = a

const hash_abstractarray_seed = UInt === UInt64 ? 0x7e2d6fb6448beb77 : 0xd4514ce5
function hash(A::AbstractArray, h::UInt)
h += hash_abstractarray_seed
h = hash_abstractarray_seed
# Axes are themselves AbstractArrays, so hashing them directly would stack overflow
# Instead hash the tuple of firsts and lasts along each dimension
h = hash(map(first, axes(A)), h)
Expand Down
2 changes: 1 addition & 1 deletion base/binaryplatforms.jl
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ end

# Hash definition to ensure that it's stable
function Base.hash(p::Platform, h::UInt)
h += 0x506c6174666f726d % UInt
h = 0x506c6174666f726d % UInt
h = hash(p.tags, h)
h = hash(p.compare_strategies, h)
return h
Expand Down
2 changes: 1 addition & 1 deletion base/char.jl
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ in(x::AbstractChar, y::AbstractChar) = x == y
==(x::Char, y::Char) = bitcast(UInt32, x) == bitcast(UInt32, y)
isless(x::Char, y::Char) = bitcast(UInt32, x) < bitcast(UInt32, y)
hash(x::Char, h::UInt) =
hash_uint64(((bitcast(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h))
hash_finalizer(((bitcast(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h)) % UInt

# fallbacks:
isless(x::AbstractChar, y::AbstractChar) = isless(Char(x), Char(y))
Expand Down
4 changes: 2 additions & 2 deletions base/gmp.jl
Original file line number Diff line number Diff line change
Expand Up @@ -892,7 +892,7 @@ if Limb === UInt64 === UInt
return hash(ldexp(flipsign(Float64(limb), sz), pow), h)
end
h = hash_integer(pow, h)
h ⊻= hash_uint(flipsign(limb, sz) ⊻ h)
h ⊻= hash_finalizer(flipsign(limb, sz) ⊻ h)
for idx = idx+1:asz
if shift == 0
limb = unsafe_load(ptr, idx)
Expand All @@ -906,7 +906,7 @@ if Limb === UInt64 === UInt
limb = limb2 << upshift | limb1 >> shift
end
end
h ⊻= hash_uint(limb ⊻ h)
h ⊻= hash_finalizer(limb ⊻ h)
end
return h
end
Expand Down
193 changes: 127 additions & 66 deletions base/hashing.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
# This file is a part of Julia. License is MIT: https://julialang.org/license

## hashing a single value ##
const HASH_SEED = UInt == UInt64 ? 0xbdd89aa982704029 : 0xeabe9406
const HASH_SECRET = tuple(
0x2d358dccaa6c78a5,
0x8bb84b93962eacc9,
0x4b33a62ed433d4a3,
)

"""
hash(x[, h::UInt])::UInt
Expand All @@ -17,75 +22,52 @@ The hash value may change when a new Julia process is started.

```jldoctest; filter = r"0x[0-9a-f]{16}"
julia> a = hash(10)
0x95ea2955abd45275
0x759d18cc5346a65f

julia> hash(10, a) # only use the output of another hash function as the second argument
0xd42bad54a8575b16
0x03158cd61b1b0bd1
```

See also: [`objectid`](@ref), [`Dict`](@ref), [`Set`](@ref).
"""
hash(x::Any) = hash(x, zero(UInt))
hash(data::Any) = hash(data, HASH_SEED)
hash(w::WeakRef, h::UInt) = hash(w.value, h)

# Types can't be deleted, so marking as total allows the compiler to look up the hash
hash(T::Type, h::UInt) = hash_uint(3h - @assume_effects :total ccall(:jl_type_hash, UInt, (Any,), T))
hash(T::Type, h::UInt) =
hash((@assume_effects :total ccall(:jl_type_hash, UInt, (Any,), T)), h)
hash(@nospecialize(data), h::UInt) = hash(objectid(data), h)

## hashing general objects ##

hash(@nospecialize(x), h::UInt) = hash_uint(3h - objectid(x))

hash(x::Symbol) = objectid(x)

## core data hashing functions ##

function hash_64_64(n::UInt64)
a::UInt64 = n
a = ~a + a << 21
a = a ⊻ a >> 24
a = a + a << 3 + a << 8
a = a ⊻ a >> 14
a = a + a << 2 + a << 4
a = a ⊻ a >> 28
a = a + a << 31
return a
# Full 64×64 → 128-bit product of `a` and `b`, returned as `(high, low)` 64-bit halves.
function mul_parts(a::UInt64, b::UInt64)
    wide = widemul(a, b)
    high = (wide >> 64) % UInt64
    low = wide % UInt64
    return high, low
end

function hash_64_32(n::UInt64)
a::UInt64 = n
a = ~a + a << 18
a = a ⊻ a >> 31
a = a * 21
a = a ⊻ a >> 11
a = a + a << 6
a = a ⊻ a >> 22
return a % UInt32
# Fold the 128-bit product of `a` and `b` down to 64 bits by xoring its two halves.
function hash_mix(a::UInt64, b::UInt64)
    high, low = mul_parts(a, b)
    return high ⊻ low
end

# Cheap linear combination of a key `x` with the running state `h`.
# Faster but weaker than `hash_mix`; intended for small keys.
function hash_mix_linear(x::UInt64, h::UInt)
    return 3 * h - x
end
# Bit-mixing finalizer: xor-shift by 32, multiply by an odd 64-bit constant
# (wrapping), then xor-shift by 33.
function hash_finalizer(x::UInt64)
    folded = x ⊻ (x >> 32)
    scrambled = folded * 0x63652a4cd374b267
    return scrambled ⊻ (scrambled >> 33)
end

function hash_32_32(n::UInt32)
a::UInt32 = n
a = a + 0x7ed55d16 + a << 12
a = a ⊻ 0xc761c23c ⊻ a >> 19
a = a + 0x165667b1 + a << 5
a = a + 0xd3a2646c ⊻ a << 9
a = a + 0xfd7046c5 + a << 3
a = a ⊻ 0xb55a4f09 ⊻ a >> 16
return a
end
# Fixed-width entry points; each one ultimately defers to `hash_finalizer`.
function hash_64_64(data::UInt64)
    return hash_finalizer(data)
end

# 64-bit input, result truncated to 32 bits.
function hash_64_32(data::UInt64)
    return hash_64_64(data) % UInt32
end

# 32-bit input, widened to 64 bits before hashing.
function hash_32_32(data::UInt32)
    return hash_64_32(UInt64(data))
end

if UInt === UInt64
hash_uint64(x::UInt64) = hash_64_64(x)
hash_uint(x::UInt) = hash_64_64(x)
const hash_uint64 = hash_64_64
const hash_uint = hash_64_64
else
hash_uint64(x::UInt64) = hash_64_32(x)
hash_uint(x::UInt) = hash_32_32(x)
const hash_uint64 = hash_64_32
const hash_uint = hash_32_32
end

## efficient value-based hashing of integers ##

hash(x::Int64, h::UInt) = hash_uint64(bitcast(UInt64, x)) - 3h
hash(x::UInt64, h::UInt) = hash_uint64(x) - 3h
hash(x::Union{Bool,Int8,UInt8,Int16,UInt16,Int32,UInt32}, h::UInt) = hash(Int64(x), h)
hash(x::UInt64, h::UInt) = hash_uint64(hash_mix_linear(x, h))
hash(x::Int64, h::UInt) = hash(bitcast(UInt64, x), h)
hash(x::Union{Bool, Int8, UInt8, Int16, UInt16, Int32, UInt32}, h::UInt) = hash(Int64(x), h)

function hash_integer(n::Integer, h::UInt)
h ⊻= hash_uint((n % UInt) ⊻ h)
Expand All @@ -100,7 +82,7 @@ end

## efficient value-based hashing of floats ##

const hx_NaN = hash_uint64(reinterpret(UInt64, NaN))
const hx_NaN = hash(reinterpret(UInt64, NaN))
function hash(x::Float64, h::UInt)
# see comments on trunc and hash(Real, UInt)
if typemin(Int64) <= x < typemax(Int64)
Expand All @@ -116,7 +98,7 @@ function hash(x::Float64, h::UInt)
elseif isnan(x)
return hx_NaN ⊻ h # NaN does not have a stable bit pattern
end
return hash_uint64(bitcast(UInt64, x)) - 3h
return hash(bitcast(UInt64, x), h)
end

hash(x::Float32, h::UInt) = hash(Float64(x), h)
Expand All @@ -131,7 +113,7 @@ function hash(x::Float16, h::UInt)
elseif isnan(x)
return hx_NaN ⊻ h # NaN does not have a stable bit pattern
end
return hash_uint64(bitcast(UInt64, Float64(x))) - 3h
return hash(bitcast(UInt64, Float64(x)), h)
end

## generic hashing for rational values ##
Expand Down Expand Up @@ -180,21 +162,100 @@ end


## symbol & expression hashing ##

if UInt === UInt64
hash(x::Expr, h::UInt) = hash(x.args, hash(x.head, h + 0x83c7900696d26dc6))
hash(x::QuoteNode, h::UInt) = hash(x.value, h + 0x2c97bf8b3de87020)
hash(x::Expr, h::UInt) = hash(x.args, hash(x.head, h ⊻ 0x83c7900696d26dc6))
hash(x::QuoteNode, h::UInt) = hash(x.value, h ⊻ 0x2c97bf8b3de87020)
else
hash(x::Expr, h::UInt) = hash(x.args, hash(x.head, h + 0x96d26dc6))
hash(x::QuoteNode, h::UInt) = hash(x.value, h + 0x469d72af)
hash(x::Expr, h::UInt) = hash(x.args, hash(x.head, h ⊻ 0x469d72af))
hash(x::QuoteNode, h::UInt) = hash(x.value, h ⊻ 0x469d72af)
end

## hashing strings ##
hash(x::Symbol) = objectid(x)

const memhash = UInt === UInt64 ? :memhash_seed : :memhash32_seed
const memhash_seed = UInt === UInt64 ? 0x71e729fd56419c81 : 0x56419c81

@assume_effects :total function hash(s::String, h::UInt)
h += memhash_seed
ccall(memhash, UInt, (Ptr{UInt8}, Csize_t, UInt32), s, sizeof(s), h % UInt32) + h
# Read a `T` (UInt32 or UInt64) from the byte buffer at `ptr`, starting at 1-based
# byte offset `i`. The bytes are reinterpreted in place; the caller is responsible
# for keeping the buffer alive and for `i` being in range.
function load_le(::Type{T}, ptr::Ptr{UInt8}, i) where {T <: Union{UInt32, UInt64}}
    byte_ptr = ptr + i - 1
    typed_ptr = convert(Ptr{T}, byte_ptr)
    return unsafe_load(typed_ptr)
end

# Pack a 1- to 3-byte buffer into a UInt64 from three (possibly coinciding) bytes:
# the first byte goes to bits 56-63, the byte at index `n ÷ 2 + 1` to bits 32-39,
# and the last byte to bits 0-7.
function read_small(ptr::Ptr{UInt8}, n::Int)
    first_byte = UInt64(unsafe_load(ptr))
    middle_byte = UInt64(unsafe_load(ptr, n ÷ 2 + 1))
    last_byte = UInt64(unsafe_load(ptr, n))
    return (first_byte << 56) | (middle_byte << 32) | last_byte
end

# Hash the `n` bytes starting at `ptr` into a UInt64, mixing in `seed` and the three
# `secret` words.
#
# Layout of the algorithm:
#   * n ≤ 16: the input is read with a handful of (possibly overlapping) loads.
#   * n > 16: the input is consumed in 48-byte strides over three accumulators,
#     then in up to two 16-byte steps, and finally the last 16 bytes of the
#     buffer are loaded (these may overlap bytes that were already consumed).
#
# All reads go through `load_le` / `read_small`, i.e. raw `unsafe_load`s: the caller
# must keep the memory behind `ptr` alive and valid for `n` bytes.
@assume_effects :terminates_globally function hash_bytes(
        ptr::Ptr{UInt8},
        n::Int,
        seed::UInt64,
        secret::NTuple{3, UInt64}
    )
    # Adapted with gratitude from [rapidhash](https://github.com/Nicoshev/rapidhash)
    buflen = UInt64(n)
    # Fold the input length into the seed up front.
    seed = seed ⊻ (hash_mix(seed ⊻ secret[1], secret[2]) ⊻ buflen)

    a = zero(UInt64)
    b = zero(UInt64)

    if buflen ≤ 16
        if buflen ≥ 4
            # First and last 4 bytes of the input.
            a = (UInt64(load_le(UInt32, ptr, 1)) << 32) |
                UInt64(load_le(UInt32, ptr, n - 3))

            # `delta` is 0 for lengths 4-7 and 4 for lengths 8-16, so `b` picks up
            # the interior bytes once the input is long enough to have any.
            delta = (buflen & 24) >>> (buflen >>> 3)
            b = (UInt64(load_le(UInt32, ptr, delta + 1)) << 32) |
                UInt64(load_le(UInt32, ptr, n - 3 - delta))
        elseif buflen > 0
            a = read_small(ptr, n)
        end
        # buflen == 0: `a` and `b` stay zero.
    else
        pos = 1      # 1-based offset of the next unread byte
        i = buflen   # number of bytes not yet consumed by the stride loops
        # The inner loop leaves `i < 48`, so this section runs at most once:
        # it is a conditional, not a loop.
        if i ≥ 48
            see1 = seed
            see2 = seed
            while i ≥ 48
                seed = hash_mix(
                    load_le(UInt64, ptr, pos) ⊻ secret[1],
                    load_le(UInt64, ptr, pos + 8) ⊻ seed
                )
                see1 = hash_mix(
                    load_le(UInt64, ptr, pos + 16) ⊻ secret[2],
                    load_le(UInt64, ptr, pos + 24) ⊻ see1
                )
                see2 = hash_mix(
                    load_le(UInt64, ptr, pos + 32) ⊻ secret[3],
                    load_le(UInt64, ptr, pos + 40) ⊻ see2
                )
                pos += 48
                i -= 48
            end
            # Collapse the three accumulators back into one.
            seed = seed ⊻ see1 ⊻ see2
        end
        if i > 16
            seed = hash_mix(
                load_le(UInt64, ptr, pos) ⊻ secret[3],
                load_le(UInt64, ptr, pos + 8) ⊻ seed ⊻ secret[2]
            )
            if i > 32
                seed = hash_mix(
                    load_le(UInt64, ptr, pos + 16) ⊻ secret[3],
                    load_le(UInt64, ptr, pos + 24) ⊻ seed
                )
            end
        end

        # Always finish with the final 16 bytes of the buffer.
        a = load_le(UInt64, ptr, n - 15)
        b = load_le(UInt64, ptr, n - 7)
    end

    a = a ⊻ secret[2]
    b = b ⊻ seed
    # `mul_parts` returns (high, low): `b` takes the high half, `a` the low half.
    b, a = mul_parts(a, b)
    return hash_mix(a ⊻ secret[1] ⊻ buflen, b ⊻ secret[2])
end

# Hash the bytes of a `String` with `hash_bytes`, seeded by `h` and keyed with
# `HASH_SECRET`; the 64-bit digest is truncated to `UInt`. The string is rooted for
# the duration of the raw-pointer access.
@assume_effects :total function hash(data::String, h::UInt)
    digest = GC.@preserve data hash_bytes(
        pointer(data), sizeof(data), UInt64(h), HASH_SECRET
    )
    return digest % UInt
end

# no longer used in Base, but a lot of packages access these internals
const memhash = UInt === UInt64 ? :memhash_seed : :memhash32_seed
const memhash_seed = UInt === UInt64 ? 0x71e729fd56419c81 : 0x56419c81
2 changes: 1 addition & 1 deletion base/multidimensional.jl
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ module IteratorsMD
# hashing
const cartindexhash_seed = UInt == UInt64 ? 0xd60ca92f8284b8b0 : 0xf2ea7c2e
function Base.hash(ci::CartesianIndex, h::UInt)
h += cartindexhash_seed
h = cartindexhash_seed
for i in ci.I
h = hash(i, h)
end
Expand Down
2 changes: 1 addition & 1 deletion base/pkgid.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ end
==(a::PkgId, b::PkgId) = a.uuid == b.uuid && a.name == b.name

function hash(pkg::PkgId, h::UInt)
h += 0xc9f248583a0ca36c % UInt
h = 0xc9f248583a0ca36c % UInt
h = hash(pkg.uuid, h)
h = hash(pkg.name, h)
return h
Expand Down
2 changes: 1 addition & 1 deletion base/regex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -802,7 +802,7 @@ end
## hash ##
const hashre_seed = UInt === UInt64 ? 0x67e195eb8555e72d : 0xe32373e4
function hash(r::Regex, h::UInt)
h += hashre_seed
h = hashre_seed
h = hash(r.pattern, h)
h = hash(r.compile_options, h)
h = hash(r.match_options, h)
Expand Down
2 changes: 1 addition & 1 deletion base/stacktraces.jl
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ function ==(a::StackFrame, b::StackFrame)
end

function hash(frame::StackFrame, h::UInt)
h += 0xf4fbda67fe20ce88 % UInt
h = 0xf4fbda67fe20ce88 % UInt
h = hash(frame.line, h)
h = hash(frame.file, h)
h = hash(frame.func, h)
Expand Down
6 changes: 2 additions & 4 deletions base/strings/substring.jl
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,8 @@ end
pointer(x::SubString{String}) = pointer(x.string) + x.offset
pointer(x::SubString{String}, i::Integer) = pointer(x.string) + x.offset + (i-1)

function hash(s::SubString{String}, h::UInt)
h += memhash_seed
ccall(memhash, UInt, (Ptr{UInt8}, Csize_t, UInt32), s, sizeof(s), h % UInt32) + h
end
# Hash the bytes covered by a `SubString{String}` with `hash_bytes`, seeded by `h` and
# keyed with `HASH_SECRET`; the 64-bit digest is truncated to `UInt`. The substring is
# rooted for the duration of the raw-pointer access.
function hash(data::SubString{String}, h::UInt)
    digest = GC.@preserve data hash_bytes(
        pointer(data), sizeof(data), UInt64(h), HASH_SECRET
    )
    return digest % UInt
end

_isannotated(::SubString{T}) where {T} = _isannotated(T)

Expand Down
4 changes: 2 additions & 2 deletions base/tuple.jl
Original file line number Diff line number Diff line change
Expand Up @@ -576,10 +576,10 @@ function _eq(t1::Any32, t2::Any32)
end

const tuplehash_seed = UInt === UInt64 ? 0x77cfa1eef01bca90 : 0xf01bca90
hash(::Tuple{}, h::UInt) = h + tuplehash_seed
hash(::Tuple{}, h::UInt) = h ⊻ tuplehash_seed
hash(t::Tuple, h::UInt) = hash(t[1], hash(tail(t), h))
function hash(t::Any32, h::UInt)
out = h + tuplehash_seed
out = h ⊻ tuplehash_seed
for i = length(t):-1:1
out = hash(t[i], out)
end
Expand Down
2 changes: 1 addition & 1 deletion base/version.jl
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ function isless(a::VersionNumber, b::VersionNumber)
end

function hash(v::VersionNumber, h::UInt)
h += 0x8ff4ffdb75f9fede % UInt
h = 0x8ff4ffdb75f9fede % UInt
h = hash(v.major, h)
h = hash(v.minor, h)
h = hash(v.patch, h)
Expand Down
Loading