From a4b0d2a82dd04b0876e796cb0b2997db62540bc4 Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Sun, 23 Feb 2025 12:43:00 -0500 Subject: [PATCH 01/32] first implementation --- base/char.jl | 2 +- base/float.jl | 6 +- base/gmp.jl | 10 +-- base/hashing.jl | 176 +++++++++++++++++++++++++------------- base/strings/substring.jl | 6 +- test/hashing.jl | 3 + 6 files changed, 132 insertions(+), 71 deletions(-) diff --git a/base/char.jl b/base/char.jl index c089262ebf779..bcf4ab7dc4e1b 100644 --- a/base/char.jl +++ b/base/char.jl @@ -222,7 +222,7 @@ in(x::AbstractChar, y::AbstractChar) = x == y ==(x::Char, y::Char) = bitcast(UInt32, x) == bitcast(UInt32, y) isless(x::Char, y::Char) = bitcast(UInt32, x) < bitcast(UInt32, y) hash(x::Char, h::UInt) = - hash_uint64(((bitcast(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h)) + hash(((bitcast(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h)) first_utf8_byte(c::Char) = (bitcast(UInt32, c) >> 24) % UInt8 first_utf8_byte(c::AbstractChar) = first_utf8_byte(Char(c)::Char) diff --git a/base/float.jl b/base/float.jl index 54e232a01b8cb..d2d68de0ef63f 100644 --- a/base/float.jl +++ b/base/float.jl @@ -720,7 +720,7 @@ See also: [`Inf`](@ref), [`iszero`](@ref), [`isfinite`](@ref), [`isnan`](@ref). isinf(x::Real) = !isnan(x) & !isfinite(x) isinf(x::IEEEFloat) = abs(x) === oftype(x, Inf) -const hx_NaN = hash_uint64(reinterpret(UInt64, NaN)) +const hx_NaN = hash(reinterpret(UInt64, NaN)) function hash(x::Float64, h::UInt) # see comments on trunc and hash(Real, UInt) if typemin(Int64) <= x < typemax(Int64) @@ -736,7 +736,7 @@ function hash(x::Float64, h::UInt) elseif isnan(x) return hx_NaN ⊻ h # NaN does not have a stable bit pattern end - return hash_uint64(bitcast(UInt64, x)) - 3h + return hash(bitcast(UInt64, x)) - 3h end hash(x::Float32, h::UInt) = hash(Float64(x), h) @@ -751,7 +751,7 @@ function hash(x::Float16, h::UInt) elseif isnan(x) return hx_NaN ⊻ h # NaN does not have a stable bit pattern end - return hash_uint64(bitcast(UInt64, Float64(x))) - 3h + return hash(bitcast(UInt64, Float64(x))) - 3h end ## generic hashing for rational values ## diff --git a/base/gmp.jl b/base/gmp.jl index 97488551f60f6..224b8e8b92e6c 100644 --- a/base/gmp.jl +++ b/base/gmp.jl @@ -849,7 +849,7 @@ if Limb === UInt64 === UInt # an optimized version for BigInt of hash_integer (used e.g. for Rational{BigInt}), # and of hash - using .Base: hash_uint + using .Base: hash function hash_integer(n::BigInt, h::UInt) GC.@preserve n begin @@ -857,9 +857,9 @@ if Limb === UInt64 === UInt s == 0 && return hash_integer(0, h) p = convert(Ptr{UInt64}, n.d) b = unsafe_load(p) - h ⊻= hash_uint(ifelse(s < 0, -b, b) ⊻ h) + h ⊻= hash(ifelse(s < 0, -b, b) ⊻ h) for k = 2:abs(s) - h ⊻= hash_uint(unsafe_load(p, k) ⊻ h) + h ⊻= hash(unsafe_load(p, k) ⊻ h) end return h end @@ -893,7 +893,7 @@ if Limb === UInt64 === UInt return hash(ldexp(flipsign(Float64(limb), sz), pow), h) end h = hash_integer(pow, h) - h ⊻= hash_uint(flipsign(limb, sz) ⊻ h) + h ⊻= hash(flipsign(limb, sz) ⊻ h) for idx = idx+1:asz if shift == 0 limb = unsafe_load(ptr, idx) @@ -907,7 +907,7 @@ if Limb === UInt64 === UInt limb = limb2 << upshift | limb1 >> shift end end - h ⊻= hash_uint(limb ⊻ h) + h ⊻= hash(limb ⊻ h) end return h end diff --git a/base/hashing.jl b/base/hashing.jl index d4a6217de6edb..aae3c13009526 100644 --- a/base/hashing.jl +++ b/base/hashing.jl @@ -25,80 +25,134 @@ julia> hash(10, a) # only use the output of another hash function as the second See also: [`objectid`](@ref), [`Dict`](@ref), [`Set`](@ref). """ -hash(x::Any) = hash(x, zero(UInt)) -hash(w::WeakRef, h::UInt) = hash(w.value, h) - -# Types can't be deleted, so marking as total allows the compiler to look up the hash -hash(T::Type, h::UInt) = hash_uint(3h - @assume_effects :total ccall(:jl_type_hash, UInt, (Any,), T)) - -## hashing general objects ## - -hash(@nospecialize(x), h::UInt) = hash_uint(3h - objectid(x)) +const RAPID_SEED = UInt64(0xbdd89aa982704029) +const RAPID_SECRET = tuple( + 0x2d358dccaa6c78a5, + 0x8bb84b93962eacc9, + 0x4b33a62ed433d4a3, +) + +mul_hi64(A::UInt64, B::UInt64) = ((widen(A) * B) >> 64) % UInt64 +rapid_mix(A, B) = mul_hi64(A, B) ⊻ (A * B) + +load_le(::Type{T}, ptr::Ptr{UInt8}, i) where {T <: Union{UInt32, UInt64}} = unsafe_load(Ptr{T}(ptr + i - 1)) + +function read_small(ptr::Ptr{UInt8}, n::Int) + return (UInt64(unsafe_load(ptr)) << 56) | + (UInt64(unsafe_load(ptr + div(n, 2))) << 32) | + UInt64(unsafe_load(ptr + n - 1)) +end -hash(x::Symbol) = objectid(x) +function hash( + ptr::Ptr{UInt8}, + n::Int, + seed::UInt64, + secret::NTuple{3, UInt64} + ) + buflen = UInt64(n) + seed = seed ⊻ (rapid_mix(seed ⊻ secret[1], secret[2]) ⊻ buflen) + + a = zero(UInt64) + b = zero(UInt64) + + if buflen ≤ 16 + if buflen ≥ 4 + a = (UInt64(load_le(UInt32, ptr, 1)) << 32) | + UInt64(load_le(UInt32, ptr, n - 4 + 1)) + + delta = (buflen & 24) >>> (buflen >>> 3) + b = (UInt64(load_le(UInt32, ptr, delta + 1)) << 32) | + UInt64(load_le(UInt32, ptr, n - 4 - delta + 1)) + elseif buflen > 0 + a = read_small(ptr, n) + end + else + pos = 1 + i = buflen + if i > 48 + see1 = seed + see2 = seed + while i ≥ 48 + seed = rapid_mix( + load_le(UInt64, ptr, pos) ⊻ secret[1], + load_le(UInt64, ptr, pos + 8) ⊻ seed + ) + see1 = rapid_mix( + load_le(UInt64, ptr, pos + 16) ⊻ secret[2], + load_le(UInt64, ptr, pos + 24) ⊻ see1 + ) + see2 = rapid_mix( + load_le(UInt64, ptr, pos + 32) ⊻ secret[3], + load_le(UInt64, ptr, pos + 40) ⊻ see2 + ) + pos += 48 + i -= 48 + end + seed = seed ⊻ see1 ⊻ see2 + end + if i > 16 + seed = rapid_mix( + load_le(UInt64, ptr, pos) ⊻ secret[3], + load_le(UInt64, ptr, pos + 8) ⊻ seed ⊻ secret[2] + ) + if i > 32 + seed = rapid_mix( + load_le(UInt64, ptr, pos + 16) ⊻ secret[3], + load_le(UInt64, ptr, pos + 24) ⊻ seed + ) + end + end + + a = load_le(UInt64, ptr, n - 17) + b = load_le(UInt64, ptr, n - 9) + end -## core data hashing functions ## - -function hash_64_64(n::UInt64) - a::UInt64 = n - a = ~a + a << 21 - a = a ⊻ a >> 24 - a = a + a << 3 + a << 8 - a = a ⊻ a >> 14 - a = a + a << 2 + a << 4 - a = a ⊻ a >> 28 - a = a + a << 31 - return a + a = a ⊻ secret[2] + b = b ⊻ seed + a, b = a * b, mul_hi64(a, b) + return rapid_mix(a ⊻ secret[1] ⊻ buflen, b ⊻ secret[2]) end -function hash_64_32(n::UInt64) - a::UInt64 = n - a = ~a + a << 18 - a = a ⊻ a >> 31 - a = a * 21 - a = a ⊻ a >> 11 - a = a + a << 6 - a = a ⊻ a >> 22 - return a % UInt32 -end -function hash_32_32(n::UInt32) - a::UInt32 = n - a = a + 0x7ed55d16 + a << 12 - a = a ⊻ 0xc761c23c ⊻ a >> 19 - a = a + 0x165667b1 + a << 5 - a = a + 0xd3a2646c ⊻ a << 9 - a = a + 0xfd7046c5 + a << 3 - a = a ⊻ 0xb55a4f09 ⊻ a >> 16 - return a +function hash_64_64(data::UInt64, seed::UInt64, secret::NTuple{3, UInt64}) + seed = seed ⊻ (rapid_mix(seed ⊻ secret[1], secret[2]) ⊻ 8) + + a = (UInt64(bswap((data >>> 32) % UInt32)) << 32) | UInt64(bswap(data % UInt32)) + b = (a << 32) | (a >>> 32) + a = a ⊻ secret[2] + b = b ⊻ seed + a, b = a * b, mul_hi64(a, b) + return rapid_mix(a ⊻ secret[1] ⊻ 8, b ⊻ secret[2]) end +hash_64_32(data::UInt64, seed::UInt64, secret::NTuple{3, UInt64}) = + hash_64_64(data, seed, secret) % UInt32 +hash_32_32(data::UInt32, seed::UInt64, secret::NTuple{3, UInt64}) = + hash_64_32(UInt64(data), seed, secret) if UInt === UInt64 - hash_uint64(x::UInt64) = hash_64_64(x) - hash_uint(x::UInt) = hash_64_64(x) + const hash_uint64 = hash_64_64 + const hash_uint = hash_64_64 else - hash_uint64(x::UInt64) = hash_64_32(x) - hash_uint(x::UInt) = hash_32_32(x) + const hash_uint64 = hash_64_32 + const hash_uint = hash_32_32 end -## efficient value-based hashing of integers ## +hash(x::UInt64, h::UInt) = hash_uint64(x, h, RAPID_SECRET) +hash(x::Int64, h::UInt) = hash(bitcast(UInt64, x), h) +hash(x::Union{Bool, Int8, UInt8, Int16, UInt16, Int32, UInt32}, h::UInt) = hash(Int64(x), h) -hash(x::Int64, h::UInt) = hash_uint64(bitcast(UInt64, x)) - 3h -hash(x::UInt64, h::UInt) = hash_uint64(x) - 3h -hash(x::Union{Bool,Int8,UInt8,Int16,UInt16,Int32,UInt32}, h::UInt) = hash(Int64(x), h) function hash_integer(n::Integer, h::UInt) - h ⊻= hash_uint((n % UInt) ⊻ h) + h ⊻= hash((n % UInt) ⊻ h) n = abs(n) n >>>= sizeof(UInt) << 3 while n != 0 - h ⊻= hash_uint((n % UInt) ⊻ h) + h ⊻= hash((n % UInt) ⊻ h) n >>>= sizeof(UInt) << 3 end return h end -## symbol & expression hashing ## if UInt === UInt64 hash(x::Expr, h::UInt) = hash(x.args, hash(x.head, h + 0x83c7900696d26dc6)) @@ -108,12 +162,18 @@ else hash(x::QuoteNode, h::UInt) = hash(x.value, h + 0x469d72af) end -## hashing strings ## -const memhash = UInt === UInt64 ? :memhash_seed : :memhash32_seed -const memhash_seed = UInt === UInt64 ? 0x71e729fd56419c81 : 0x56419c81 +# hash(data::Char, h::UInt64) = hash(UInt(Base.bitcast(UInt32, data)), h) +hash(data::String, h::UInt64) = GC.@preserve data hash(pointer(data), sizeof(data), h, RAPID_SECRET) + -@assume_effects :total function hash(s::String, h::UInt) - h += memhash_seed - ccall(memhash, UInt, (Ptr{UInt8}, Csize_t, UInt32), s, sizeof(s), h % UInt32) + h +hash(w::WeakRef, h::UInt64) = rapid(w.value, h) +function hash(T::Type, h::UInt64) + return hash((Base.@assume_effects :total ccall(:jl_type_hash, UInt, (Any,), T)), h) end + +hash(x::Symbol) = objectid(x) + +# generic dispatch +hash(data) = hash(data, RAPID_SEED) +hash(@nospecialize(data), h::UInt64) = hash(objectid(data), h) diff --git a/base/strings/substring.jl b/base/strings/substring.jl index 50717d3c27e23..8679864ef422e 100644 --- a/base/strings/substring.jl +++ b/base/strings/substring.jl @@ -135,10 +135,8 @@ end pointer(x::SubString{String}) = pointer(x.string) + x.offset pointer(x::SubString{String}, i::Integer) = pointer(x.string) + x.offset + (i-1) -function hash(s::SubString{String}, h::UInt) - h += memhash_seed - ccall(memhash, UInt, (Ptr{UInt8}, Csize_t, UInt32), s, sizeof(s), h % UInt32) + h -end +hash(data::SubString{String}, h::UInt64) = + GC.@preserve data hash(pointer(data), sizeof(data), h, RAPID_SECRET) _isannotated(::SubString{T}) where {T} = _isannotated(T) diff --git a/test/hashing.jl b/test/hashing.jl index 173a31d10a6a9..56d62d9d07de7 100644 --- a/test/hashing.jl +++ b/test/hashing.jl @@ -109,6 +109,9 @@ vals = Any[ ] for a in vals, b in vals + println(a, ' ', typeof(a), ' ', objectid(a), ' ', hash(a)) + println(b, ' ', typeof(b), ' ', objectid(b), ' ', hash(b)) + println() @test isequal(a,b) == (hash(a)==hash(b)) end From 2677576e2781f91e7a2e56f28ea9039fe3967d2a Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Sun, 23 Feb 2025 13:33:21 -0500 Subject: [PATCH 02/32] minor fix --- base/hashing.jl | 7 ++++--- test/hashing.jl | 11 ++++++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/base/hashing.jl b/base/hashing.jl index aae3c13009526..d6f1f5fc2766b 100644 --- a/base/hashing.jl +++ b/base/hashing.jl @@ -35,12 +35,13 @@ const RAPID_SECRET = tuple( mul_hi64(A::UInt64, B::UInt64) = ((widen(A) * B) >> 64) % UInt64 rapid_mix(A, B) = mul_hi64(A, B) ⊻ (A * B) -load_le(::Type{T}, ptr::Ptr{UInt8}, i) where {T <: Union{UInt32, UInt64}} = unsafe_load(Ptr{T}(ptr + i - 1)) +load_le(::Type{T}, ptr::Ptr{UInt8}, i) where {T <: Union{UInt32, UInt64}} = + unsafe_load(convert(Ptr{T}, ptr + i - 1)) function read_small(ptr::Ptr{UInt8}, n::Int) return (UInt64(unsafe_load(ptr)) << 56) | - (UInt64(unsafe_load(ptr + div(n, 2))) << 32) | - UInt64(unsafe_load(ptr + n - 1)) + (UInt64(unsafe_load(ptr, div(n + 1, 2))) << 32) | + UInt64(unsafe_load(ptr, n)) end function hash( diff --git a/test/hashing.jl b/test/hashing.jl index 56d62d9d07de7..fed7f37117d8c 100644 --- a/test/hashing.jl +++ b/test/hashing.jl @@ -109,9 +109,6 @@ vals = Any[ ] for a in vals, b in vals - println(a, ' ', typeof(a), ' ', objectid(a), ' ', hash(a)) - println(b, ' ', typeof(b), ' ', objectid(b), ' ', hash(b)) - println() @test isequal(a,b) == (hash(a)==hash(b)) end @@ -252,7 +249,9 @@ end ) for a in vals, b in vals - @test isequal(a, b) == (Base.hash_64_32(a) == Base.hash_64_32(b)) + ha = Base.hash_64_32(a, Base.RAPID_SEED, Base.RAPID_SECRET) + hb = Base.hash_64_32(b, Base.RAPID_SEED, Base.RAPID_SECRET) + @test isequal(a, b) == (ha == hb) end end @@ -263,7 +262,9 @@ end ) for a in vals, b in vals - @test isequal(a, b) == (Base.hash_32_32(a) == Base.hash_32_32(b)) + ha = Base.hash_32_32(a, Base.RAPID_SEED, Base.RAPID_SECRET) + hb = Base.hash_32_32(b, Base.RAPID_SEED, Base.RAPID_SECRET) + @test isequal(a, b) == (ha == hb) end end end From 75c0de9540c1ffbc1617687b84e9154e87c2e91e Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Sun, 23 Feb 2025 13:40:14 -0500 Subject: [PATCH 03/32] typo fix --- base/hashing.jl | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/base/hashing.jl b/base/hashing.jl index d6f1f5fc2766b..963b6d33c90df 100644 --- a/base/hashing.jl +++ b/base/hashing.jl @@ -142,7 +142,6 @@ hash(x::UInt64, h::UInt) = hash_uint64(x, h, RAPID_SECRET) hash(x::Int64, h::UInt) = hash(bitcast(UInt64, x), h) hash(x::Union{Bool, Int8, UInt8, Int16, UInt16, Int32, UInt32}, h::UInt) = hash(Int64(x), h) - function hash_integer(n::Integer, h::UInt) h ⊻= hash((n % UInt) ⊻ h) n = abs(n) @@ -164,11 +163,9 @@ else end -# hash(data::Char, h::UInt64) = hash(UInt(Base.bitcast(UInt32, data)), h) hash(data::String, h::UInt64) = GC.@preserve data hash(pointer(data), sizeof(data), h, RAPID_SECRET) - -hash(w::WeakRef, h::UInt64) = rapid(w.value, h) +hash(w::WeakRef, h::UInt64) = hash(w.value, h) function hash(T::Type, h::UInt64) return hash((Base.@assume_effects :total ccall(:jl_type_hash, UInt, (Any,), T)), h) end From 92731531777c64df2d5dc538a6062b7288b1b40a Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Sun, 23 Feb 2025 17:02:22 -0500 Subject: [PATCH 04/32] fix a few more tests --- base/hashing.jl | 30 ++++++++++++++---------------- base/multidimensional.jl | 2 +- base/tuple.jl | 4 ++-- stdlib/TOML/test/print.jl | 2 +- test/arrayops.jl | 2 +- test/show.jl | 2 +- test/tuple.jl | 8 ++++---- 7 files changed, 24 insertions(+), 26 deletions(-) diff --git a/base/hashing.jl b/base/hashing.jl index 963b6d33c90df..514feecd785b0 100644 --- a/base/hashing.jl +++ b/base/hashing.jl @@ -1,6 +1,11 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license -## hashing a single value ## +const RAPID_SEED = UInt64(0xbdd89aa982704029) +const RAPID_SECRET = tuple( + 0x2d358dccaa6c78a5, + 0x8bb84b93962eacc9, + 0x4b33a62ed433d4a3, +) """ hash(x[, h::UInt]) -> UInt @@ -17,20 +22,17 @@ The hash value may change when a new Julia process is started. ```jldoctest julia> a = hash(10) -0x95ea2955abd45275 +0x64abb6a0b5357846 julia> hash(10, a) # only use the output of another hash function as the second argument -0xd42bad54a8575b16 +0xeb3ffc597ad4eafd ``` See also: [`objectid`](@ref), [`Dict`](@ref), [`Set`](@ref). """ -const RAPID_SEED = UInt64(0xbdd89aa982704029) -const RAPID_SECRET = tuple( - 0x2d358dccaa6c78a5, - 0x8bb84b93962eacc9, - 0x4b33a62ed433d4a3, -) +hash(data) = hash(data, RAPID_SEED) +hash(@nospecialize(data), h::UInt64) = hash(objectid(data), h) + mul_hi64(A::UInt64, B::UInt64) = ((widen(A) * B) >> 64) % UInt64 rapid_mix(A, B) = mul_hi64(A, B) ⊻ (A * B) @@ -70,7 +72,7 @@ function hash( else pos = 1 i = buflen - if i > 48 + for _ in 1:div(buflen, 48) see1 = seed see2 = seed while i ≥ 48 @@ -162,8 +164,8 @@ else hash(x::QuoteNode, h::UInt) = hash(x.value, h + 0x469d72af) end - -hash(data::String, h::UInt64) = GC.@preserve data hash(pointer(data), sizeof(data), h, RAPID_SECRET) +# hash(data::String, h::UInt64) = hash(length(data), h) +hash(data::String, h::UInt64) = @assume_effects :total GC.@preserve data hash(pointer(data), sizeof(data), h, RAPID_SECRET) hash(w::WeakRef, h::UInt64) = hash(w.value, h) function hash(T::Type, h::UInt64) @@ -171,7 +173,3 @@ function hash(T::Type, h::UInt64) end hash(x::Symbol) = objectid(x) - -# generic dispatch -hash(data) = hash(data, RAPID_SEED) -hash(@nospecialize(data), h::UInt64) = hash(objectid(data), h) diff --git a/base/multidimensional.jl b/base/multidimensional.jl index 40fff7243cd55..dda652e09bf51 100644 --- a/base/multidimensional.jl +++ b/base/multidimensional.jl @@ -142,7 +142,7 @@ module IteratorsMD # hashing const cartindexhash_seed = UInt == UInt64 ? 0xd60ca92f8284b8b0 : 0xf2ea7c2e function Base.hash(ci::CartesianIndex, h::UInt) - h += cartindexhash_seed + h ⊻= cartindexhash_seed for i in ci.I h = hash(i, h) end diff --git a/base/tuple.jl b/base/tuple.jl index 2ff8a1185a007..607cface2ee07 100644 --- a/base/tuple.jl +++ b/base/tuple.jl @@ -576,10 +576,10 @@ function _eq(t1::Any32, t2::Any32) end const tuplehash_seed = UInt === UInt64 ? 0x77cfa1eef01bca90 : 0xf01bca90 -hash(::Tuple{}, h::UInt) = h + tuplehash_seed +hash(::Tuple{}, h::UInt) = h ⊻ tuplehash_seed hash(t::Tuple, h::UInt) = hash(t[1], hash(tail(t), h)) function hash(t::Any32, h::UInt) - out = h + tuplehash_seed + out = h ⊻ tuplehash_seed for i = length(t):-1:1 out = hash(t[i], out) end diff --git a/stdlib/TOML/test/print.jl b/stdlib/TOML/test/print.jl index 8fba1b1c1df10..9569d3f33cc58 100644 --- a/stdlib/TOML/test/print.jl +++ b/stdlib/TOML/test/print.jl @@ -58,7 +58,7 @@ end [option] """ d = TOML.parse(s) - @test toml_str(d) == "user = \"me\"\n\n[julia]\n\n[option]\n" + @test toml_str(d; sorted=true) == "user = \"me\"\n\n[julia]\n\n[option]\n" end @testset "special characters" begin diff --git a/test/arrayops.jl b/test/arrayops.jl index b2da3eac6386b..4f3da315e4cda 100644 --- a/test/arrayops.jl +++ b/test/arrayops.jl @@ -2197,7 +2197,7 @@ end # All we really care about is that we have an optimized # implementation, but the seed is a useful way to check that. -@test hash(CartesianIndex()) == Base.IteratorsMD.cartindexhash_seed +@test hash(CartesianIndex()) == Base.IteratorsMD.cartindexhash_seed ⊻ Base.RAPID_SEED @test hash(CartesianIndex(1, 2)) != hash((1, 2)) @testset "itr, iterate" begin diff --git a/test/show.jl b/test/show.jl index bbc55ec285ad4..100302e1649b9 100644 --- a/test/show.jl +++ b/test/show.jl @@ -1951,7 +1951,7 @@ end # issue #27680 @test showstr(Set([(1.0,1.0), (2.0,2.0), (3.0, 3.0)])) == (sizeof(Int) == 8 ? - "Set([(1.0, 1.0), (3.0, 3.0), (2.0, 2.0)])" : + "Set([(2.0, 2.0), (3.0, 3.0), (1.0, 1.0)])" : "Set([(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)])") # issue #27747 diff --git a/test/tuple.jl b/test/tuple.jl index 13af5ac992434..25a8b17eb7cde 100644 --- a/test/tuple.jl +++ b/test/tuple.jl @@ -369,9 +369,9 @@ end @test !isless((1,2), (1,2)) @test !isless((2,1), (1,2)) - @test hash(()) === Base.tuplehash_seed - @test hash((1,)) === hash(1, Base.tuplehash_seed) - @test hash((1,2)) === hash(1, hash(2, Base.tuplehash_seed)) + @test hash(()) === Base.tuplehash_seed ⊻ Base.RAPID_SEED + @test hash((1,)) === hash(1, Base.tuplehash_seed ⊻ Base.RAPID_SEED) + @test hash((1,2)) === hash(1, hash(2, Base.tuplehash_seed ⊻ Base.RAPID_SEED)) # Test Any32 methods t = ntuple(identity, 32) @@ -393,7 +393,7 @@ end @test !isless((t...,1,2), (t...,1,2)) @test !isless((t...,2,1), (t...,1,2)) - @test hash(t) === foldr(hash, [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,(),UInt(0)]) + @test hash(t) === foldr(hash, vcat(1:32, (), Base.RAPID_SEED)) end @testset "functions" begin From f08ad3bdc9820bc8b17bd49334e1139178377119 Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Mon, 24 Feb 2025 11:12:57 -0500 Subject: [PATCH 05/32] more `+` to `\xor` and `- 3h` to `, h` --- base/abstractarray.jl | 2 +- base/binaryplatforms.jl | 2 +- base/float.jl | 4 ++-- base/pkgid.jl | 2 +- base/regex.jl | 2 +- base/stacktraces.jl | 2 +- base/version.jl | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/base/abstractarray.jl b/base/abstractarray.jl index 1ab78a55c93b5..2051a81e7c536 100644 --- a/base/abstractarray.jl +++ b/base/abstractarray.jl @@ -3551,7 +3551,7 @@ sizehint!(a::AbstractVector, _) = a const hash_abstractarray_seed = UInt === UInt64 ? 0x7e2d6fb6448beb77 : 0xd4514ce5 function hash(A::AbstractArray, h::UInt) - h += hash_abstractarray_seed + h ⊻= hash_abstractarray_seed # Axes are themselves AbstractArrays, so hashing them directly would stack overflow # Instead hash the tuple of firsts and lasts along each dimension h = hash(map(first, axes(A)), h) diff --git a/base/binaryplatforms.jl b/base/binaryplatforms.jl index 0a68ce56e9a6b..7e29af19b0b0e 100644 --- a/base/binaryplatforms.jl +++ b/base/binaryplatforms.jl @@ -157,7 +157,7 @@ end # Hash definition to ensure that it's stable function Base.hash(p::Platform, h::UInt) - h += 0x506c6174666f726d % UInt + h ⊻= 0x506c6174666f726d % UInt h = hash(p.tags, h) h = hash(p.compare_strategies, h) return h diff --git a/base/float.jl b/base/float.jl index d2d68de0ef63f..78145171df765 100644 --- a/base/float.jl +++ b/base/float.jl @@ -736,7 +736,7 @@ function hash(x::Float64, h::UInt) elseif isnan(x) return hx_NaN ⊻ h # NaN does not have a stable bit pattern end - return hash(bitcast(UInt64, x)) - 3h + return hash(bitcast(UInt64, x), h) end hash(x::Float32, h::UInt) = hash(Float64(x), h) @@ -751,7 +751,7 @@ function hash(x::Float16, h::UInt) elseif isnan(x) return hx_NaN ⊻ h # NaN does not have a stable bit pattern end - return hash(bitcast(UInt64, Float64(x))) - 3h + return hash(bitcast(UInt64, Float64(x)), h) end ## generic hashing for rational values ## diff --git a/base/pkgid.jl b/base/pkgid.jl index 8c776d79a69cb..577529bbe7f63 100644 --- a/base/pkgid.jl +++ b/base/pkgid.jl @@ -17,7 +17,7 @@ end ==(a::PkgId, b::PkgId) = a.uuid == b.uuid && a.name == b.name function hash(pkg::PkgId, h::UInt) - h += 0xc9f248583a0ca36c % UInt + h ⊻= 0xc9f248583a0ca36c % UInt h = hash(pkg.uuid, h) h = hash(pkg.name, h) return h diff --git a/base/regex.jl b/base/regex.jl index 09922b8a25111..d822837d7d8d6 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -802,7 +802,7 @@ end ## hash ## const hashre_seed = UInt === UInt64 ? 0x67e195eb8555e72d : 0xe32373e4 function hash(r::Regex, h::UInt) - h += hashre_seed + h ⊻= hashre_seed h = hash(r.pattern, h) h = hash(r.compile_options, h) h = hash(r.match_options, h) diff --git a/base/stacktraces.jl b/base/stacktraces.jl index 806c9468efed4..ff7df04e80138 100644 --- a/base/stacktraces.jl +++ b/base/stacktraces.jl @@ -90,7 +90,7 @@ function ==(a::StackFrame, b::StackFrame) end function hash(frame::StackFrame, h::UInt) - h += 0xf4fbda67fe20ce88 % UInt + h ⊻= 0xf4fbda67fe20ce88 % UInt h = hash(frame.line, h) h = hash(frame.file, h) h = hash(frame.func, h) diff --git a/base/version.jl b/base/version.jl index b362daa78f04f..71192916a5b22 100644 --- a/base/version.jl +++ b/base/version.jl @@ -218,7 +218,7 @@ function isless(a::VersionNumber, b::VersionNumber) end function hash(v::VersionNumber, h::UInt) - h += 0x8ff4ffdb75f9fede % UInt + h ⊻= 0x8ff4ffdb75f9fede % UInt h = hash(v.major, h) h = hash(v.minor, h) h = hash(v.patch, h) From 7b3a1bf51353c41648836451653b20fcf1439c6c Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Tue, 25 Feb 2025 10:26:19 -0500 Subject: [PATCH 06/32] try feistel for h64 --- base/hashing.jl | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/base/hashing.jl b/base/hashing.jl index 514feecd785b0..8d984cbc4ea75 100644 --- a/base/hashing.jl +++ b/base/hashing.jl @@ -33,6 +33,11 @@ See also: [`objectid`](@ref), [`Dict`](@ref), [`Set`](@ref). hash(data) = hash(data, RAPID_SEED) hash(@nospecialize(data), h::UInt64) = hash(objectid(data), h) +function mul_parts(a::UInt64, b::UInt64) + p = widemul(a, b) + return (p >> 64) % UInt64, p % UInt64 +end +rapid_mix(a::UInt64, b::UInt64) = ⊻(mul_parts(a, b)...) mul_hi64(A::UInt64, B::UInt64) = ((widen(A) * B) >> 64) % UInt64 rapid_mix(A, B) = mul_hi64(A, B) ⊻ (A * B) @@ -112,21 +117,15 @@ function hash( a = a ⊻ secret[2] b = b ⊻ seed - a, b = a * b, mul_hi64(a, b) + b, a = mul_parts(a, b) return rapid_mix(a ⊻ secret[1] ⊻ buflen, b ⊻ secret[2]) end function hash_64_64(data::UInt64, seed::UInt64, secret::NTuple{3, UInt64}) - seed = seed ⊻ (rapid_mix(seed ⊻ secret[1], secret[2]) ⊻ 8) - - a = (UInt64(bswap((data >>> 32) % UInt32)) << 32) | UInt64(bswap(data % UInt32)) - b = (a << 32) | (a >>> 32) - a = a ⊻ secret[2] - b = b ⊻ seed - a, b = a * b, mul_hi64(a, b) - return rapid_mix(a ⊻ secret[1] ⊻ 8, b ⊻ secret[2]) + return data ⊻ rapid_mix(data ⊻ rapid_mix(seed, secret[1]), secret[2]) end + hash_64_32(data::UInt64, seed::UInt64, secret::NTuple{3, UInt64}) = hash_64_64(data, seed, secret) % UInt32 hash_32_32(data::UInt32, seed::UInt64, secret::NTuple{3, UInt64}) = From 17322a2248d710d46a1a2e5e35a08f0b17dae424 Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Wed, 26 Feb 2025 20:05:44 -0500 Subject: [PATCH 07/32] change name back --- base/hashing.jl | 30 ++++++++++++++---------------- base/strings/substring.jl | 2 +- test/arrayops.jl | 2 +- test/hashing.jl | 8 ++++---- test/tuple.jl | 8 ++++---- 5 files changed, 24 insertions(+), 26 deletions(-) diff --git a/base/hashing.jl b/base/hashing.jl index 8d984cbc4ea75..e45965eaec311 100644 --- a/base/hashing.jl +++ b/base/hashing.jl @@ -1,7 +1,7 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license -const RAPID_SEED = UInt64(0xbdd89aa982704029) -const RAPID_SECRET = tuple( +const HASH_SEED = UInt64(0xbdd89aa982704029) +const HASH_SECRET = tuple( 0x2d358dccaa6c78a5, 0x8bb84b93962eacc9, 0x4b33a62ed433d4a3, @@ -30,17 +30,15 @@ julia> hash(10, a) # only use the output of another hash function as the second See also: [`objectid`](@ref), [`Dict`](@ref), [`Set`](@ref). """ -hash(data) = hash(data, RAPID_SEED) +hash(data) = hash(data, HASH_SEED) hash(@nospecialize(data), h::UInt64) = hash(objectid(data), h) function mul_parts(a::UInt64, b::UInt64) p = widemul(a, b) return (p >> 64) % UInt64, p % UInt64 end -rapid_mix(a::UInt64, b::UInt64) = ⊻(mul_parts(a, b)...) +hash_mix(a::UInt64, b::UInt64) = ⊻(mul_parts(a, b)...) -mul_hi64(A::UInt64, B::UInt64) = ((widen(A) * B) >> 64) % UInt64 -rapid_mix(A, B) = mul_hi64(A, B) ⊻ (A * B) load_le(::Type{T}, ptr::Ptr{UInt8}, i) where {T <: Union{UInt32, UInt64}} = unsafe_load(convert(Ptr{T}, ptr + i - 1)) @@ -58,7 +56,7 @@ function hash( secret::NTuple{3, UInt64} ) buflen = UInt64(n) - seed = seed ⊻ (rapid_mix(seed ⊻ secret[1], secret[2]) ⊻ buflen) + seed = seed ⊻ (hash_mix(seed ⊻ secret[1], secret[2]) ⊻ buflen) a = zero(UInt64) b = zero(UInt64) @@ -81,15 +79,15 @@ function hash( see1 = seed see2 = seed while i ≥ 48 - seed = rapid_mix( + seed = hash_mix( load_le(UInt64, ptr, pos) ⊻ secret[1], load_le(UInt64, ptr, pos + 8) ⊻ seed ) - see1 = rapid_mix( + see1 = hash_mix( load_le(UInt64, ptr, pos + 16) ⊻ secret[2], load_le(UInt64, ptr, pos + 24) ⊻ see1 ) - see2 = rapid_mix( + see2 = hash_mix( load_le(UInt64, ptr, pos + 32) ⊻ secret[3], load_le(UInt64, ptr, pos + 40) ⊻ see2 ) @@ -99,12 +97,12 @@ function hash( seed = seed ⊻ see1 ⊻ see2 end if i > 16 - seed = rapid_mix( + seed = hash_mix( load_le(UInt64, ptr, pos) ⊻ secret[3], load_le(UInt64, ptr, pos + 8) ⊻ seed ⊻ secret[2] ) if i > 32 - seed = rapid_mix( + seed = hash_mix( load_le(UInt64, ptr, pos + 16) ⊻ secret[3], load_le(UInt64, ptr, pos + 24) ⊻ seed ) @@ -118,12 +116,12 @@ function hash( a = a ⊻ secret[2] b = b ⊻ seed b, a = mul_parts(a, b) - return rapid_mix(a ⊻ secret[1] ⊻ buflen, b ⊻ secret[2]) + return hash_mix(a ⊻ secret[1] ⊻ buflen, b ⊻ secret[2]) end function hash_64_64(data::UInt64, seed::UInt64, secret::NTuple{3, UInt64}) - return data ⊻ rapid_mix(data ⊻ rapid_mix(seed, secret[1]), secret[2]) + return data ⊻ hash_mix(data ⊻ hash_mix(seed, secret[1]), secret[2]) end hash_64_32(data::UInt64, seed::UInt64, secret::NTuple{3, UInt64}) = @@ -139,7 +137,7 @@ else const hash_uint = hash_32_32 end -hash(x::UInt64, h::UInt) = hash_uint64(x, h, RAPID_SECRET) +hash(x::UInt64, h::UInt) = hash_uint64(x, h, HASH_SECRET) hash(x::Int64, h::UInt) = hash(bitcast(UInt64, x), h) hash(x::Union{Bool, Int8, UInt8, Int16, UInt16, Int32, UInt32}, h::UInt) = hash(Int64(x), h) @@ -164,7 +162,7 @@ else end # hash(data::String, h::UInt64) = hash(length(data), h) -hash(data::String, h::UInt64) = @assume_effects :total GC.@preserve data hash(pointer(data), sizeof(data), h, RAPID_SECRET) +hash(data::String, h::UInt64) = @assume_effects :total GC.@preserve data hash(pointer(data), sizeof(data), h, HASH_SECRET) hash(w::WeakRef, h::UInt64) = hash(w.value, h) function hash(T::Type, h::UInt64) diff --git a/base/strings/substring.jl b/base/strings/substring.jl index 8679864ef422e..8b9428c0be837 100644 --- a/base/strings/substring.jl +++ b/base/strings/substring.jl @@ -136,7 +136,7 @@ pointer(x::SubString{String}) = pointer(x.string) + x.offset pointer(x::SubString{String}, i::Integer) = pointer(x.string) + x.offset + (i-1) hash(data::SubString{String}, h::UInt64) = - GC.@preserve data hash(pointer(data), sizeof(data), h, RAPID_SECRET) + GC.@preserve data hash(pointer(data), sizeof(data), h, HASH_SECRET) _isannotated(::SubString{T}) where {T} = _isannotated(T) diff --git a/test/arrayops.jl b/test/arrayops.jl index 4f3da315e4cda..7528afc7f9592 100644 --- a/test/arrayops.jl +++ b/test/arrayops.jl @@ -2197,7 +2197,7 @@ end # All we really care about is that we have an optimized # implementation, but the seed is a useful way to check that. -@test hash(CartesianIndex()) == Base.IteratorsMD.cartindexhash_seed ⊻ Base.RAPID_SEED +@test hash(CartesianIndex()) == Base.IteratorsMD.cartindexhash_seed ⊻ Base.HASH_SEED @test hash(CartesianIndex(1, 2)) != hash((1, 2)) @testset "itr, iterate" begin diff --git a/test/hashing.jl b/test/hashing.jl index fed7f37117d8c..c831f4806030e 100644 --- a/test/hashing.jl +++ b/test/hashing.jl @@ -249,8 +249,8 @@ end ) for a in vals, b in vals - ha = Base.hash_64_32(a, Base.RAPID_SEED, Base.RAPID_SECRET) - hb = Base.hash_64_32(b, Base.RAPID_SEED, Base.RAPID_SECRET) + ha = Base.hash_64_32(a, Base.HASH_SEED, Base.HASH_SECRET) + hb = Base.hash_64_32(b, Base.HASH_SEED, Base.HASH_SECRET) @test isequal(a, b) == (ha == hb) end end @@ -262,8 +262,8 @@ end ) for a in vals, b in vals - ha = Base.hash_32_32(a, Base.RAPID_SEED, Base.RAPID_SECRET) - hb = Base.hash_32_32(b, Base.RAPID_SEED, Base.RAPID_SECRET) + ha = Base.hash_32_32(a, Base.HASH_SEED, Base.HASH_SECRET) + hb = Base.hash_32_32(b, Base.HASH_SEED, Base.HASH_SECRET) @test isequal(a, b) == (ha == hb) end end diff --git a/test/tuple.jl b/test/tuple.jl index 25a8b17eb7cde..560a1425c6bb6 100644 --- a/test/tuple.jl +++ b/test/tuple.jl @@ -369,9 +369,9 @@ end @test !isless((1,2), (1,2)) @test !isless((2,1), (1,2)) - @test hash(()) === Base.tuplehash_seed ⊻ Base.RAPID_SEED - @test hash((1,)) === hash(1, Base.tuplehash_seed ⊻ Base.RAPID_SEED) - @test hash((1,2)) === hash(1, hash(2, Base.tuplehash_seed ⊻ Base.RAPID_SEED)) + @test hash(()) === Base.tuplehash_seed ⊻ Base.HASH_SEED + @test hash((1,)) === hash(1, Base.tuplehash_seed ⊻ Base.HASH_SEED) + @test hash((1,2)) === hash(1, hash(2, Base.tuplehash_seed ⊻ Base.HASH_SEED)) # Test Any32 methods t = ntuple(identity, 32) @@ -393,7 +393,7 @@ end @test !isless((t...,1,2), (t...,1,2)) @test !isless((t...,2,1), (t...,1,2)) - @test hash(t) === foldr(hash, vcat(1:32, (), Base.RAPID_SEED)) + @test hash(t) === foldr(hash, vcat(1:32, (), Base.HASH_SEED)) end @testset "functions" begin From 948a56efe4972427c366c462062f1a61d7653957 Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Wed, 26 Feb 2025 20:53:52 -0500 Subject: [PATCH 08/32] apply some review comments --- base/hashing.jl | 108 +++++++++++++++++++------------------- base/strings/substring.jl | 4 +- 2 files changed, 56 insertions(+), 56 deletions(-) diff --git a/base/hashing.jl b/base/hashing.jl index e45965eaec311..c9262a632a55a 100644 --- a/base/hashing.jl +++ b/base/hashing.jl @@ -30,8 +30,13 @@ julia> hash(10, a) # only use the output of another hash function as the second See also: [`objectid`](@ref), [`Dict`](@ref), [`Set`](@ref). """ -hash(data) = hash(data, HASH_SEED) -hash(@nospecialize(data), h::UInt64) = hash(objectid(data), h) +hash(data::Any) = hash(data, HASH_SEED) +hash(w::WeakRef, h::UInt) = hash(w.value, h) + +# Types can't be deleted, so marking as total allows the compiler to look up the hash +hash(T::Type, h::UInt) = + hash((Base.@assume_effects :total ccall(:jl_type_hash, UInt, (Any,), T)), h) +hash(@nospecialize(data), h::UInt) = hash(objectid(data), h) function mul_parts(a::UInt64, b::UInt64) p = widemul(a, b) @@ -40,6 +45,50 @@ end hash_mix(a::UInt64, b::UInt64) = ⊻(mul_parts(a, b)...) +function hash_64_64(data::UInt64, seed::UInt64, secret::NTuple{3, UInt64}) + return data ⊻ hash_mix(data ⊻ hash_mix(seed, secret[1]), secret[2]) +end + +hash_64_32(data::UInt64, seed::UInt64, secret::NTuple{3, UInt64}) = + hash_64_64(data, seed, secret) % UInt32 +hash_32_32(data::UInt32, seed::UInt64, secret::NTuple{3, UInt64}) = + hash_64_32(UInt64(data), seed, secret) + +if UInt === UInt64 + const hash_uint64 = hash_64_64 + const hash_uint = hash_64_64 +else + const hash_uint64 = hash_64_32 + const hash_uint = hash_32_32 +end + +hash(x::UInt64, h::UInt) = hash_uint64(promote(x, h)..., HASH_SECRET) +hash(x::Int64, h::UInt) = hash(bitcast(UInt64, x), h) +hash(x::Union{Bool, Int8, UInt8, Int16, UInt16, Int32, UInt32}, h::UInt) = hash(Int64(x), h) + +function hash_integer(n::Integer, h::UInt) + h ⊻= hash((n % UInt) ⊻ h) + n = abs(n) + n >>>= sizeof(UInt) << 3 + while n != 0 + h ⊻= hash((n % UInt) ⊻ h) + n >>>= sizeof(UInt) << 3 + end + return h +end + +## symbol & expression hashing ## +if UInt === UInt64 + hash(x::Expr, h::UInt) = hash(x.args, hash(x.head, h + 0x83c7900696d26dc6)) + hash(x::QuoteNode, h::UInt) = hash(x.value, h + 0x2c97bf8b3de87020) +else + hash(x::Expr, h::UInt) = hash(x.args, hash(x.head, h + 0x96d26dc6)) + hash(x::QuoteNode, h::UInt) = hash(x.value, h + 0x469d72af) +end + +hash(x::Symbol) = objectid(x) + + load_le(::Type{T}, ptr::Ptr{UInt8}, i) where {T <: Union{UInt32, UInt64}} = unsafe_load(convert(Ptr{T}, ptr + i - 1)) @@ -49,7 +98,7 @@ function read_small(ptr::Ptr{UInt8}, n::Int) UInt64(unsafe_load(ptr, n)) end -function hash( +function hash_bytes( ptr::Ptr{UInt8}, n::Int, seed::UInt64, @@ -119,54 +168,5 @@ function hash( return hash_mix(a ⊻ secret[1] ⊻ buflen, b ⊻ secret[2]) end - -function hash_64_64(data::UInt64, seed::UInt64, secret::NTuple{3, UInt64}) - return data ⊻ hash_mix(data ⊻ hash_mix(seed, secret[1]), secret[2]) -end - -hash_64_32(data::UInt64, seed::UInt64, secret::NTuple{3, UInt64}) = - hash_64_64(data, seed, secret) % UInt32 -hash_32_32(data::UInt32, seed::UInt64, secret::NTuple{3, UInt64}) = - hash_64_32(UInt64(data), seed, secret) - -if UInt === UInt64 - const hash_uint64 = hash_64_64 - const hash_uint = hash_64_64 -else - const hash_uint64 = hash_64_32 - const hash_uint = hash_32_32 -end - -hash(x::UInt64, h::UInt) = hash_uint64(x, h, HASH_SECRET) -hash(x::Int64, h::UInt) = hash(bitcast(UInt64, x), h) -hash(x::Union{Bool, Int8, UInt8, Int16, UInt16, Int32, UInt32}, h::UInt) = hash(Int64(x), h) - -function hash_integer(n::Integer, h::UInt) - h ⊻= hash((n % UInt) ⊻ h) - n = abs(n) - n >>>= sizeof(UInt) << 3 - while n != 0 - h ⊻= hash((n % UInt) ⊻ h) - n >>>= sizeof(UInt) << 3 - end - return h -end - - -if UInt === UInt64 - hash(x::Expr, h::UInt) = hash(x.args, hash(x.head, h + 0x83c7900696d26dc6)) - hash(x::QuoteNode, h::UInt) = hash(x.value, h + 0x2c97bf8b3de87020) -else - hash(x::Expr, h::UInt) = hash(x.args, hash(x.head, h + 0x96d26dc6)) - hash(x::QuoteNode, h::UInt) = hash(x.value, h + 0x469d72af) -end - -# hash(data::String, h::UInt64) = hash(length(data), h) -hash(data::String, h::UInt64) = @assume_effects :total GC.@preserve data hash(pointer(data), sizeof(data), h, HASH_SECRET) - -hash(w::WeakRef, h::UInt64) = hash(w.value, h) -function hash(T::Type, h::UInt64) - return hash((Base.@assume_effects :total ccall(:jl_type_hash, UInt, (Any,), T)), h) -end - -hash(x::Symbol) = objectid(x) +hash(data::String, h::UInt) = + @assume_effects :total GC.@preserve data hash_bytes(pointer(data), sizeof(data), UInt64(h), HASH_SECRET) diff --git a/base/strings/substring.jl b/base/strings/substring.jl index 8b9428c0be837..0b812ed8246d3 100644 --- a/base/strings/substring.jl +++ b/base/strings/substring.jl @@ -135,8 +135,8 @@ end pointer(x::SubString{String}) = pointer(x.string) + x.offset pointer(x::SubString{String}, i::Integer) = pointer(x.string) + x.offset + (i-1) -hash(data::SubString{String}, h::UInt64) = - GC.@preserve data hash(pointer(data), sizeof(data), h, HASH_SECRET) +hash(data::SubString{String}, h::UInt) = + GC.@preserve data hash_bytes(pointer(data), sizeof(data), UInt64(h), HASH_SECRET) _isannotated(::SubString{T}) where {T} = _isannotated(T) From 19dbcb4e05c48c107be99a9700aa8a43c33df680 Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Thu, 27 Feb 2025 12:23:21 -0500 Subject: [PATCH 09/32] add :terminates_globally effect --- base/hashing.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/base/hashing.jl b/base/hashing.jl index c9262a632a55a..f046c38c2e50c 100644 --- a/base/hashing.jl +++ b/base/hashing.jl @@ -98,7 +98,7 @@ function read_small(ptr::Ptr{UInt8}, n::Int) UInt64(unsafe_load(ptr, n)) end -function hash_bytes( +@assume_effects :terminates_globally function hash_bytes( ptr::Ptr{UInt8}, n::Int, seed::UInt64, @@ -168,5 +168,5 @@ function hash_bytes( return hash_mix(a ⊻ secret[1] ⊻ buflen, b ⊻ secret[2]) end -hash(data::String, h::UInt) = - @assume_effects :total GC.@preserve data hash_bytes(pointer(data), sizeof(data), UInt64(h), HASH_SECRET) +@assume_effects :total hash(data::String, h::UInt) = + GC.@preserve data hash_bytes(pointer(data), sizeof(data), UInt64(h), HASH_SECRET) From 8ab91a22ccb8d9b91ca9790a581b688ba05b8a8e Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Fri, 28 Feb 2025 09:16:01 -0500 Subject: [PATCH 10/32] simpler loop condition --- base/hashing.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/hashing.jl b/base/hashing.jl index f046c38c2e50c..016088c5eb32a 100644 --- a/base/hashing.jl +++ b/base/hashing.jl @@ -124,7 +124,7 @@ end else pos = 1 i = buflen - for _ in 1:div(buflen, 48) + while i ≥ 48 see1 = seed see2 = seed while i ≥ 48 From c9ba0a9c56c0620a3f1ed3a37007d77aadb824d2 Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Mon, 3 Mar 2025 23:17:39 -0500 Subject: [PATCH 11/32] clean doctest --- base/hashing.jl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/base/hashing.jl b/base/hashing.jl index 016088c5eb32a..75d4f62205a3f 100644 --- a/base/hashing.jl +++ b/base/hashing.jl @@ -22,10 +22,10 @@ The hash value may change when a new Julia process is started. ```jldoctest julia> a = hash(10) -0x64abb6a0b5357846 +r"0x[0-9a-f]{16}" julia> hash(10, a) # only use the output of another hash function as the second argument -0xeb3ffc597ad4eafd +r"0x[0-9a-f]{16}" ``` See also: [`objectid`](@ref), [`Dict`](@ref), [`Set`](@ref). @@ -104,6 +104,9 @@ end seed::UInt64, secret::NTuple{3, UInt64} ) + # With reference and gratitude to [rapidhash](https://github.com/Nicoshev/rapidhash) + # while the integer hashing methods defined in this file share the mixing function + # used by rapidhash, only this method implements the algorithm in its entirety. buflen = UInt64(n) seed = seed ⊻ (hash_mix(seed ⊻ secret[1], secret[2]) ⊻ buflen) From 240d84be14f60390fe6dbd0d37dc416ab2c675ce Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Mon, 3 Mar 2025 23:30:01 -0500 Subject: [PATCH 12/32] change message --- base/hashing.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/hashing.jl b/base/hashing.jl index 75d4f62205a3f..d214ce5458c70 100644 --- a/base/hashing.jl +++ b/base/hashing.jl @@ -104,7 +104,7 @@ end seed::UInt64, secret::NTuple{3, UInt64} ) - # With reference and gratitude to [rapidhash](https://github.com/Nicoshev/rapidhash) + # Adapted with gratitude from [rapidhash](https://github.com/Nicoshev/rapidhash) # while the integer hashing methods defined in this file share the mixing function # used by rapidhash, only this method implements the algorithm in its entirety. buflen = UInt64(n) From 9e955e187a5065f95b02f618b8f9dfbc45a26255 Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Tue, 4 Mar 2025 17:43:50 -0500 Subject: [PATCH 13/32] off by one --- base/hashing.jl | 4 ++-- test/hashing.jl | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/base/hashing.jl b/base/hashing.jl index d214ce5458c70..75df985e0916d 100644 --- a/base/hashing.jl +++ b/base/hashing.jl @@ -161,8 +161,8 @@ end end end - a = load_le(UInt64, ptr, n - 17) - b = load_le(UInt64, ptr, n - 9) + a = load_le(UInt64, ptr, n - 15) + b = load_le(UInt64, ptr, n - 7) end a = a ⊻ secret[2] diff --git a/test/hashing.jl b/test/hashing.jl index c831f4806030e..098910d008177 100644 --- a/test/hashing.jl +++ b/test/hashing.jl @@ -88,6 +88,7 @@ vals = Any[ Dict(42 => 101, 77 => 93), Dict{Any,Any}(42 => 101, 77 => 93), (1,2,3,4), (1.0,2.0,3.0,4.0), (1,3,2,4), ("a","b"), (SubString("a",1,1), SubString("b",1,1)), + join('c':'s'), SubString(join('a':'z'), 3, 19), # issue #6900 Dict(x => x for x in 1:10), Dict(7=>7,9=>9,4=>4,10=>10,2=>2,3=>3,8=>8,5=>5,6=>6,1=>1), @@ -108,7 +109,7 @@ vals = Any[ ["a", "b", 1, 2], ["a", 1, 2], ["a", "b", 2, 2], ["a", "a", 1, 2], ["a", "b", 2, 3] ] -for a in vals, b in vals +for (i, a) in enumerate(vals), b in vals[i:end] @test isequal(a,b) == (hash(a)==hash(b)) end From 5ace2c59efcdc9ad47b2e332efbd24f857da00b5 Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Tue, 4 Mar 2025 21:54:39 -0500 Subject: [PATCH 14/32] off byones --- base/hashing.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/base/hashing.jl b/base/hashing.jl index 75df985e0916d..7e4cdbdbba934 100644 --- a/base/hashing.jl +++ b/base/hashing.jl @@ -94,7 +94,7 @@ load_le(::Type{T}, ptr::Ptr{UInt8}, i) where {T <: Union{UInt32, UInt64}} = function read_small(ptr::Ptr{UInt8}, n::Int) return (UInt64(unsafe_load(ptr)) << 56) | - (UInt64(unsafe_load(ptr, div(n + 1, 2))) << 32) | + (UInt64(unsafe_load(ptr, div(n, 2) + 1)) << 32) | UInt64(unsafe_load(ptr, n)) end @@ -173,3 +173,7 @@ end @assume_effects :total hash(data::String, h::UInt) = GC.@preserve data hash_bytes(pointer(data), sizeof(data), UInt64(h), HASH_SECRET) + +# no longer used in Base, but a lot of packages access these internals +const memhash = UInt === UInt64 ? :memhash_seed : :memhash32_seed +const memhash_seed = UInt === UInt64 ? 0x71e729fd56419c81 : 0x56419c81 \ No newline at end of file From 5410e0967c7f000da0855d0e89c5fe7e0607d66b Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Tue, 4 Mar 2025 21:59:08 -0500 Subject: [PATCH 15/32] clean up arith --- base/hashing.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/base/hashing.jl b/base/hashing.jl index 7e4cdbdbba934..26f9f7374686f 100644 --- a/base/hashing.jl +++ b/base/hashing.jl @@ -116,11 +116,11 @@ end if buflen ≤ 16 if buflen ≥ 4 a = (UInt64(load_le(UInt32, ptr, 1)) << 32) | - UInt64(load_le(UInt32, ptr, n - 4 + 1)) + UInt64(load_le(UInt32, ptr, n - 3)) delta = (buflen & 24) >>> (buflen >>> 3) b = (UInt64(load_le(UInt32, ptr, delta + 1)) << 32) | - UInt64(load_le(UInt32, ptr, n - 4 - delta + 1)) + UInt64(load_le(UInt32, ptr, n - 3 - delta)) elseif buflen > 0 a = read_small(ptr, n) end @@ -176,4 +176,4 @@ end # no longer used in Base, but a lot of packages access these internals const memhash = UInt === UInt64 ? :memhash_seed : :memhash32_seed -const memhash_seed = UInt === UInt64 ? 0x71e729fd56419c81 : 0x56419c81 \ No newline at end of file +const memhash_seed = UInt === UInt64 ? 0x71e729fd56419c81 : 0x56419c81 From 62f5904338f0267b01bcbcf45912b738a797d044 Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Tue, 4 Mar 2025 23:01:12 -0500 Subject: [PATCH 16/32] fix show test again --- test/show.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/show.jl b/test/show.jl index 100302e1649b9..a9aa874bd16c3 100644 --- a/test/show.jl +++ b/test/show.jl @@ -1950,8 +1950,8 @@ end @test replstr(view(A, [1], :)) == "1×1 view(::Matrix{Float64}, [1], :) with eltype Float64:\n 0.0" # issue #27680 - @test showstr(Set([(1.0,1.0), (2.0,2.0), (3.0, 3.0)])) == (sizeof(Int) == 8 ? - "Set([(2.0, 2.0), (3.0, 3.0), (1.0, 1.0)])" : + @test showstr(Set([(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)])) == (sizeof(Int) == 8 ? + "Set([(1.0, 1.0), (3.0, 3.0), (2.0, 2.0)])" : "Set([(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)])") # issue #27747 From b41e6b9bb1e1589c9bdcba83ec4fdaaf6f1d8abb Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Fri, 7 Mar 2025 15:07:45 -0500 Subject: [PATCH 17/32] friendlier to 32bit systems --- base/hashing.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/base/hashing.jl b/base/hashing.jl index 26f9f7374686f..c4e31418ee96c 100644 --- a/base/hashing.jl +++ b/base/hashing.jl @@ -1,6 +1,6 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license -const HASH_SEED = UInt64(0xbdd89aa982704029) +const HASH_SEED = UInt == UInt64 ? 0xbdd89aa982704029 : 0xeabe9406 const HASH_SECRET = tuple( 0x2d358dccaa6c78a5, 0x8bb84b93962eacc9, @@ -45,13 +45,13 @@ end hash_mix(a::UInt64, b::UInt64) = ⊻(mul_parts(a, b)...) -function hash_64_64(data::UInt64, seed::UInt64, secret::NTuple{3, UInt64}) - return data ⊻ hash_mix(data ⊻ hash_mix(seed, secret[1]), secret[2]) +function hash_64_64(data::UInt64, seed::UInt, secret::NTuple{3, UInt64}) + return data ⊻ hash_mix(data ⊻ hash_mix(UInt64(seed), secret[1]), secret[2]) end -hash_64_32(data::UInt64, seed::UInt64, secret::NTuple{3, UInt64}) = +hash_64_32(data::UInt64, seed::UInt, secret::NTuple{3, UInt64}) = hash_64_64(data, seed, secret) % UInt32 -hash_32_32(data::UInt32, seed::UInt64, secret::NTuple{3, UInt64}) = +hash_32_32(data::UInt32, seed::UInt, secret::NTuple{3, UInt64}) = hash_64_32(UInt64(data), seed, secret) if UInt === UInt64 @@ -62,7 +62,7 @@ else const hash_uint = hash_32_32 end -hash(x::UInt64, h::UInt) = hash_uint64(promote(x, h)..., HASH_SECRET) +hash(x::UInt64, h::UInt) = hash_uint64(x, h, HASH_SECRET) hash(x::Int64, h::UInt) = hash(bitcast(UInt64, x), h) hash(x::Union{Bool, Int8, UInt8, Int16, UInt16, Int32, UInt32}, h::UInt) = hash(Int64(x), h) @@ -172,7 +172,7 @@ end end @assume_effects :total hash(data::String, h::UInt) = - GC.@preserve data hash_bytes(pointer(data), sizeof(data), UInt64(h), HASH_SECRET) + GC.@preserve data hash_bytes(pointer(data), sizeof(data), UInt64(h), HASH_SECRET) % UInt # no longer used in Base, but a lot of packages access these internals const memhash = UInt === UInt64 ? :memhash_seed : :memhash32_seed From 8c8ac45e05788272db84e74596cede15af65bd7c Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Fri, 7 Mar 2025 15:28:45 -0500 Subject: [PATCH 18/32] uint32 for substring hash --- base/hashing.jl | 8 ++++---- base/strings/substring.jl | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/base/hashing.jl b/base/hashing.jl index c4e31418ee96c..3261d4e7d38fc 100644 --- a/base/hashing.jl +++ b/base/hashing.jl @@ -79,11 +79,11 @@ end ## symbol & expression hashing ## if UInt === UInt64 - hash(x::Expr, h::UInt) = hash(x.args, hash(x.head, h + 0x83c7900696d26dc6)) - hash(x::QuoteNode, h::UInt) = hash(x.value, h + 0x2c97bf8b3de87020) + hash(x::Expr, h::UInt) = hash(x.args, hash(x.head, h ⊻ 0x83c7900696d26dc6)) + hash(x::QuoteNode, h::UInt) = hash(x.value, h ⊻ 0x2c97bf8b3de87020) else - hash(x::Expr, h::UInt) = hash(x.args, hash(x.head, h + 0x96d26dc6)) - hash(x::QuoteNode, h::UInt) = hash(x.value, h + 0x469d72af) + hash(x::Expr, h::UInt) = hash(x.args, hash(x.head, h ⊻ 0x469d72af)) + hash(x::QuoteNode, h::UInt) = hash(x.value, h ⊻ 0x469d72af) end hash(x::Symbol) = objectid(x) diff --git a/base/strings/substring.jl b/base/strings/substring.jl index 0b812ed8246d3..a15f5e017f5cd 100644 --- a/base/strings/substring.jl +++ b/base/strings/substring.jl @@ -136,7 +136,7 @@ pointer(x::SubString{String}) = pointer(x.string) + x.offset pointer(x::SubString{String}, i::Integer) = pointer(x.string) + x.offset + (i-1) hash(data::SubString{String}, h::UInt) = - GC.@preserve data hash_bytes(pointer(data), sizeof(data), UInt64(h), HASH_SECRET) + GC.@preserve data hash_bytes(pointer(data), sizeof(data), UInt64(h), HASH_SECRET) % UInt _isannotated(::SubString{T}) where {T} = _isannotated(T) From 15eb0f941aa182a202e9975a7b50f247ba0e7e4d Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Tue, 18 Mar 2025 21:37:10 -0400 Subject: [PATCH 19/32] re-change float --- base/hashing.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/base/hashing.jl b/base/hashing.jl index ddf19fae14a88..b8cd18305c3c9 100644 --- a/base/hashing.jl +++ b/base/hashing.jl @@ -79,7 +79,7 @@ end ## efficient value-based hashing of floats ## -const hx_NaN = hash_uint64(reinterpret(UInt64, NaN)) +const hx_NaN = hash(reinterpret(UInt64, NaN)) function hash(x::Float64, h::UInt) # see comments on trunc and hash(Real, UInt) if typemin(Int64) <= x < typemax(Int64) @@ -95,7 +95,7 @@ function hash(x::Float64, h::UInt) elseif isnan(x) return hx_NaN ⊻ h # NaN does not have a stable bit pattern end - return hash_uint64(bitcast(UInt64, x)) - 3h + return hash(bitcast(UInt64, x), h) end hash(x::Float32, h::UInt) = hash(Float64(x), h) @@ -110,7 +110,7 @@ function hash(x::Float16, h::UInt) elseif isnan(x) return hx_NaN ⊻ h # NaN does not have a stable bit pattern end - return hash_uint64(bitcast(UInt64, Float64(x))) - 3h + return hash(bitcast(UInt64, Float64(x)), h) end ## generic hashing for rational values ## From ae7bee8a7a58b9981f2b37a931ee4d00fadc322c Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Fri, 28 Mar 2025 10:41:48 -0400 Subject: [PATCH 20/32] slightly narrow scope of PR; restore x-3h mixing --- base/char.jl | 2 +- base/gmp.jl | 8 ++++---- base/hashing.jl | 23 +++++++++++++++-------- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/base/char.jl b/base/char.jl index 1885a6620638d..be37adfc37425 100644 --- a/base/char.jl +++ b/base/char.jl @@ -222,7 +222,7 @@ in(x::AbstractChar, y::AbstractChar) = x == y ==(x::Char, y::Char) = bitcast(UInt32, x) == bitcast(UInt32, y) isless(x::Char, y::Char) = bitcast(UInt32, x) < bitcast(UInt32, y) hash(x::Char, h::UInt) = - hash(((bitcast(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h)) + hash_finalizer(((bitcast(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h)) # fallbacks: isless(x::AbstractChar, y::AbstractChar) = isless(Char(x), Char(y)) diff --git a/base/gmp.jl b/base/gmp.jl index 224b8e8b92e6c..6a5b05f6a7fc2 100644 --- a/base/gmp.jl +++ b/base/gmp.jl @@ -857,9 +857,9 @@ if Limb === UInt64 === UInt s == 0 && return hash_integer(0, h) p = convert(Ptr{UInt64}, n.d) b = unsafe_load(p) - h ⊻= hash(ifelse(s < 0, -b, b) ⊻ h) + h ⊻= hash_finalizer(ifelse(s < 0, -b, b) ⊻ h) for k = 2:abs(s) - h ⊻= hash(unsafe_load(p, k) ⊻ h) + h ⊻= hash_finalizer(unsafe_load(p, k) ⊻ h) end return h end @@ -893,7 +893,7 @@ if Limb === UInt64 === UInt return hash(ldexp(flipsign(Float64(limb), sz), pow), h) end h = hash_integer(pow, h) - h ⊻= hash(flipsign(limb, sz) ⊻ h) + h ⊻= hash_finalizer(flipsign(limb, sz) ⊻ h) for idx = idx+1:asz if shift == 0 limb = unsafe_load(ptr, idx) @@ -907,7 +907,7 @@ if Limb === UInt64 === UInt limb = limb2 << upshift | limb1 >> shift end end - h ⊻= hash(limb ⊻ h) + h ⊻= hash_finalizer(limb ⊻ h) end return h end diff --git a/base/hashing.jl b/base/hashing.jl index 9e78db940679c..caf85848ed425 100644 --- a/base/hashing.jl +++ b/base/hashing.jl @@ -44,15 +44,22 @@ function mul_parts(a::UInt64, b::UInt64) end hash_mix(a::UInt64, b::UInt64) = ⊻(mul_parts(a, b)...) - -function hash_64_64(data::UInt64, seed::UInt, secret::NTuple{3, UInt64}) - return data ⊻ hash_mix(data ⊻ hash_mix(UInt64(seed), secret[1]), secret[2]) +# faster-but-weaker than hash_mix intended for small keys +hash_mix_linear(a::UInt, b::UInt) = a - 3b + +function hash_finalizer(x) + # constants arduously discovered by Pelle Evensen + # https://mostlymangling.blogspot.com/2019/12/stronger-better-morer-moremur-better.html + x ⊻= (x >> 27) + x *= 0x3c79ac492ba7b653 + x ⊻= x >> 33 + x *= 0x1c69b3f74ac4ae35 + x ⊻= x >> 27 end -hash_64_32(data::UInt64, seed::UInt, secret::NTuple{3, UInt64}) = - hash_64_64(data, seed, secret) % UInt32 -hash_32_32(data::UInt32, seed::UInt, secret::NTuple{3, UInt64}) = - hash_64_32(UInt64(data), seed, secret) +hash_64_64(data::UInt64, seed::UInt) = hash_finalizer(hash_mix_linear(data, seed)) +hash_64_32(data::UInt64, seed::UInt) = hash_64_64(data, seed) % UInt32 +hash_32_32(data::UInt32, seed::UInt) = hash_64_32(UInt64(data), seed) if UInt === UInt64 const hash_uint64 = hash_64_64 @@ -62,7 +69,7 @@ else const hash_uint = hash_32_32 end -hash(x::UInt64, h::UInt) = hash_uint64(x, h, HASH_SECRET) +hash(x::UInt64, h::UInt) = hash_uint64(x, h) hash(x::Int64, h::UInt) = hash(bitcast(UInt64, x), h) hash(x::Union{Bool, Int8, UInt8, Int16, UInt16, Int32, UInt32}, h::UInt) = hash(Int64(x), h) From c9cd088009a90745896b5d7af7de05041460065f Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Fri, 28 Mar 2025 10:53:31 -0400 Subject: [PATCH 21/32] fix merge with doctests --- base/hashing.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/base/hashing.jl b/base/hashing.jl index caf85848ed425..7c2aaa497618b 100644 --- a/base/hashing.jl +++ b/base/hashing.jl @@ -22,10 +22,10 @@ The hash value may change when a new Julia process is started. ```jldoctest; filter = r"0x[0-9a-f]{16}" julia> a = hash(10) -r"0x[0-9a-f]{16}" +0x759d18cc5346a65f julia> hash(10, a) # only use the output of another hash function as the second argument -r"0x[0-9a-f]{16}" +0x03158cd61b1b0bd1 ``` See also: [`objectid`](@ref), [`Dict`](@ref), [`Set`](@ref). From 8cd46ce9d45aa828f9c0fe0340b2aa685ede2495 Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Fri, 28 Mar 2025 12:38:32 -0400 Subject: [PATCH 22/32] fix 32bit, again, hopefully? --- base/hashing.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/base/hashing.jl b/base/hashing.jl index 7c2aaa497618b..95babaa30a303 100644 --- a/base/hashing.jl +++ b/base/hashing.jl @@ -45,12 +45,12 @@ end hash_mix(a::UInt64, b::UInt64) = ⊻(mul_parts(a, b)...) # faster-but-weaker than hash_mix intended for small keys -hash_mix_linear(a::UInt, b::UInt) = a - 3b +hash_mix_linear(x::UInt64, h::UInt) = x - 3h -function hash_finalizer(x) +function hash_finalizer(x::UInt64) # constants arduously discovered by Pelle Evensen # https://mostlymangling.blogspot.com/2019/12/stronger-better-morer-moremur-better.html - x ⊻= (x >> 27) + x ⊻= x >> 27 x *= 0x3c79ac492ba7b653 x ⊻= x >> 33 x *= 0x1c69b3f74ac4ae35 From f55bb359250beca00b985ff91e9473d225931ba7 Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Fri, 28 Mar 2025 13:58:02 -0400 Subject: [PATCH 23/32] ugh --- base/char.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/char.jl b/base/char.jl index be37adfc37425..22ffa977ca6ed 100644 --- a/base/char.jl +++ b/base/char.jl @@ -222,7 +222,7 @@ in(x::AbstractChar, y::AbstractChar) = x == y ==(x::Char, y::Char) = bitcast(UInt32, x) == bitcast(UInt32, y) isless(x::Char, y::Char) = bitcast(UInt32, x) < bitcast(UInt32, y) hash(x::Char, h::UInt) = - hash_finalizer(((bitcast(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h)) + hash_finalizer(((bitcast(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h)) % UInt # fallbacks: isless(x::AbstractChar, y::AbstractChar) = isless(Char(x), Char(y)) From f1ed02d059a3dea39aaa44688de7befff7886a77 Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Sat, 5 Apr 2025 12:14:25 -0400 Subject: [PATCH 24/32] udpate comment --- base/hashing.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/base/hashing.jl b/base/hashing.jl index 95babaa30a303..0c0c33e90a712 100644 --- a/base/hashing.jl +++ b/base/hashing.jl @@ -193,8 +193,6 @@ end secret::NTuple{3, UInt64} ) # Adapted with gratitude from [rapidhash](https://github.com/Nicoshev/rapidhash) - # while the integer hashing methods defined in this file share the mixing function - # used by rapidhash, only this method implements the algorithm in its entirety. buflen = UInt64(n) seed = seed ⊻ (hash_mix(seed ⊻ secret[1], secret[2]) ⊻ buflen) From bfb1138346ee8ae0c40facd83fb5736369c3d3ac Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Sun, 6 Apr 2025 18:55:23 -0400 Subject: [PATCH 25/32] try a simpler finalizer and remove Base qualifier --- base/hashing.jl | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/base/hashing.jl b/base/hashing.jl index 0c0c33e90a712..102d9bd7e827e 100644 --- a/base/hashing.jl +++ b/base/hashing.jl @@ -35,7 +35,7 @@ hash(w::WeakRef, h::UInt) = hash(w.value, h) # Types can't be deleted, so marking as total allows the compiler to look up the hash hash(T::Type, h::UInt) = - hash((Base.@assume_effects :total ccall(:jl_type_hash, UInt, (Any,), T)), h) + hash((@assume_effects :total ccall(:jl_type_hash, UInt, (Any,), T)), h) hash(@nospecialize(data), h::UInt) = hash(objectid(data), h) function mul_parts(a::UInt64, b::UInt64) @@ -45,17 +45,8 @@ end hash_mix(a::UInt64, b::UInt64) = ⊻(mul_parts(a, b)...) # faster-but-weaker than hash_mix intended for small keys -hash_mix_linear(x::UInt64, h::UInt) = x - 3h - -function hash_finalizer(x::UInt64) - # constants arduously discovered by Pelle Evensen - # https://mostlymangling.blogspot.com/2019/12/stronger-better-morer-moremur-better.html - x ⊻= x >> 27 - x *= 0x3c79ac492ba7b653 - x ⊻= x >> 33 - x *= 0x1c69b3f74ac4ae35 - x ⊻= x >> 27 -end +hash_mix_linear(x::UInt64, h::UInt) = 3h - x +hash_finalizer(x::UInt64) = (x ⊻ 0x8d95d0cb4f172723) * 0xd32df181e077609d hash_64_64(data::UInt64, seed::UInt) = hash_finalizer(hash_mix_linear(data, seed)) hash_64_32(data::UInt64, seed::UInt) = hash_64_64(data, seed) % UInt32 From dd0e1bcdcfbb1f3dd6ea176baa8d27031c5c6497 Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Wed, 9 Apr 2025 00:52:05 -0400 Subject: [PATCH 26/32] last update to finalizer --- base/hashing.jl | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/base/hashing.jl b/base/hashing.jl index 102d9bd7e827e..0b8c63358a69f 100644 --- a/base/hashing.jl +++ b/base/hashing.jl @@ -46,7 +46,12 @@ hash_mix(a::UInt64, b::UInt64) = ⊻(mul_parts(a, b)...) # faster-but-weaker than hash_mix intended for small keys hash_mix_linear(x::UInt64, h::UInt) = 3h - x -hash_finalizer(x::UInt64) = (x ⊻ 0x8d95d0cb4f172723) * 0xd32df181e077609d +function hash_finalizer(x::UInt64) + x ⊻= (x >> 32) + x *= 0x63652a4cd374b267 + x ⊻= (x >> 33) + return x +end hash_64_64(data::UInt64, seed::UInt) = hash_finalizer(hash_mix_linear(data, seed)) hash_64_32(data::UInt64, seed::UInt) = hash_64_64(data, seed) % UInt32 From 848b4446bb01650dd4ad3ef3ef95006870b726ef Mon Sep 17 00:00:00 2001 From: adienes <51664769+adienes@users.noreply.github.com> Date: Thu, 24 Apr 2025 10:58:53 -0700 Subject: [PATCH 27/32] Update base/gmp.jl Co-authored-by: Jeff Bezanson --- base/gmp.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/gmp.jl b/base/gmp.jl index a28b429be6415..c42c97ebc681f 100644 --- a/base/gmp.jl +++ b/base/gmp.jl @@ -847,7 +847,7 @@ if Limb === UInt64 === UInt # an optimized version for BigInt of hash_integer (used e.g. for Rational{BigInt}), # and of hash - using .Base: hash + using .Base: hash_finalizer function hash_integer(n::BigInt, h::UInt) GC.@preserve n begin From ce8885468bc896a5b24c775c3fcc3029e58c48f1 Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Sun, 27 Apr 2025 11:00:00 -0700 Subject: [PATCH 28/32] apply review --- base/gmp.jl | 6 +++--- base/hashing.jl | 12 ++++++------ test/hashing.jl | 8 ++++---- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/base/gmp.jl b/base/gmp.jl index c42c97ebc681f..dd271d1b8fcb5 100644 --- a/base/gmp.jl +++ b/base/gmp.jl @@ -847,7 +847,7 @@ if Limb === UInt64 === UInt # an optimized version for BigInt of hash_integer (used e.g. for Rational{BigInt}), # and of hash - using .Base: hash_finalizer + using .Base: hash_uint function hash_integer(n::BigInt, h::UInt) GC.@preserve n begin @@ -855,9 +855,9 @@ if Limb === UInt64 === UInt s == 0 && return hash_integer(0, h) p = convert(Ptr{UInt64}, n.d) b = unsafe_load(p) - h ⊻= hash_finalizer(ifelse(s < 0, -b, b) ⊻ h) + h ⊻= hash_uint(ifelse(s < 0, -b, b) ⊻ h) for k = 2:abs(s) - h ⊻= hash_finalizer(unsafe_load(p, k) ⊻ h) + h ⊻= hash_uint(unsafe_load(p, k) ⊻ h) end return h end diff --git a/base/hashing.jl b/base/hashing.jl index 0b8c63358a69f..c409d3ae7940f 100644 --- a/base/hashing.jl +++ b/base/hashing.jl @@ -53,9 +53,9 @@ function hash_finalizer(x::UInt64) return x end -hash_64_64(data::UInt64, seed::UInt) = hash_finalizer(hash_mix_linear(data, seed)) -hash_64_32(data::UInt64, seed::UInt) = hash_64_64(data, seed) % UInt32 -hash_32_32(data::UInt32, seed::UInt) = hash_64_32(UInt64(data), seed) +hash_64_64(data::UInt64) = hash_finalizer(data) +hash_64_32(data::UInt64) = hash_64_64(data) % UInt32 +hash_32_32(data::UInt32) = hash_64_32(UInt64(data)) if UInt === UInt64 const hash_uint64 = hash_64_64 @@ -65,16 +65,16 @@ else const hash_uint = hash_32_32 end -hash(x::UInt64, h::UInt) = hash_uint64(x, h) +hash(x::UInt64, h::UInt) = hash_uint64(hash_mix_linear(x, h)) hash(x::Int64, h::UInt) = hash(bitcast(UInt64, x), h) hash(x::Union{Bool, Int8, UInt8, Int16, UInt16, Int32, UInt32}, h::UInt) = hash(Int64(x), h) function hash_integer(n::Integer, h::UInt) - h ⊻= hash((n % UInt) ⊻ h) + h ⊻= hash_uint((n % UInt) ⊻ h) n = abs(n) n >>>= sizeof(UInt) << 3 while n != 0 - h ⊻= hash((n % UInt) ⊻ h) + h ⊻= hash_uint((n % UInt) ⊻ h) n >>>= sizeof(UInt) << 3 end return h diff --git a/test/hashing.jl b/test/hashing.jl index 098910d008177..41a1d525961cc 100644 --- a/test/hashing.jl +++ b/test/hashing.jl @@ -250,8 +250,8 @@ end ) for a in vals, b in vals - ha = Base.hash_64_32(a, Base.HASH_SEED, Base.HASH_SECRET) - hb = Base.hash_64_32(b, Base.HASH_SEED, Base.HASH_SECRET) + ha = Base.hash_64_32(a) + hb = Base.hash_64_32(b) @test isequal(a, b) == (ha == hb) end end @@ -263,8 +263,8 @@ end ) for a in vals, b in vals - ha = Base.hash_32_32(a, Base.HASH_SEED, Base.HASH_SECRET) - hb = Base.hash_32_32(b, Base.HASH_SEED, Base.HASH_SECRET) + ha = Base.hash_32_32(a) + hb = Base.hash_32_32(b) @test isequal(a, b) == (ha == hb) end end From d6b75a8ff7bfd1de68afbcc78d031e81a0c80d9b Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Sun, 27 Apr 2025 11:23:12 -0700 Subject: [PATCH 29/32] add NEWS --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index c6db2bbfbbe28..0a44b7adf5ef7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,6 +9,8 @@ New language features Language changes ---------------- + - The `hash` algorithm and its values have changed. Most `hash` specializations will remain correct and require no action. Types that reimplement the core hashing logic independently, such as some third-party string packages do, may require a migration to the new algorithm. ([#57509]) + Compiler/Runtime improvements ----------------------------- From 574cb151fff9f93de2b1b24c40b393ac37fde8d2 Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Tue, 29 Apr 2025 10:34:30 -0400 Subject: [PATCH 30/32] re-fix test --- test/show.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/show.jl b/test/show.jl index 67b5579d446eb..dd6409d7da0f0 100644 --- a/test/show.jl +++ b/test/show.jl @@ -1958,7 +1958,7 @@ end # issue #27680 @test showstr(Set([(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)])) == (sizeof(Int) == 8 ? - "Set([(1.0, 1.0), (3.0, 3.0), (2.0, 2.0)])" : + "Set([(2.0, 2.0), (1.0, 1.0), (3.0, 3.0)])" : "Set([(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)])") # issue #27747 From a5f44c2b1032fe4a3c9509372dd96a9949b180e1 Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Wed, 7 May 2025 13:34:44 -0400 Subject: [PATCH 31/32] 32-bit showstr(::Set) test --- test/show.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/show.jl b/test/show.jl index dd6409d7da0f0..fa5989d6cd91d 100644 --- a/test/show.jl +++ b/test/show.jl @@ -1959,7 +1959,7 @@ end # issue #27680 @test showstr(Set([(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)])) == (sizeof(Int) == 8 ? "Set([(2.0, 2.0), (1.0, 1.0), (3.0, 3.0)])" : - "Set([(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)])") + "Set([(2.0, 2.0), (1.0, 1.0), (3.0, 3.0)])") # issue #27747 let t = (x = Integer[1, 2],) From 42076d924cade11acbc711ca6a599802dff1bfc2 Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Wed, 7 May 2025 15:44:38 -0400 Subject: [PATCH 32/32] TOML tests... --- stdlib/TOML/test/print.jl | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/stdlib/TOML/test/print.jl b/stdlib/TOML/test/print.jl index b344e74672fdd..9734d96b3c8c1 100644 --- a/stdlib/TOML/test/print.jl +++ b/stdlib/TOML/test/print.jl @@ -83,17 +83,28 @@ loaders = ["gzip", { driver = "csv", args = {delim = "\t"}}] @testset "vec with dicts and non-dicts" begin # https://github.com/JuliaLang/julia/issues/45340 d = Dict("b" => Any[111, Dict("a" => 222, "d" => 333)]) - @test toml_str(d) == "b = [111, {a = 222, d = 333}]\n" + @test toml_str(d) == (sizeof(Int) == 8 ? + "b = [111, {a = 222, d = 333}]\n" : + "b = [111, {d = 333, a = 222}]\n") + d = Dict("b" => Any[Dict("a" => 222, "d" => 333), 111]) - @test toml_str(d) == "b = [{a = 222, d = 333}, 111]\n" + @test toml_str(d) == (sizeof(Int) == 8 ? + "b = [{a = 222, d = 333}, 111]\n" : + "b = [{d = 333, a = 222}, 111]\n") d = Dict("b" => Any[Dict("a" => 222, "d" => 333)]) - @test toml_str(d) == """ - [[b]] - a = 222 - d = 333 - """ + @test toml_str(d) == (sizeof(Int) == 8 ? + """ + [[b]] + a = 222 + d = 333 + """ : + """ + [[b]] + d = 333 + a = 222 + """) # https://github.com/JuliaLang/julia/pull/57584 d = Dict("b" => [MyStruct(1), MyStruct(2)])