Merge pull request #75 from xiaodaigh/github-73

fixed #73
xiaodaigh · Jan 16, 2022 · 0eeba95
2 parents 244a8a7 + c3ea475
commit 0eeba95
Show file tree

Hide file tree

Showing 4 changed files with 48 additions and 14 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "JDF"
 uuid = "babc3d20-cd49-4f60-a736-a8f9c08892d3"
 authors = ["Dai ZJ <zhuojia.dai@gmail.com>"]
-version = "0.4.5"
+version = "0.4.6"
 
 [deps]
 Blosc = "a74b3585-a348-5f62-a45c-50e91977d574"

diff --git a/src/type-writer-loader/Missing.jl b/src/type-writer-loader/Missing.jl
@@ -5,7 +5,7 @@ some_elm(::Type{Missing}) = missing
 # the dispatch for Union{T, Missing}
 # 1. compress the missing
 # 2. and also load the missing
-compress_then_write(b::Vector{Union{T,Missing}}, io) where {T} = begin
+function compress_then_write(b::Vector{Union{T,Missing}}, io) where {T}
     b_S = coalesce.(b, some_elm(T))
 
     metadata = compress_then_write(b_S, io)
@@ -26,18 +26,18 @@ end
 compress_then_write(b::Vector{Missing}, _) =
     (len = 0, type = Missing, orig_len = length(b))
 
-column_loader!(buffer, ::Type{Union{Missing,T}}, io, metadata) where {T} = begin
+function column_loader!(buffer, ::Type{Union{Missing,T}}, io, metadata) where {T}
     # read the content
     Tmeta = metadata.Tmeta
 
     t_pre = column_loader!(buffer, Tmeta.type, io, Tmeta) |> allowmissing
-    #t = t_pre
+
     # read the missings as bool
     m = column_loader(Bool, io, metadata.missingmeta)
-    #return t_pre
+
     t_pre[m] .= missing
     t_pre
 end
 
-column_loader!(buffer, ::Type{Missing}, io, metadata) =
+column_loader!(_, ::Type{Missing}, io, metadata) =
     Vector{Missing}(missing, metadata.orig_len)
diff --git a/src/type-writer-loader/categorical-arrays.jl b/src/type-writer-loader/categorical-arrays.jl
@@ -2,8 +2,7 @@ using DataAPI
 
 using CategoricalArrays: CategoricalVector, CategoricalArray, CategoricalPool
 
-compress_then_write(b::CategoricalVector{T,IntType}, io) where {T, IntType<:Integer} = begin
-    #println("abc")
+function compress_then_write(b::CategoricalVector{T,IntType}, io) where {T, IntType<:Integer}
     compress_refs = compress_then_write(b.refs, io)
     compress_poolindex = compress_then_write(DataAPI.levels(b), io)
 
@@ -15,15 +14,38 @@ compress_then_write(b::CategoricalVector{T,IntType}, io) where {T, IntType<:Inte
     )
 end
 
-column_loader(b::Type{CategoricalVector}, io, metadata) = begin
+# function column_loader(::Type{CategoricalVector{Union{Missing, T}, I}}, io, metadata) where {T, I}
+#     println("got here1")
+#     refs_meta = metadata.refs
+#     pi_meta = metadata.poolindex
+#     ref = column_loader(refs_meta.type, io, refs_meta)
+#     poolindex = column_loader(pi_meta.type, io, pi_meta)
+
+#     return CategoricalArray{pi_meta.type,1}(
+#         ref,
+#         CategoricalPool{eltype(poolindex),eltype(ref)}(Array(poolindex), metadata.ordered),
+#     )
+# end
+
+function column_loader(::Type{CategoricalVector}, io, metadata)
     refs_meta = metadata.refs
     pi_meta = metadata.poolindex
     ref = column_loader(refs_meta.type, io, refs_meta)
     poolindex = column_loader(pi_meta.type, io, pi_meta)
-    CategoricalArray{pi_meta.type,1}(
-        ref,
-        CategoricalPool{eltype(poolindex),eltype(ref)}(Array(poolindex), metadata.ordered),
-    )
+
+    # a ref of 0 is how a missing value is encoded, so any 0 means the eltype must allow Missing
+    if any(==(0), ref)
+        return CategoricalArray{Union{pi_meta.type, Missing},1}(
+            ref,
+            CategoricalPool{eltype(poolindex),eltype(ref)}(Array(poolindex), metadata.ordered),
+        )
+    else
+        # no missing in the values, just return
+        return CategoricalArray{pi_meta.type,1}(
+            ref,
+            CategoricalPool{eltype(poolindex),eltype(ref)}(Array(poolindex), metadata.ordered),
+        )
+    end
 end
 
 if false

diff --git a/test/test-categorical-ararys.jl b/test/test-categorical-ararys.jl
@@ -12,7 +12,7 @@ using Tables
     JDF.save(df, "a3cate.jdf")
     df_loaded_back = JDF.load("a3cate.jdf", cols = [:x2, :x1])
 
-    df2 = DataFrame(df_loaded_back; copycols=true)
+    df2 = DataFrame(df_loaded_back; copycols = true)
     @test size(df2, 2) == 2
     @test size(df2, 1) == 100
     @time df2[!, :x1] isa CategoricalVector{Int}
@@ -28,3 +28,15 @@ end
 
     rm("iris.jdf", force = true, recursive = true)
 end
+
+@testset "CategoricalArray{Union{Missing, String}}" begin
+    # Regression test for GitHub issue #73: missing values in a categorical column must survive save/load
+    df2 = DataFrame(sex = categorical(["Male", missing, "Female"]))
+    JDF.save("df2.jdf", df2)
+
+    b = JDF.load("df2.jdf") |> DataFrame
+
+    @test any(ismissing, b.sex)
+
+    rm("df2.jdf", force=true, recursive=true)
+end