Skip to content

Commit 78fb514

Browse files
committed
More nippy/arrow optimizations.
1 parent b82d716 commit 78fb514

File tree

3 files changed

+28
-18
lines changed

3 files changed

+28
-18
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
# Changelog
2+
# 7.037
3+
* Nippy loading is about 2x faster in the case of large string tables.
4+
* Arrow read pathways now support the `:text-as-strings?` option, mirroring `:strings-as-text?` on the write side, so you can save out uncompressed data in the fastest-to-read format.
5+
26
# 7.036
37
* Major optimization (>9x!) loading of arrow files when large string tables/dictionaries are used.
48

src/tech/v3/dataset/base.clj

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -918,17 +918,20 @@
918918
string-data ^bytes string-data
919919
offsets (dtype/->buffer offsets)
920920
n-elems (dec (.lsize offsets))
921-
^IMutList int->str
922-
(->> (dtype/make-reader
923-
:string n-elems
924-
(let [start-off (.readLong offsets idx)
925-
end-off (.readLong offsets (inc idx))]
926-
(String. string-data start-off (- end-off start-off))))
927-
(dtype/make-container :list :string))
928-
str->int (HashMap. (dtype/ecount int->str))]
929-
(dotimes [idx n-elems]
930-
(.put str->int (.get int->str idx) idx))
931-
(StringTable. int->str str->int int-data))
921+
str-rdr (dtype/make-reader
922+
:string n-elems
923+
(let [start-off (.readLong offsets idx)
924+
end-off (.readLong offsets (inc idx))]
925+
(String. string-data start-off (- end-off start-off))))
926+
str-ary (hamf/object-array n-elems)]
927+
(hamf/pgroups n-elems (fn [^long sidx ^long eidx]
928+
(loop [idx sidx]
929+
(when (< idx eidx)
930+
(let [start-off (.readLong offsets idx)
931+
end-off (.readLong offsets (inc idx))]
932+
(aset str-ary idx (String. string-data start-off (- end-off start-off))))
933+
(recur (inc idx))))))
934+
(StringTable. (ham_fisted.ArrayLists/toList str-ary) nil int-data))
932935
(= version 2)
933936
(let [^List int->str (dtype-list/wrap-container string-table)
934937
str->int (HashMap. (dtype/ecount int->str))

src/tech/v3/libs/arrow.clj

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1313,7 +1313,7 @@ Dependent block frames are not supported!!")
13131313

13141314

13151315
(defn- string-data->column-data
1316-
[dict-map encoding offset-buf-dtype buffers n-elems]
1316+
[dict-map encoding offset-buf-dtype buffers n-elems options]
13171317
(if encoding
13181318
(StringTable. (get-in dict-map [(:id encoding) :strings])
13191319
nil
@@ -1322,11 +1322,13 @@ Dependent block frames are not supported!!")
13221322
(get-in encoding [:index-type :datatype]))
13231323
(native-buffer/->jvm-array 0 n-elems)
13241324
(dyn-int-list/make-from-container)))
1325-
(let [[offsets varchar-data] buffers]
1326-
(-> (offsets-data->string-reader (native-buffer/set-native-datatype
1327-
offsets offset-buf-dtype)
1328-
varchar-data n-elems)
1329-
(string-reader->text-reader)))))
1325+
(let [[offsets varchar-data] buffers
1326+
str-rdr (offsets-data->string-reader (native-buffer/set-native-datatype
1327+
offsets offset-buf-dtype)
1328+
varchar-data n-elems)]
1329+
(if-not (:text-as-strings? options)
1330+
(string-reader->text-reader str-rdr)
1331+
str-rdr))))
13301332

13311333

13321334
(defn- int8-buf->missing
@@ -1415,7 +1417,8 @@ Dependent block frames are not supported!!")
14151417
dict-map encoding
14161418
(get-in field [:field-type :offset-buffer-datatype])
14171419
data-buffers
1418-
(:n-elems node))
1420+
(:n-elems node)
1421+
options)
14191422
(field-metadata field)
14201423
(node-buf->missing node validity-buf))))
14211424

0 commit comments

Comments
 (0)