File tree 3 files changed +28
-18
lines changed
3 files changed +28
-18
lines changed Original file line number Diff line number Diff line change 1
1
# Changelog
2
+ # 7.037
3
+ * Nippy loading is about 2x faster in the case of large string tables.
4
+ * Arrow read pathways support `:text-as-strings?` to mirror `:strings-as-text?` on the write side so you can save out uncompressed data in the fastest-to-read format.
5
+
2
6
# 7.036
3
7
* Major optimization (>9x!) loading of arrow files when large string tables/dictionaries are used.
4
8
Original file line number Diff line number Diff line change 918
918
string-data ^bytes string-data
919
919
offsets (dtype/->buffer offsets)
920
920
n-elems (dec (.lsize offsets))
921
- ^IMutList int->str
922
- (->> (dtype/make-reader
923
- :string n-elems
924
- (let [start-off (.readLong offsets idx)
925
- end-off (.readLong offsets (inc idx))]
926
- (String. string-data start-off (- end-off start-off))))
927
- (dtype/make-container :list :string ))
928
- str->int (HashMap. (dtype/ecount int->str))]
929
- (dotimes [idx n-elems]
930
- (.put str->int (.get int->str idx) idx))
931
- (StringTable. int->str str->int int-data))
921
+ str-rdr (dtype/make-reader
922
+ :string n-elems
923
+ (let [start-off (.readLong offsets idx)
924
+ end-off (.readLong offsets (inc idx))]
925
+ (String. string-data start-off (- end-off start-off))))
926
+ str-ary (hamf/object-array n-elems)]
927
+ (hamf/pgroups n-elems (fn [^long sidx ^long eidx]
928
+ (loop [idx sidx]
929
+ (when (< idx eidx)
930
+ (let [start-off (.readLong offsets idx)
931
+ end-off (.readLong offsets (inc idx))]
932
+ (aset str-ary idx (String. string-data start-off (- end-off start-off))))
933
+ (recur (inc idx))))))
934
+ (StringTable. (ham_fisted.ArrayLists/toList str-ary) nil int-data))
932
935
(= version 2 )
933
936
(let [^List int->str (dtype-list/wrap-container string-table)
934
937
str->int (HashMap. (dtype/ecount int->str))
Original file line number Diff line number Diff line change @@ -1313,7 +1313,7 @@ Dependent block frames are not supported!!")
1313
1313
1314
1314
1315
1315
(defn- string-data->column-data
1316
- [dict-map encoding offset-buf-dtype buffers n-elems]
1316
+ [dict-map encoding offset-buf-dtype buffers n-elems options ]
1317
1317
(if encoding
1318
1318
(StringTable. (get-in dict-map [(:id encoding) :strings ])
1319
1319
nil
@@ -1322,11 +1322,13 @@ Dependent block frames are not supported!!")
1322
1322
(get-in encoding [:index-type :datatype ]))
1323
1323
(native-buffer/->jvm-array 0 n-elems)
1324
1324
(dyn-int-list/make-from-container )))
1325
- (let [[offsets varchar-data] buffers]
1326
- (-> (offsets-data->string-reader (native-buffer/set-native-datatype
1327
- offsets offset-buf-dtype)
1328
- varchar-data n-elems)
1329
- (string-reader->text-reader )))))
1325
+ (let [[offsets varchar-data] buffers
1326
+ str-rdr (offsets-data->string-reader (native-buffer/set-native-datatype
1327
+ offsets offset-buf-dtype)
1328
+ varchar-data n-elems)]
1329
+ (if-not (:text-as-strings? options)
1330
+ (string-reader->text-reader )
1331
+ str-rdr))))
1330
1332
1331
1333
1332
1334
(defn- int8-buf->missing
@@ -1415,7 +1417,8 @@ Dependent block frames are not supported!!")
1415
1417
dict-map encoding
1416
1418
(get-in field [:field-type :offset-buffer-datatype ])
1417
1419
data-buffers
1418
- (:n-elems node))
1420
+ (:n-elems node)
1421
+ options)
1419
1422
(field-metadata field)
1420
1423
(node-buf->missing node validity-buf))))
1421
1424
You can’t perform that action at this time.
0 commit comments