Skip to content

Commit b9420da

Browse files
committed
Release 7.049
1 parent 69d62ee commit b9420da

38 files changed

+120
-45
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
# Changelog
2+
# 7.049
3+
* Optimizations to string table clone, string table create and arrow serialization.
4+
25
# 7.047
36
* hamf bugfix for update-values.
47

deps.edn

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
:exec-fn codox.main/-main
1515
:exec-args {:group-id "techascent"
1616
:artifact-id "tech.ml.dataset"
17-
:version "7.048"
17+
:version "7.049"
1818
:name "TMD"
1919
:description "A Clojure high performance data processing system"
2020
:metadata {:doc/format :markdown}

docs/000-getting-started.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/100-walkthrough.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/200-quick-reference.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/columns-readers-and-datatypes.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/index.html

Lines changed: 2 additions & 2 deletions
Large diffs are not rendered by default.

docs/nippy-serialization-rocks.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/supported-datatypes.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.categorical.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.clipboard.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.column-filters.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.column.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.io.csv.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.io.datetime.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.io.string-row-parser.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.io.univocity.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.join.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.math.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.metamorph.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.modelling.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.print.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.reductions.apache-data-sketch.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.reductions.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.rolling.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.set.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.tensor.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.zip.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.libs.arrow.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.libs.clj-transit.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.libs.fastexcel.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.libs.guava.cache.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.libs.parquet.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.libs.poi.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.libs.tribuo.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

src/tech/v3/dataset/string_table.clj

Lines changed: 79 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,12 @@
66
[tech.v3.parallel.for :as parallel-for]
77
[tech.v3.datatype.errors :as errors]
88
[ham-fisted.api :as hamf]
9-
[ham-fisted.reduce :as hamf-rf])
9+
[ham-fisted.reduce :as hamf-rf]
10+
[clojure.tools.logging :as log])
1011
(:import [java.util List HashMap Map ArrayList]
1112
[java.util.function Function]
1213
[tech.v3.datatype ObjectBuffer Buffer]
13-
[ham_fisted IMutList ChunkedList Casts]))
14+
[ham_fisted IMutList ChunkedList Casts ArrayHelpers ArrayLists]))
1415

1516

1617
(set! *warn-on-reflection* true)
@@ -32,7 +33,16 @@
3233
(clone [this]
  ;; Deduplication is no longer needed for a clone; a plain java String array
  ;; is a more efficient storage mechanism.
  (let [sz (.size this)
        ^objects rv (make-array String sz)
        ;; Local aliases of the fields; the worker closure below reads these.
        local-int->str int->str
        local-data data]
    ;; hamf/pgroups hands back a sequence of per-group results.  Realize it
    ;; with dorun so every index group has finished writing into rv before rv
    ;; is wrapped and returned to the caller.
    (dorun
     (hamf/pgroups sz (fn string-table-clone [^long sidx ^long eidx]
                        (loop [sidx sidx]
                          (when (< sidx eidx)
                            ;; Resolve the stored id at sidx to its string.
                            (ArrayHelpers/aset rv sidx (.get local-int->str (.getLong local-data sidx)))
                            (recur (inc sidx)))))))
    (ArrayLists/toList rv)))
3646
PStrTable
3747
(get-str-table [_this] {:int->str int->str
3848
:str->int str->int})
@@ -123,18 +133,77 @@
123133
(^Buffer []
124134
(make-string-table 0 "" (hamf/object-array-list) (HashMap.))))
125135

136+
(defn compress-indexes
  "Copy `indexes` into a list backed by the narrowest primitive array whose
  element type can hold `max-idx`: byte, short, int, or otherwise long.

  Every branch returns an array-backed list produced by `ArrayLists/toList`,
  so the result always satisfies the `IMutList` return hint."
  ^IMutList [^IMutList indexes ^long max-idx]
  (cond
    (<= max-idx Byte/MAX_VALUE) (ArrayLists/toList (hamf/byte-array indexes))
    (<= max-idx Short/MAX_VALUE) (ArrayLists/toList (hamf/short-array indexes))
    (<= max-idx Integer/MAX_VALUE) (ArrayLists/toList (hamf/int-array indexes))
    ;; Wrap the long[] as well; returning the bare array would hand callers a
    ;; different type than the other three branches.
    :else (ArrayLists/toList (.toLongArray indexes))))
143+
144+
(definterface IDof
145+
(idOf ^long [s]))
146+
147+
(defn fast-str
148+
^String [s]
149+
(cond
150+
(nil? s) ""
151+
(instance? String s) s
152+
:else (.toString ^Object s)))
153+
154+
(deftype FastStringContainer [^IMutList indexes ^List int->str ^HashMap str->int]
155+
IDof
156+
(idOf [this s]
157+
(let [s (fast-str s)
158+
sz (long (.size str->int))
159+
lookup (.putIfAbsent str->int s sz)]
160+
(if lookup
161+
lookup
162+
(do
163+
(.add int->str s)
164+
sz))))
165+
java.util.function.Consumer
166+
(accept [this v] (.add this v))
167+
IMutList
168+
(add [this v]
169+
(.addLong indexes (.idOf this v))
170+
true)
171+
(add [this idx ct v]
172+
(.add indexes (.size indexes) ct (.idOf this v)))
173+
(get [this idx]
174+
(let [rv (.get int->str (.getLong indexes idx))]
175+
(when (nil? rv)
176+
(throw (RuntimeException. (str "Index out of range: " idx))))
177+
rv))
178+
(size [this] (.size indexes))
179+
(clear [this]
180+
(.clear indexes)
181+
(.clear int->str)
182+
(.clear str->int))
183+
tech.v3.datatype.protocols/PElemwiseDatatype
184+
(elemwise-datatype [this] :string)
185+
clojure.lang.IDeref
186+
(deref [this]
187+
(StringTable. (hamf/vec int->str) (.clone str->int)
188+
(compress-indexes indexes (long (.size str->int))))))
189+
190+
(defn fast-string-container []
191+
(let [str->int (hamf/java-hashmap)
192+
int->str (ArrayList.)
193+
_ (do (.put str->int "" 0)
194+
(.add int->str ""))]
195+
(FastStringContainer. (hamf/long-array-list)
196+
int->str str->int)))
126197

127198
(defn string-table-from-strings
  "Reduce `str-data` with a consumer reducer built from
  `fast-string-container` and return the reducer's result.
  NOTE(review): presumably the result is the dereferenced container, i.e. a
  StringTable -- confirm against `hamf-rf/consumer-reducer`."
  [str-data]
  (let [reducer (hamf-rf/consumer-reducer fast-string-container)]
    (hamf-rf/reduce-reducer reducer str-data)))
132201

133202

134203
(defn ->string-table
  "Return `str-t` unchanged after checking that it is a StringTable instance;
  the check is performed with `errors/when-not-errorf`."
  ^StringTable [str-t]
  (let [table? (instance? StringTable str-t)]
    (errors/when-not-errorf table?
      "string table is wrong type: %s" str-t))
  str-t)
139208

140209

@@ -151,3 +220,6 @@
151220
^List [^StringTable str-t]
152221
(-> (->string-table str-t)
153222
(.int->str)))
223+
224+
225+

src/tech/v3/libs/arrow.clj

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -590,7 +590,7 @@ Dependent block frames are not supported!!")
590590
(throw (Exception. "Invalid string table - missing entries.")))
591591
str-bytes (.getBytes (str strdata))
592592
soff (dtype/ecount byte-data)]
593-
(.addAll byte-data (dtype/->reader str-bytes))
593+
(.addAllReducible byte-data (ArrayLists/toList str-bytes))
594594
(.add offsets soff)))
595595
(let [prev-int->str (str-table/int->string prev-str-t)
596596
start-offset (dtype/ecount prev-int->str)
@@ -602,7 +602,7 @@ Dependent block frames are not supported!!")
602602
(throw (Exception. "Invalid string table - missing entries.")))
603603
str-bytes (.getBytes (str strdata))
604604
soff (dtype/ecount byte-data)]
605-
(.addAll byte-data (dtype/->reader str-bytes))
605+
(.addAllReducible byte-data (ArrayLists/toList str-bytes))
606606
(.add offsets soff)))))
607607
;;Make everyone's life easier by adding an extra offset.
608608
(.add offsets (dtype/ecount byte-data))

0 commit comments

Comments
 (0)