Skip to content

Commit 807fda1

Browse files
committed
Adding ability to use your own containers during parse time.
1 parent 4d75dd1 commit 807fda1

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

44 files changed

+147
-113
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions

deps.edn

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{:paths ["src" "resources" "target/classes"]
22
:deps {;;org.clojure/clojure {:mvn/version "1.11.1"}
3-
cnuernber/dtype-next {:mvn/version "10.130"}
3+
cnuernber/dtype-next {:mvn/version "10.131"}
44
techascent/tech.io {:mvn/version "4.31"
55
:exclusions [org.apache.commons/commons-compress]}
66
org.apache.datasketches/datasketches-java {:mvn/version "4.2.0"}
@@ -14,7 +14,7 @@
1414
:exec-fn codox.main/-main
1515
:exec-args {:group-id "techascent"
1616
:artifact-id "tech.ml.dataset"
17-
:version "7.042"
17+
:version "7.043"
1818
:name "TMD"
1919
:description "A Clojure high performance data processing system"
2020
:metadata {:doc/format :markdown}

docs/000-getting-started.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/100-walkthrough.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/200-quick-reference.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/columns-readers-and-datatypes.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/index.html

Lines changed: 3 additions & 3 deletions
Large diffs are not rendered by default.

docs/nippy-serialization-rocks.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/supported-datatypes.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.categorical.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.clipboard.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.column-filters.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.column.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.io.csv.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.io.datetime.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.io.string-row-parser.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.io.univocity.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.join.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.math.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.metamorph.html

Lines changed: 42 additions & 41 deletions
Large diffs are not rendered by default.

docs/tech.v3.dataset.modelling.html

Lines changed: 3 additions & 2 deletions
Large diffs are not rendered by default.

docs/tech.v3.dataset.print.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.reductions.apache-data-sketch.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.reductions.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.rolling.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.set.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.tensor.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.zip.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.libs.arrow.html

Lines changed: 5 additions & 5 deletions
Large diffs are not rendered by default.

docs/tech.v3.libs.clj-transit.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.libs.fastexcel.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.libs.guava.cache.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.libs.parquet.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.libs.poi.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.libs.tribuo.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

src/tech/v3/dataset/dynamic_int_list.clj

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@
4545
(dtype-proto/->native-buffer backing-store))
4646
LongBuffer
4747
(elemwiseDatatype [_this] (dtype-proto/elemwise-datatype backing-store))
48+
(clear [this]
49+
(set! backing-store (abuf/as-growable-list (dtype/make-list :int8 1500) 0))
50+
(set! int-width 8))
4851
(lsize [_this] (.size backing-store))
4952
(size [_this] (.size backing-store))
5053
(subBuffer [this sidx eidx]

src/tech/v3/dataset/impl/column_base.clj

Lines changed: 22 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -73,26 +73,28 @@
7373

7474
(defn make-container
7575
(^IMutList [dtype options]
76-
(case dtype
77-
:string (str-table/make-string-table 0 "")
78-
:text
79-
(let [^IMutList list-data
80-
(try
81-
(if (and (not= false (get options :text-temp-dir false))
82-
@file-backed-text-enabled*)
83-
(let [tmp-dir (:text-temp-dir options)]
84-
(file-backed-text/file-backed-text (merge
85-
{:suffix ".txt"}
86-
(when tmp-dir
87-
{:temp-dir tmp-dir}))))
88-
(dtype/make-list :text))
89-
(catch Throwable e
90-
(when-not @warn-atom*
91-
(reset! warn-atom* true)
92-
(log/warn e "File backed text failed. Falling back to in-memory"))
93-
(dtype/make-list :text)))]
94-
list-data)
95-
(dtype/make-list dtype)))
76+
(if-let [rv (get-in options [:datatype-parsers dtype])]
77+
(rv dtype options)
78+
(case dtype
79+
:string (str-table/make-string-table 0 "")
80+
:text
81+
(let [^IMutList list-data
82+
(try
83+
(if (and (not= false (get options :text-temp-dir false))
84+
@file-backed-text-enabled*)
85+
(let [tmp-dir (:text-temp-dir options)]
86+
(file-backed-text/file-backed-text (merge
87+
{:suffix ".txt"}
88+
(when tmp-dir
89+
{:temp-dir tmp-dir}))))
90+
(dtype/make-list :text))
91+
(catch Throwable e
92+
(when-not @warn-atom*
93+
(reset! warn-atom* true)
94+
(log/warn e "File backed text failed. Falling back to in-memory"))
95+
(dtype/make-list :text)))]
96+
list-data)
97+
(dtype/make-list dtype))))
9698
(^IMutList [dtype]
9799
(make-container dtype nil)))
98100

src/tech/v3/dataset/io/column_parsers.clj

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"Per-column parsers."
33
(:require [tech.v3.dataset.io.datetime :as parse-dt]
44
[tech.v3.dataset.impl.column-base :as column-base]
5+
[tech.v3.dataset.protocols :as ds-proto]
56
[tech.v3.datatype.packing :as packing]
67
[tech.v3.datatype :as dtype]
78
[tech.v3.datatype.casting :as casting]
@@ -162,7 +163,9 @@
162163
(merge
163164
#:tech.v3.dataset{:data (or (dtype/as-array-buffer container)
164165
(dtype/as-native-buffer container)
165-
container)
166+
(if (instance? clojure.lang.IDeref container)
167+
@container
168+
container))
166169
:missing missing
167170
:force-datatype? true}
168171
(when (and failed-values
@@ -496,6 +499,13 @@
496499
(.contains missing idx))
497500
nil
498501
(.get container idx))))
502+
ds-proto/PClearable
503+
(ds-clear [this]
504+
(.clear container)
505+
(.clear missing)
506+
(set! last-idx -1)
507+
(set! max-idx -1)
508+
(set! mc 0))
499509
PParser
500510
(addValue [_p idx value]
501511
(set! max-idx idx)

src/tech/v3/dataset/io/context.clj

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -90,12 +90,8 @@
9090
colparser-compute-fn (reify Function
9191
(apply [this col-idx]
9292
(let [colname (col-idx->colname col-idx)
93-
colname (cond
94-
(number? colname)
95-
colname
96-
(empty? colname)
93+
colname (if (empty? colname)
9794
(make-colname col-idx)
98-
:else
9995
(utils/remove-zero-width-spaces colname))
10096
colname (if (and ensure-unique-column-names?
10197
(get colname->idx colname))

src/tech/v3/dataset/io/mapseq_colmap.clj

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
(:require [tech.v3.datatype :as dtype]
66
[tech.v3.dataset.io.column-parsers :as column-parsers]
77
[tech.v3.dataset.io.context :as parse-context]
8+
[tech.v3.dataset.protocols :as ds-proto]
89
[tech.v3.parallel.for :as pfor]
910
[tech.v3.datatype.argtypes :as argtypes]
1011
[tech.v3.dataset.impl.dataset :as ds-impl]
@@ -15,13 +16,15 @@
1516
[ham-fisted.protocols :as hamf-proto])
1617
(:import [java.util HashMap Map$Entry Map Map$Entry LinkedHashMap Iterator]
1718
[java.util.function Function Consumer]
18-
[tech.v3.dataset.protocols PDatasetParser]
19+
[tech.v3.dataset.protocols PDatasetParser PClearable]
1920
[clojure.lang IDeref Counted Indexed]
2021
[ham_fisted Reductions$IndexedAccum Reducible Consumers$IncConsumer
2122
ITypedReduce]))
2223

2324

24-
(defrecord ParseRecord [^long col-idx column-name column-parser])
25+
(defrecord ParseRecord [^long col-idx column-name column-parser]
26+
ds-proto/PClearable
27+
(ds-clear [this] (ds-proto/ds-clear column-parser)))
2528

2629

2730
(deftype ^:private MapseqReducer [options parsers consumer ^Consumers$IncConsumer row-idx]
@@ -59,7 +62,12 @@
5962
init (hamf/range (.value row-idx)))))
6063
IDeref
6164
(deref [this]
62-
(parse-context/parsers->dataset (assoc options :key-fn nil) parsers (.value row-idx))))
65+
(parse-context/parsers->dataset (assoc options :key-fn nil) parsers (.value row-idx)))
66+
PClearable
67+
(ds-clear [this]
68+
(reduce (fn [_ p] (ds-proto/ds-clear p)) nil (.values ^Map parsers))
69+
(.setValue row-idx 0)
70+
this))
6371

6472

6573
(defn mapseq-reducer

src/tech/v3/dataset/protocols.clj

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,10 @@
4343
(add-rows [p rows]
4444
"rows need only be reducible"))
4545

46+
(defprotocol PClearable
47+
(ds-clear [p]
48+
"Reset to initial state. Avoid conflict with collection/clear"))
49+
4650

4751
(defprotocol PDatasetTransform
4852
(transform [t dataset]))

src/tech/v3/dataset/string_table.clj

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@
3939
ObjectBuffer
4040
(elemwiseDatatype [_this] :string)
4141
(lsize [_this] (.size data))
42+
(clear [this]
43+
(.clear int->str)
44+
(.clear str->int)
45+
(.clear data))
4246
(size [_this] (.size data))
4347
(subBuffer [this sidx eidx]
4448
(ChunkedList/sublistCheck sidx eidx (.lsize this))

src/tech/v3/libs/arrow.clj

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1193,7 +1193,9 @@ Dependent block frames are not supported!!")
11931193

11941194
buf-entries (buffers->buf-entries buffers)
11951195
last-entry (last buf-entries)
1196-
body-len (pad (+ (long (last-entry :offset)) (long (last-entry :length))))
1196+
body-len (if last-entry
1197+
(pad (+ (long (last-entry :offset)) (long (last-entry :length))))
1198+
0)
11971199
builder (FlatBufferBuilder.)
11981200
msg-start (.getCurrentPosition writer)
11991201
_ (write-message-header writer

0 commit comments

Comments
 (0)