Skip to content

Commit c95bb4f

Browse files
committed
Release 7.036
1 parent d44f27c commit c95bb4f

40 files changed

+223
-186
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
# Changelog
2+
# 7.036
3+
* Major optimization (>9x!) of arrow file loading when large string tables/dictionaries are used.
4+
25
# 7.035
36
* Latest dtype-next (10.124) - contains upgrades to ham-fisted which allow pmap et al. to accept arbitrary executor services.
47
* Fix for [issue 438](https://github.com/techascent/tech.ml.dataset/issues/438) - keyword dataset names in tribuo.

deps.edn

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{:paths ["src" "resources" "target/classes"]
22
:deps {;;org.clojure/clojure {:mvn/version "1.11.1"}
3-
cnuernber/dtype-next {:mvn/version "10.124"}
3+
cnuernber/dtype-next {:mvn/version "10.125"}
44
techascent/tech.io {:mvn/version "4.31"
55
:exclusions [org.apache.commons/commons-compress]}
66
org.apache.datasketches/datasketches-java {:mvn/version "4.2.0"}
@@ -14,7 +14,7 @@
1414
:exec-fn codox.main/-main
1515
:exec-args {:group-id "techascent"
1616
:artifact-id "tech.ml.dataset"
17-
:version "7.035"
17+
:version "7.036"
1818
:name "TMD"
1919
:description "A Clojure high performance data processing system"
2020
:metadata {:doc/format :markdown}

docs/000-getting-started.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/100-walkthrough.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/200-quick-reference.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/columns-readers-and-datatypes.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/index.html

Lines changed: 4 additions & 4 deletions
Large diffs are not rendered by default.

docs/nippy-serialization-rocks.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/supported-datatypes.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.categorical.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.clipboard.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.column-filters.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.column.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.html

Lines changed: 60 additions & 58 deletions
Large diffs are not rendered by default.

docs/tech.v3.dataset.io.csv.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.io.datetime.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.io.string-row-parser.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.io.univocity.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.join.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.math.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.metamorph.html

Lines changed: 69 additions & 67 deletions
Large diffs are not rendered by default.

docs/tech.v3.dataset.modelling.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.print.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.reductions.apache-data-sketch.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.reductions.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.rolling.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.set.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.tensor.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.dataset.zip.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.libs.arrow.html

Lines changed: 5 additions & 5 deletions
Large diffs are not rendered by default.

docs/tech.v3.libs.clj-transit.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.libs.fastexcel.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.libs.guava.cache.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.libs.parquet.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.libs.poi.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.libs.smile.data.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docs/tech.v3.libs.tribuo.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

src/tech/v3/dataset.clj

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -580,6 +580,12 @@ null [6 3]:
580580
(tech.v3.dataset.base/drop-rows dataset-or-col row-indexes)))
581581

582582

583+
(defn empty-column-names
584+
"Return a sequence of column names whose missing set length matches the row count of the dataset."
585+
([ds]
586+
(tech.v3.dataset-api/empty-column-names ds)))
587+
588+
583589
(defn empty-dataset
584590
([]
585591
(tech.v3.dataset.impl.dataset/empty-dataset )))
@@ -974,6 +980,12 @@ test/data/stocks.csv [10 3]:
974980
(tech.v3.dataset.base/remove-columns dataset colname-seq-or-fn)))
975981

976982

983+
(defn remove-empty-columns
984+
"Remove all columns that have no data - missing set length equals row count."
985+
([ds]
986+
(tech.v3.dataset-api/remove-empty-columns ds)))
987+
988+
977989
(defn remove-rows
978990
"Same as drop-rows."
979991
([dataset-or-col row-indexes]

src/tech/v3/dataset/metamorph.clj

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,12 @@ null [6 3]:
420420
(tech.v3.dataset.metamorph-api/drop-rows row-indexes)))
421421

422422

423+
(defn empty-column-names
424+
"Return a sequence of column names whose missing set length matches the row count of the dataset."
425+
([]
426+
(tech.v3.dataset.metamorph-api/empty-column-names )))
427+
428+
423429
(defn empty-dataset
424430
([]
425431
(tech.v3.dataset.metamorph-api/empty-dataset )))
@@ -839,6 +845,12 @@ test/data/stocks.csv [10 3]:
839845
(tech.v3.dataset.metamorph-api/remove-columns colname-seq-or-fn)))
840846

841847

848+
(defn remove-empty-columns
849+
"Remove all columns that have no data - missing set length equals row count."
850+
([]
851+
(tech.v3.dataset.metamorph-api/remove-empty-columns )))
852+
853+
842854
(defn remove-rows
843855
"Same as drop-rows."
844856
([row-indexes]

src/tech/v3/libs/arrow.clj

Lines changed: 25 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1244,16 +1244,24 @@ Dependent block frames are not supported!!")
12441244
^List [offsets data n-elems]
12451245
(let [n-elems (long n-elems)
12461246
offsets (dtype/->reader offsets)]
1247-
(reify ObjectReader
1248-
(elemwiseDatatype [rdr] :string)
1249-
(lsize [rdr] n-elems)
1250-
(readObject [rdr idx]
1251-
(let [start-off (long (offsets idx))
1252-
end-off (long (offsets (inc idx)))]
1253-
(-> (dtype/sub-buffer data start-off
1254-
(- end-off start-off))
1255-
(dtype/->byte-array)
1256-
(String.)))))))
1247+
(if (instance? NativeBuffer data)
1248+
(reify ObjectReader
1249+
(elemwiseDatatype [rdr] :string)
1250+
(lsize [rdr] n-elems)
1251+
(readObject [rdr idx]
1252+
(let [start-off (long (offsets idx))
1253+
end-off (long (offsets (inc idx)))]
1254+
(native-buffer/native-buffer->string data start-off (- end-off start-off)))))
1255+
(reify ObjectReader
1256+
(elemwiseDatatype [rdr] :string)
1257+
(lsize [rdr] n-elems)
1258+
(readObject [rdr idx]
1259+
(let [start-off (long (offsets idx))
1260+
end-off (long (offsets (inc idx)))]
1261+
(-> (dtype/sub-buffer data start-off
1262+
(- end-off start-off))
1263+
(dtype/->byte-array)
1264+
(String.))))))))
12571265

12581266
(defn- offsets-data->bytedata-reader
12591267
^List [offsets data n-elems]
@@ -1306,15 +1314,13 @@ Dependent block frames are not supported!!")
13061314
(defn- string-data->column-data
13071315
[dict-map encoding offset-buf-dtype buffers n-elems]
13081316
(if encoding
1309-
(let [str-list (get-in dict-map [(:id encoding) :strings])
1310-
index-data (-> (first buffers)
1311-
(native-buffer/set-native-datatype
1312-
(get-in encoding [:index-type :datatype]))
1313-
(dtype/sub-buffer 0 n-elems)
1314-
(dtype/clone))
1315-
retval (StringTable. str-list nil (dyn-int-list/make-from-container
1316-
index-data))]
1317-
retval)
1317+
(StringTable. (get-in dict-map [(:id encoding) :strings])
1318+
nil
1319+
(-> (first buffers)
1320+
(native-buffer/set-native-datatype
1321+
(get-in encoding [:index-type :datatype]))
1322+
(native-buffer/->jvm-array 0 n-elems)
1323+
(dyn-int-list/make-from-container)))
13181324
(let [[offsets varchar-data] buffers]
13191325
(-> (offsets-data->string-reader (native-buffer/set-native-datatype
13201326
offsets offset-buf-dtype)

0 commit comments

Comments
 (0)