@@ -7,15 +7,24 @@ https://isarn.github.io/isarn-sketches-spark/latest/api/#org.isarnproject.sketch
## How to use in your project
```scala
- // Note that the version of spark and python is part of the release name.
- // This example is for spark 2.2 and python 2.7:
- libraryDependencies += "org.isarnproject" %% "isarn-sketches-spark" % "0.3.1-sp2.2-py2.7"
+ // Note that the version of spark is part of the release name.
+ // This example is for spark 2.4:
+ libraryDependencies += "org.isarnproject" %% "isarn-sketches-spark" % "0.4.0-sp2.4"
```

- **Currently supported: python 2.7, 3.6 X spark 2.2, 2.3 X scala 2.11**
+ Currently supported:
+
+ - spark 2.2, scala 2.11
+ - spark 2.3, scala 2.11
+ - spark 2.4, scala 2.11 and 2.12
+ - spark 3.0, scala 2.12
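+
+ As a sketch of how the release naming extends to the other builds above (hedged: check the project's releases for the exact version string), the spark 2.3 artifact would be pulled the same way:
+
+ ```scala
+ // assumes the "-spX.Y" release-name convention described above
+ libraryDependencies += "org.isarnproject" %% "isarn-sketches-spark" % "0.4.0-sp2.3"
+ ```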

If you are interested in a python/spark/scala build that is not listed above, please contact me and/or file an issue!

+ Python code is also packaged with all of the artifacts above.
+ Spark will automatically extract and compile Python components for use with PySpark.
+ Python 2 and 3 are supported. Note that Python 2 is EOL as of January 2020.
+
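+ For example, a minimal sketch of pulling the package into a PySpark session (assuming the spark 2.4 / scala 2.12 artifact above):
+
+ ```bash
+ $ pyspark --packages "org.isarnproject:isarn-sketches-spark_2.12:0.4.0-sp2.4"
+ ```
+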
This package builds against some `% Provided` Apache Spark dependencies:
```scala
libraryDependencies += "org.apache.spark" %% "spark-core" % sparkVersion
@@ -26,9 +35,9 @@ libraryDependencies += "org.apache.spark" %% "spark-mllib" % sparkVersion
## How to use from the Spark CLI
Several Spark CLI tools accept the `--packages` argument, as with this `spark-shell` example:
```bash
- $ spark-shell --packages "org.isarnproject:isarn-sketches-spark_2.11:0.3.1-sp2.3-py3.6"
+ $ spark-shell --packages "org.isarnproject:isarn-sketches-spark_2.12:0.4.0-sp2.4"
```
- Note that you need to explicitly include the scala version as part of the package name
+ Note that you need to explicitly include the scala version as part of the package name.
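+
+ A hedged `spark-submit` sketch takes the same coordinate (`your-app.jar` is a hypothetical application jar):
+
+ ```bash
+ $ spark-submit --packages "org.isarnproject:isarn-sketches-spark_2.12:0.4.0-sp2.4" your-app.jar
+ ```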
## Examples
@@ -244,7 +253,7 @@ scala> td.show()
>>> from isarnproject.sketches.udaf.tdigest import *
>>> from random import gauss
>>> from pyspark.sql.types import *
- >>> data = sc.parallelize([[gauss(0,1)] for x in xrange(1000)]).toDF(StructType([StructField("x", DoubleType())]))
+ >>> data = sc.parallelize([[gauss(0,1)] for x in range(1000)]).toDF(StructType([StructField("x", DoubleType())]))
>>> agg = data.agg(tdigestDoubleUDAF("x"))
>>> td = agg.first()[0]
>>> td.cdfInverse(0.5)
@@ -257,10 +266,10 @@ scala> td.show()
>>> from isarnproject.sketches.udaf.tdigest import *
>>> from random import gauss
>>> from pyspark.sql.types import *
- >>> data = sc.parallelize([[[gauss(0,1),gauss(0,1),gauss(0,1)]] for x in xrange(1000)]).toDF(StructType([StructField("x", ArrayType(DoubleType()))]))
+ >>> data = sc.parallelize([[[gauss(0,1),gauss(0,1),gauss(0,1)]] for x in range(1000)]).toDF(StructType([StructField("x", ArrayType(DoubleType()))]))
>>> agg = data.agg(tdigestDoubleArrayUDAF("x"))
>>> tds = agg.first()[0]
- >>> [t.cdfInverse(0.5) for t in td]
+ >>> [t.cdfInverse(0.5) for t in tds]
[0.046116924117141189, -0.011071666930287466, -0.019006033872431105]
>>>
```
@@ -271,7 +280,7 @@ scala> td.show()
>>> from random import gauss
>>> from pyspark.ml.linalg import VectorUDT, Vectors
>>> from pyspark.sql.types import *
- >>> data = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in xrange(1000)]).toDF(StructType([StructField("x", VectorUDT())]))
+ >>> data = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in range(1000)]).toDF(StructType([StructField("x", VectorUDT())]))
>>> agg = data.agg(tdigestMLVecUDAF("x"))
>>> tds = agg.first()[0]
>>> [t.cdfInverse(0.5) for t in tds]
@@ -285,7 +294,7 @@ scala> td.show()
>>> from random import gauss
>>> from pyspark.mllib.linalg import VectorUDT, Vectors
>>> from pyspark.sql.types import *
- >>> data = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in xrange(1000)]).toDF(StructType([StructField("x", VectorUDT())]))
+ >>> data = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in range(1000)]).toDF(StructType([StructField("x", VectorUDT())]))
>>> agg = data.agg(tdigestMLLibVecUDAF("x"))
>>> tds = agg.first()[0]
>>> [t.cdfInverse(0.5) for t in tds]
@@ -298,8 +307,8 @@ scala> td.show()
>>> from isarnproject.sketches.udaf.tdigest import *
>>> from random import gauss
>>> from pyspark.sql.types import *
- >>> x = sc.parallelize([[gauss(0,1)] for x in xrange(1000)]).toDF(StructType([StructField("x", DoubleType())]))
- >>> g = sc.parallelize([[1 + x] for x in xrange(5)]).toDF(StructType([StructField("g", IntegerType())]))
+ >>> x = sc.parallelize([[gauss(0,1)] for x in range(1000)]).toDF(StructType([StructField("x", DoubleType())]))
+ >>> g = sc.parallelize([[1 + x] for x in range(5)]).toDF(StructType([StructField("g", IntegerType())]))
>>> data = g.crossJoin(x)
>>> tds = data.groupBy("g").agg(tdigestDoubleUDAF("x").alias("tdigests"))
>>> tds.show()
@@ -330,8 +339,8 @@ scala> td.show()
>>> from random import gauss
>>> from pyspark.ml.linalg import VectorUDT, Vectors
>>> from pyspark.sql.types import *
- >>> x = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in xrange(1000)]).toDF(StructType([StructField("x", VectorUDT())]))
- >>> g = sc.parallelize([[1 + x] for x in xrange(5)]).toDF(StructType([StructField("g", IntegerType())]))
+ >>> x = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in range(1000)]).toDF(StructType([StructField("x", VectorUDT())]))
+ >>> g = sc.parallelize([[1 + x] for x in range(5)]).toDF(StructType([StructField("g", IntegerType())]))
>>> data = g.crossJoin(x)
>>> tds = data.groupBy("g").agg(tdigestMLVecUDAF("x").alias("tdigests"))
>>> tds.show()
@@ -422,7 +431,7 @@ scala> imp.show
>>> fiMod = fi.fit(training) \
... .setTargetModel(lrModel) \
... .setDeviationMeasure("rms-dev") \
- ... .setFeatureNames(["x%d" % (j) for j in xrange(10)])
+ ... .setFeatureNames(["x%d" % (j) for j in range(10)])
>>> imp = fiMod.transform(training)
>>> imp.show()
+----+-------------------+