Commit b400522

0.4.0 - remove compiled python, build for spark 2.2 - 3.0, update sbt & packages, update README
1 parent 62b6193 commit b400522

4 files changed: +46 -73 lines changed

README.md

Lines changed: 25 additions & 16 deletions
@@ -7,15 +7,24 @@ https://isarn.github.io/isarn-sketches-spark/latest/api/#org.isarnproject.sketch
 ## How to use in your project
 
 ``` scala
-// Note that the version of spark and python is part of the release name.
-// This example is for spark 2.2 and python 2.7:
-libraryDependencies += "org.isarnproject" %% "isarn-sketches-spark" % "0.3.1-sp2.2-py2.7"
+// Note that the version of spark is part of the release name.
+// This example is for spark 2.4:
+libraryDependencies += "org.isarnproject" %% "isarn-sketches-spark" % "0.4.0-sp2.4"
 ```
 
-** Currently supported: python 2.7, 3.6 X spark 2.2, 2.3 X scala 2.11 **
+Currently supported:
+
+- spark 2.2, scala 2.11
+- spark 2.3, scala 2.11
+- spark 2.4, scala 2.11 and 2.12
+- spark 3.0, scala 2.12
 
 If you are interested in a python/spark/scala build that is not listed above, please contact me and/or file an issue!
 
+Python code is also packaged with all of the artifacts above.
+Spark will automatically extract and compile Python components for use with PySpark.
+Python 2 and 3 are supported. Note that Python 2 is EOL as of January 2020.
+
 This package builds against some `% Provided` Apache Spark dependencies:
 ```scala
 libraryDependencies += "org.apache.spark" %% "spark-core" % sparkVersion
@@ -26,9 +35,9 @@ libraryDependencies += "org.apache.spark" %% "spark-mllib" % sparkVersion
 ## How to use from the Spark CLI
 Several Spark CLI tools accept the `--packages` argument, as with this `spark-shell` example:
 ```bash
-$ spark-shell --packages "org.isarnproject:isarn-sketches-spark_2.11:0.3.1-sp2.3-py3.6"
+$ spark-shell --packages "org.isarnproject:isarn-sketches-spark_2.12:0.4.0-sp2.4"
 ```
-Note that you need to explicitly include the scala version as part of the package name
+Note that you need to explicitly include the scala version as part of the package name.
 
 ## Examples
 
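The dependency strings in the hunks above follow the naming scheme introduced in this commit: the artifact version is `<package version>-sp<spark major.minor>`, and the Scala binary version is carried by the usual `%%` (or explicit `_2.1x`) suffix. A minimal sbt sketch of how a downstream build might derive the right coordinate; the `mySparkVersion` value is hypothetical and shown only to illustrate the convention:

```scala
// Sketch only: derive the isarn-sketches-spark artifact from the Spark version
// you build against, assuming the 0.4.0 "-sp<major.minor>" naming convention.
val mySparkVersion = "2.4.5"  // hypothetical: whatever Spark you target
val isarnSparkSuffix = "sp" + mySparkVersion.split('.').take(2).mkString(".")  // "sp2.4"

libraryDependencies += "org.isarnproject" %% "isarn-sketches-spark" % s"0.4.0-${isarnSparkSuffix}"
```

With `%%`, sbt appends the Scala binary suffix automatically; only tools like `spark-shell --packages` need it spelled out, as in the CLI example above.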
@@ -244,7 +253,7 @@ scala> td.show()
 >>> from isarnproject.sketches.udaf.tdigest import *
 >>> from random import gauss
 >>> from pyspark.sql.types import *
->>> data = sc.parallelize([[gauss(0,1)] for x in xrange(1000)]).toDF(StructType([StructField("x", DoubleType())]))
+>>> data = sc.parallelize([[gauss(0,1)] for x in range(1000)]).toDF(StructType([StructField("x", DoubleType())]))
 >>> agg = data.agg(tdigestDoubleUDAF("x"))
 >>> td = agg.first()[0]
 >>> td.cdfInverse(0.5)
@@ -257,10 +266,10 @@ scala> td.show()
 >>> from isarnproject.sketches.udaf.tdigest import *
 >>> from random import gauss
 >>> from pyspark.sql.types import *
->>> data = sc.parallelize([[[gauss(0,1),gauss(0,1),gauss(0,1)]] for x in xrange(1000)]).toDF(StructType([StructField("x", ArrayType(DoubleType()))]))
+>>> data = sc.parallelize([[[gauss(0,1),gauss(0,1),gauss(0,1)]] for x in range(1000)]).toDF(StructType([StructField("x", ArrayType(DoubleType()))]))
 >>> agg = data.agg(tdigestDoubleArrayUDAF("x"))
 >>> tds = agg.first()[0]
->>> [t.cdfInverse(0.5) for t in td]
+>>> [t.cdfInverse(0.5) for t in tds]
 [0.046116924117141189, -0.011071666930287466, -0.019006033872431105]
 >>>
 ```
@@ -271,7 +280,7 @@ scala> td.show()
 >>> from random import gauss
 >>> from pyspark.ml.linalg import VectorUDT, Vectors
 >>> from pyspark.sql.types import *
->>> data = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in xrange(1000)]).toDF(StructType([StructField("x", VectorUDT())]))
+>>> data = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in range(1000)]).toDF(StructType([StructField("x", VectorUDT())]))
 >>> agg = data.agg(tdigestMLVecUDAF("x"))
 >>> tds = agg.first()[0]
 >>> [t.cdfInverse(0.5) for t in tds]
@@ -285,7 +294,7 @@ scala> td.show()
 >>> from random import gauss
 >>> from pyspark.mllib.linalg import VectorUDT, Vectors
 >>> from pyspark.sql.types import *
->>> data = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in xrange(1000)]).toDF(StructType([StructField("x", VectorUDT())]))
+>>> data = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in range(1000)]).toDF(StructType([StructField("x", VectorUDT())]))
 >>> agg = data.agg(tdigestMLLibVecUDAF("x"))
 >>> tds = agg.first()[0]
 >>> [t.cdfInverse(0.5) for t in tds]
@@ -298,8 +307,8 @@ scala> td.show()
 >>> from isarnproject.sketches.udaf.tdigest import *
 >>> from random import gauss
 >>> from pyspark.sql.types import *
->>> x = sc.parallelize([[gauss(0,1)] for x in xrange(1000)]).toDF(StructType([StructField("x", DoubleType())]))
->>> g = sc.parallelize([[1+x] for x in xrange(5)]).toDF(StructType([StructField("g", IntegerType())]))
+>>> x = sc.parallelize([[gauss(0,1)] for x in range(1000)]).toDF(StructType([StructField("x", DoubleType())]))
+>>> g = sc.parallelize([[1+x] for x in range(5)]).toDF(StructType([StructField("g", IntegerType())]))
 >>> data = g.crossJoin(x)
 >>> tds = data.groupBy("g").agg(tdigestDoubleUDAF("x").alias("tdigests"))
 >>> tds.show()
@@ -330,8 +339,8 @@ scala> td.show()
 >>> from random import gauss
 >>> from pyspark.ml.linalg import VectorUDT, Vectors
 >>> from pyspark.sql.types import *
->>> x = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in xrange(1000)]).toDF(StructType([StructField("x", VectorUDT())]))
->>> g = sc.parallelize([[1+x] for x in xrange(5)]).toDF(StructType([StructField("g", IntegerType())]))
+>>> x = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in range(1000)]).toDF(StructType([StructField("x", VectorUDT())]))
+>>> g = sc.parallelize([[1+x] for x in range(5)]).toDF(StructType([StructField("g", IntegerType())]))
 >>> data = g.crossJoin(x)
 >>> tds = data.groupBy("g").agg(tdigestMLVecUDAF("x").alias("tdigests"))
 >>> tds.show()
@@ -422,7 +431,7 @@ scala> imp.show
 >>> fiMod = fi.fit(training) \
 ... .setTargetModel(lrModel) \
 ... .setDeviationMeasure("rms-dev") \
-... .setFeatureNames(["x%d" % (j) for j in xrange(10)])
+... .setFeatureNames(["x%d" % (j) for j in range(10)])
 >>> imp = fiMod.transform(training)
 >>> imp.show()
 +----+-------------------+

build.sbt

Lines changed: 16 additions & 52 deletions
@@ -1,31 +1,26 @@
 // xsbt clean unidoc previewSite
 // xsbt clean unidoc ghpagesPushSite
-// xsbt -Dsbt.global.base=/home/eje/.sbt/sonatype +publish
-// make sure sparkVersion and pythonVersion are set as you want them prior to +publish
+// xsbt +publish
+// https://oss.sonatype.org
+// make sure sparkVersion is set as you want prior to +publish
 
 import scala.sys.process._
 
 name := "isarn-sketches-spark"
 
 organization := "org.isarnproject"
 
-val packageVersion = "0.3.1"
+val packageVersion = "0.4.0"
 
-val sparkVersion = "2.2.2"
-
-val pythonVersion = "2.7"
+val sparkVersion = "3.0.0"
 
 val sparkSuffix = s"""sp${sparkVersion.split('.').take(2).mkString(".")}"""
 
-val pythonSuffix = s"""py${pythonVersion.split('.').take(2).mkString(".")}"""
-
-val pythonCMD = s"""python${pythonVersion.split('.').head}"""
-
-version := s"${packageVersion}-${sparkSuffix}-${pythonSuffix}"
+version := s"${packageVersion}-${sparkSuffix}"
 
-scalaVersion := "2.11.12"
+scalaVersion := "2.12.11"
 
-crossScalaVersions := Seq("2.11.12") // scala 2.12 when spark supports it
+crossScalaVersions := Seq("2.12.11") // scala 2.12.11 when spark supports it
 
 pomIncludeRepository := { _ => false }
 
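The `version` setting above is assembled from `packageVersion` and `sparkSuffix`, so with `sparkVersion = "3.0.0"` the published version string becomes `0.4.0-sp3.0`. A hypothetical variation, not part of this commit, that would let the publisher pick the Spark build from the command line instead of editing `build.sbt` before each `+publish`:

```scala
// Hypothetical alternative (not in this commit): read the Spark version from a
// system property so each cross-build can be published without editing the file.
val sparkVersion = sys.props.getOrElse("isarn.spark.version", "3.0.0")

// e.g.  xsbt -Disarn.spark.version=2.4.5 +publish   =>  version "0.4.0-sp2.4"
```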
@@ -92,46 +87,15 @@ licenses += ("Apache-2.0", url("http://opensource.org/licenses/Apache-2.0"))
 
 scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature")
 
-lazy val deletePYC = taskKey[Unit]("Delete .pyc files")
-
-deletePYC := {
-  val s: TaskStreams = streams.value
-  s.log.info("delete .pyc files...")
-  val cmd = "bash" :: "-c" :: "rm -f $(find python -name *.pyc)" :: Nil
-  val stat = (cmd !)
-  if (stat == 0) {
-    s.log.info("delete .pyc succeeded")
-  } else {
-    throw new IllegalStateException("delete .pyc failed")
-  }
-}
-
-lazy val compilePython = taskKey[Unit]("Compile python files")
-
-compilePython := {
-  val s: TaskStreams = streams.value
-  s.log.info("compiling python...")
-  val stat = (Seq(pythonCMD, "-m", "compileall", "python/") !)
-  if (stat == 0) {
-    s.log.info("python compile succeeded")
-  } else {
-    throw new IllegalStateException("python compile failed")
-  }
-}
-
-compilePython := (compilePython.dependsOn(deletePYC)).value
-
-(packageBin in Compile) := ((packageBin in Compile).dependsOn(compilePython)).value
-
 mappings in (Compile, packageBin) ++= Seq(
-  (baseDirectory.value / "python" / "isarnproject" / "__init__.pyc") -> "isarnproject/__init__.pyc",
-  (baseDirectory.value / "python" / "isarnproject" / "pipelines" / "__init__.pyc") -> "isarnproject/pipelines/__init__.pyc",
-  (baseDirectory.value / "python" / "isarnproject" / "pipelines" / "fi.pyc") -> "isarnproject/pipelines/fi.pyc",
-  (baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.pyc") -> "isarnproject/sketches/__init__.pyc",
-  (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.pyc") -> "isarnproject/sketches/udaf/__init__.pyc",
-  (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.pyc") -> "isarnproject/sketches/udaf/tdigest.pyc",
-  (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.pyc") -> "isarnproject/sketches/udt/__init__.pyc",
-  (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.pyc") -> "isarnproject/sketches/udt/tdigest.pyc"
+  (baseDirectory.value / "python" / "isarnproject" / "__init__.py") -> "isarnproject/__init__.py",
+  (baseDirectory.value / "python" / "isarnproject" / "pipelines" / "__init__.py") -> "isarnproject/pipelines/__init__.py",
+  (baseDirectory.value / "python" / "isarnproject" / "pipelines" / "fi.py") -> "isarnproject/pipelines/fi.py",
+  (baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.py") -> "isarnproject/sketches/__init__.py",
+  (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.py") -> "isarnproject/sketches/udaf/__init__.py",
+  (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.py") -> "isarnproject/sketches/udaf/tdigest.py",
+  (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.py") -> "isarnproject/sketches/udt/__init__.py",
+  (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.py") -> "isarnproject/sketches/udt/tdigest.py"
 )
 
 test in assembly := {}
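The `mappings` list above now ships the `.py` sources instead of the removed `.pyc` artifacts, which matches the README note that Spark extracts and compiles the Python components for PySpark under either Python 2 or 3. A hypothetical equivalent, not in this commit, that globs the Python tree so new modules would be picked up without editing the list:

```scala
// Sketch only: collect every .py file under python/ and map it to the same
// relative path inside the jar, mirroring the explicit list in this commit.
mappings in (Compile, packageBin) ++= {
  val pythonDir = baseDirectory.value / "python"
  (pythonDir ** "*.py").get.map { f =>
    f -> f.relativeTo(pythonDir).get.getPath
  }
}
```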

project/build.properties

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-sbt.version=1.2.0
+sbt.version=1.3.12

project/plugins.sbt

Lines changed: 4 additions & 4 deletions
@@ -7,13 +7,13 @@ resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositori
 
 resolvers += "jgit-repo" at "http://download.eclipse.org/jgit/maven"
 
-addSbtPlugin("com.typesafe.sbt" % "sbt-ghpages" % "0.6.2")
+addSbtPlugin("com.typesafe.sbt" % "sbt-ghpages" % "0.6.3")
 
-addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.4.1")
+addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.4.3")
 
-addSbtPlugin("io.crashbox" % "sbt-gpg" % "0.1.6")
+addSbtPlugin("io.crashbox" % "sbt-gpg" % "0.2.1")
 
-addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "2.3")
+addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.9.2")
 
 // scoverage and coveralls deps are at old versions to avoid a bug in the current versions
 // update these when this fix is released: https://github.com/scoverage/sbt-coveralls/issues/73
