@@ -7,15 +7,24 @@ https://isarn.github.io/isarn-sketches-spark/latest/api/#org.isarnproject.sketch
## How to use in your project
```scala
- // Note that the version of spark and python is part of the release name.
- // This example is for spark 2.2 and python 2.7:
- libraryDependencies += "org.isarnproject" %% "isarn-sketches-spark" % "0.3.1-sp2.2-py2.7"
+ // Note that the version of spark is part of the release name.
+ // This example is for spark 2.4:
+ libraryDependencies += "org.isarnproject" %% "isarn-sketches-spark" % "0.4.0-sp2.4"
```

- **Currently supported: python 2.7, 3.6 X spark 2.2, 2.3 X scala 2.11**
+ Currently supported:
+
+ - spark 2.2, scala 2.11
+ - spark 2.3, scala 2.11
+ - spark 2.4, scala 2.11 and 2.12
+ - spark 3.0, scala 2.12
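+
+ As a sketch of how the release naming extends to the other builds above (hedged: check the project's releases for the exact version string), the spark 2.3 artifact would be pulled the same way:
+
+ ```scala
+ // assumes the "-spX.Y" release-name convention described above
+ libraryDependencies += "org.isarnproject" %% "isarn-sketches-spark" % "0.4.0-sp2.3"
+ ```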

If you are interested in a python/spark/scala build that is not listed above, please contact me and/or file an issue!

+ Python code is also packaged with all of the artifacts above.
+ Spark will automatically extract and compile Python components for use with PySpark.
+ Python 2 and 3 are supported. Note that Python 2 is EOL as of January 2020.
+
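+ For example, a minimal sketch of pulling the package into a PySpark session (assuming the spark 2.4 / scala 2.12 artifact above):
+
+ ```bash
+ $ pyspark --packages "org.isarnproject:isarn-sketches-spark_2.12:0.4.0-sp2.4"
+ ```
+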
This package builds against some `% Provided` Apache Spark dependencies:
```scala
libraryDependencies += "org.apache.spark" %% "spark-core" % sparkVersion
@@ -26,9 +35,9 @@ libraryDependencies += "org.apache.spark" %% "spark-mllib" % sparkVersion
## How to use from the Spark CLI
Several Spark CLI tools accept the `--packages` argument, as with this `spark-shell` example:
```bash
- $ spark-shell --packages "org.isarnproject:isarn-sketches-spark_2.11:0.3.1-sp2.3-py3.6"
+ $ spark-shell --packages "org.isarnproject:isarn-sketches-spark_2.12:0.4.0-sp2.4"
```
- Note that you need to explicitly include the scala version as part of the package name
+ Note that you need to explicitly include the scala version as part of the package name.
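+
+ A hedged `spark-submit` sketch takes the same coordinate (`your-app.jar` is a hypothetical application jar):
+
+ ```bash
+ $ spark-submit --packages "org.isarnproject:isarn-sketches-spark_2.12:0.4.0-sp2.4" your-app.jar
+ ```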
## Examples
@@ -244,7 +253,7 @@ scala> td.show()
>>> from isarnproject.sketches.udaf.tdigest import *
>>> from random import gauss
>>> from pyspark.sql.types import *
- >>> data = sc.parallelize([[gauss(0,1)] for x in xrange(1000)]).toDF(StructType([StructField("x", DoubleType())]))
+ >>> data = sc.parallelize([[gauss(0,1)] for x in range(1000)]).toDF(StructType([StructField("x", DoubleType())]))
>>> agg = data.agg(tdigestDoubleUDAF("x"))
>>> td = agg.first()[0]
>>> td.cdfInverse(0.5)
@@ -257,10 +266,10 @@ scala> td.show()
>>> from isarnproject.sketches.udaf.tdigest import *
>>> from random import gauss
>>> from pyspark.sql.types import *
- >>> data = sc.parallelize([[[gauss(0,1),gauss(0,1),gauss(0,1)]] for x in xrange(1000)]).toDF(StructType([StructField("x", ArrayType(DoubleType()))]))
+ >>> data = sc.parallelize([[[gauss(0,1),gauss(0,1),gauss(0,1)]] for x in range(1000)]).toDF(StructType([StructField("x", ArrayType(DoubleType()))]))
>>> agg = data.agg(tdigestDoubleArrayUDAF("x"))
>>> tds = agg.first()[0]
- >>> [t.cdfInverse(0.5) for t in td]
+ >>> [t.cdfInverse(0.5) for t in tds]
[0.046116924117141189, -0.011071666930287466, -0.019006033872431105]
>>>
```
@@ -271,7 +280,7 @@ scala> td.show()
>>> from random import gauss
>>> from pyspark.ml.linalg import VectorUDT, Vectors
>>> from pyspark.sql.types import *
- >>> data = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in xrange(1000)]).toDF(StructType([StructField("x", VectorUDT())]))
+ >>> data = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in range(1000)]).toDF(StructType([StructField("x", VectorUDT())]))
>>> agg = data.agg(tdigestMLVecUDAF("x"))
>>> tds = agg.first()[0]
>>> [t.cdfInverse(0.5) for t in tds]
@@ -285,7 +294,7 @@ scala> td.show()
>>> from random import gauss
>>> from pyspark.mllib.linalg import VectorUDT, Vectors
>>> from pyspark.sql.types import *
- >>> data = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in xrange(1000)]).toDF(StructType([StructField("x", VectorUDT())]))
+ >>> data = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in range(1000)]).toDF(StructType([StructField("x", VectorUDT())]))
>>> agg = data.agg(tdigestMLLibVecUDAF("x"))
>>> tds = agg.first()[0]
>>> [t.cdfInverse(0.5) for t in tds]
@@ -298,8 +307,8 @@ scala> td.show()
>>> from isarnproject.sketches.udaf.tdigest import *
>>> from random import gauss
>>> from pyspark.sql.types import *
- >>> x = sc.parallelize([[gauss(0,1)] for x in xrange(1000)]).toDF(StructType([StructField("x", DoubleType())]))
- >>> g = sc.parallelize([[1 + x] for x in xrange(5)]).toDF(StructType([StructField("g", IntegerType())]))
+ >>> x = sc.parallelize([[gauss(0,1)] for x in range(1000)]).toDF(StructType([StructField("x", DoubleType())]))
+ >>> g = sc.parallelize([[1 + x] for x in range(5)]).toDF(StructType([StructField("g", IntegerType())]))
>>> data = g.crossJoin(x)
>>> tds = data.groupBy("g").agg(tdigestDoubleUDAF("x").alias("tdigests"))
>>> tds.show()
@@ -330,8 +339,8 @@ scala> td.show()
>>> from random import gauss
>>> from pyspark.ml.linalg import VectorUDT, Vectors
>>> from pyspark.sql.types import *
- >>> x = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in xrange(1000)]).toDF(StructType([StructField("x", VectorUDT())]))
- >>> g = sc.parallelize([[1 + x] for x in xrange(5)]).toDF(StructType([StructField("g", IntegerType())]))
+ >>> x = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in range(1000)]).toDF(StructType([StructField("x", VectorUDT())]))
+ >>> g = sc.parallelize([[1 + x] for x in range(5)]).toDF(StructType([StructField("g", IntegerType())]))
>>> data = g.crossJoin(x)
>>> tds = data.groupBy("g").agg(tdigestMLVecUDAF("x").alias("tdigests"))
>>> tds.show()
@@ -422,7 +431,7 @@ scala> imp.show
>>> fiMod = fi.fit(training) \
... .setTargetModel(lrModel) \
... .setDeviationMeasure("rms-dev") \
- ... .setFeatureNames(["x%d" % (j) for j in xrange(10)])
+ ... .setFeatureNames(["x%d" % (j) for j in range(10)])
>>> imp = fiMod.transform(training)
>>> imp.show()
+----+-------------------+