
Commit ed4adca: Add support for Spark 2.2

Author: Jaroslaw Osmanski
1 parent: 542eadd

65 files changed: +10034, -57 lines (large commit; some file paths and contents are hidden by default in this view)


build/build_and_run_tests.sh

+1

@@ -32,6 +32,7 @@ run_tests() {
     sbt clean
     sbt -DSPARK_VERSION=$1 test ds-it
 }
+run_tests 2.2.0
 run_tests 2.1.1
 run_tests 2.0.2

build/workflow-executor_build.sh

+1

@@ -29,3 +29,4 @@ build_workflowexecutor "2.0.1"
 build_workflowexecutor "2.0.2"
 build_workflowexecutor "2.1.0"
 build_workflowexecutor "2.1.1"
+build_workflowexecutor "2.2.0"

deployment/spark-docker/Dockerfile

+1 -1

@@ -25,7 +25,7 @@ ENV SPARK_PACKAGE spark-$SPARK_VERSION-bin-hadoop$HADOOP_VERSION
 ENV SPARK_HOME /opt/spark-$SPARK_VERSION
 ENV PATH $PATH:$SPARK_HOME/bin
 RUN wget -q -O - \
-    "http://d3kbcqa49mib13.cloudfront.net/$SPARK_PACKAGE.tgz" \
+    "https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/$SPARK_PACKAGE.tgz" \
     | gunzip \
     | tar x -C /tmp/ \
     && mv /tmp/$SPARK_PACKAGE $SPARK_HOME \

project/Dependencies.scala

+1

@@ -33,6 +33,7 @@ object Version {

   val spark = sys.props.getOrElse("SPARK_VERSION", "2.1.1")
   val (scala, hadoop, akka, sprayRoutingLib) = spark match {
+    case "2.2.0" => ("2.11.8", "2.7", "2.4.9", "routing-shapeless2")
     case "2.1.0" | "2.1.1" => ("2.11.8", "2.7", "2.4.9", "routing-shapeless2")
     case "2.0.0" | "2.0.1" | "2.0.2" => ("2.11.8", "2.7", "2.4.9", "routing")
   }

remote_notebook/code/sparkr_kernel/kernel_init.R

+1 -1

@@ -16,7 +16,7 @@ assign("sc", get(".sparkRjsc", envir = SparkR:::.sparkREnv), envir = .GlobalEnv)
 sparkSQLSession <- SparkR:::callJMethod(entryPoint, "getNewSparkSQLSession")

 sparkVersion <- SparkR:::callJMethod(sc, "version")
-if (sparkVersion %in% c("2.0.0", "2.0.1", "2.0.2", "2.1.0", "2.1.1")) {
+if (sparkVersion %in% c("2.0.0", "2.0.1", "2.0.2", "2.1.0", "2.1.1", "2.2.0", "2.2.1")) {
   assign(".sparkRsession", SparkR:::callJMethod(sparkSQLSession, "getSparkSession"), envir = SparkR:::.sparkREnv)
   assign("spark", get(".sparkRsession", envir = SparkR:::.sparkREnv), envir = .GlobalEnv)
 } else {

seahorse-workflow-executor/build.sbt

+30 -2

@@ -22,23 +22,51 @@ lazy val settingsForNotPublished = CommonSettingsPlugin.assemblySettings ++

 lazy val sparkVersion = Version.spark

+
+
 lazy val sparkUtils = sparkVersion match {
   case "2.0.0" | "2.0.1" | "2.0.2" =>
     val sparkUtils2_0_x = project in file("sparkutils2.0.x") settings settingsForPublished
     sparkUtils2_0_x
   case "2.1.0" | "2.1.1" =>
     val sparkUtils2_1_0 = project in file("sparkutils2.1.x") settings settingsForPublished
     sparkUtils2_1_0
+  case "2.2.0" =>
+    val sparkUtils2_1_0 = project in file("sparkutils2.2.x") settings settingsForPublished
+    sparkUtils2_1_0
 }

-lazy val sparkUtils2_x = project in file(s"sparkutils2.x") dependsOn sparkUtils settings settingsForPublished
+lazy val sparkUtils2_x = project in file(s"sparkutils2.x") dependsOn (csvlib, sparkUtils) settings settingsForPublished
+
+lazy val csv2_2 = project in file(s"sparkutilsfeatures/csv2_2") settings settingsForPublished
+lazy val csv2_0 = project in file(s"sparkutilsfeatures/csv2_0") dependsOn sparkUtils settings settingsForPublished
+
+lazy val csvlib = sparkVersion match {
+  case "2.0.0" | "2.0.1" | "2.0.2" =>
+    csv2_0
+  case "2.1.0" | "2.1.1" =>
+    csv2_0
+  case "2.2.0" =>
+    csv2_2
+}
+
+lazy val readjsondataset = project in file(s"sparkutilsfeatures/readjsondataset") dependsOn sparkUtils2_x settings settingsForPublished
+lazy val readjsondataframe = project in file(s"sparkutilsfeatures/readjsondataframe") dependsOn sparkUtils2_x settings settingsForPublished
+
+lazy val readjson = sparkVersion match {
+  case "2.0.0" | "2.0.1" | "2.0.2" => readjsondataframe
+  case "2.1.0" | "2.1.1" => readjsondataframe
+  case "2.2.0" => readjsondataset
+}

 lazy val rootProject = project
   .in(file("."))
   .settings(name := "seahorse")
   .settings(PublishSettings.disablePublishing)
   .aggregate(
     api,
+    csvlib,
+    readjson,
     sparkUtils2_x,
     sparkUtils,
     commons,
@@ -54,7 +82,7 @@ lazy val api = project settings settingsForPublished

 lazy val commons = project dependsOn (api, sparkUtils2_x) settings settingsForPublished

-lazy val deeplang = project dependsOn (commons,
+lazy val deeplang = project dependsOn (commons, readjson, csvlib,
   commons % "test->test",
   graph,
   graph % "test->test",
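This build change keeps every Spark-version difference behind version-resolved project aliases (sparkUtils, csvlib, readjson), so deeplang and commons always depend on the same project names no matter which -DSPARK_VERSION is passed to sbt. For that swap to work, the two csv feature modules presumably expose identical object names and signatures in the same package; below is a hedged Scala sketch of that shared surface. The object and method names come from this commit (the write signature mirrors the Spark 2.2 variant added later in it); treating the 2.0/2.1 variant as matching it is an assumption.

// Illustrative sketch only: the surface both csv feature modules appear to share, so that
// DriverFiles (below) compiles unchanged whichever module the csvlib alias selects.
package org.apache.spark.sql.execution.datasources.csv

import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SparkSession}

object DataframeToDriverCsvFileWriter {
  def write(
      dataFrame: DataFrame,
      options: Map[String, String],
      dataSchema: StructType,
      pathWithoutScheme: String,
      sparkSession: SparkSession): Unit =
    ??? // per-version implementation lives in sparkutilsfeatures/csv2_0 or csv2_2
}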

seahorse-workflow-executor/build/build_and_run_tests.sh

+4 -4

@@ -15,8 +15,8 @@


 TASKS="clean scalastyle test:scalastyle test it:compile ds-it"
-#sbt -DSPARK_VERSION=2.0.0 $TASKS
-#sbt -DSPARK_VERSION=2.0.1 $TASKS
-#sbt -DSPARK_VERSION=2.1.0 $TASKS
+sbt -DSPARK_VERSION=2.0.0 $TASKS
+sbt -DSPARK_VERSION=2.0.1 $TASKS
+sbt -DSPARK_VERSION=2.1.0 $TASKS
 sbt -DSPARK_VERSION=2.1.1 $TASKS
-sbt -DSPARK_VERSION=2.0.2 $TASKS
+sbt -DSPARK_VERSION=2.2.0 $TASKS

seahorse-workflow-executor/deeplang/src/main/scala/ai/deepsense/deeplang/doperations/readwritedataframe/filestorage/DriverFiles.scala

+9 -5

@@ -19,18 +19,20 @@ package ai.deepsense.deeplang.doperations.readwritedataframe.filestorage
 import java.io.{File, IOException, PrintWriter}

 import scala.io.Source
+
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.execution.datasources.csv.{DataframeToDriverCsvFileWriter, RawCsvRDDToDataframe}
-import org.apache.spark.sql.{SaveMode, DataFrame => SparkDataFrame}
+import org.apache.spark.sql.{Dataset, Encoders, Row, SaveMode, DataFrame => SparkDataFrame}
 import ai.deepsense.commons.resources.ManagedResource
 import ai.deepsense.deeplang.ExecutionContext
 import ai.deepsense.deeplang.doperables.dataframe.DataFrame
 import ai.deepsense.deeplang.doperations.inout.{InputFileFormatChoice, OutputFileFormatChoice}
 import ai.deepsense.deeplang.doperations.readwritedataframe.filestorage.csv.CsvOptions
 import ai.deepsense.deeplang.doperations.readwritedataframe.{FilePath, FileScheme}
+import ai.deepsense.deeplang.readjsondataset.JsonReader
 import ai.deepsense.sparkutils.SQL

-object DriverFiles {
+object DriverFiles extends JsonReader {

   def read(driverPath: String, fileFormat: InputFileFormatChoice)
     (implicit context: ExecutionContext): SparkDataFrame = fileFormat match {
@@ -59,13 +61,14 @@ object DriverFiles {
     val lines = Source.fromFile(driverPath).getLines().toStream
     val fileLinesRdd = context.sparkContext.parallelize(lines)

-    RawCsvRDDToDataframe.parse(fileLinesRdd, context.sparkSQLSession, params)
+    RawCsvRDDToDataframe.parse(fileLinesRdd, context.sparkSQLSession.sparkSession, params)
   }

   private def readJson(driverPath: String)(implicit context: ExecutionContext) = {
     val lines = Source.fromFile(driverPath).getLines().toStream
     val fileLinesRdd = context.sparkContext.parallelize(lines)
-    context.sparkSQLSession.read.json(fileLinesRdd)
+    val sparkSession = context.sparkSQLSession.sparkSession
+    readJsonFromRdd(fileLinesRdd, sparkSession)
   }

   private def writeCsv
@@ -77,7 +80,8 @@ object DriverFiles {
       dataFrame.sparkDataFrame,
       params,
       dataFrame.schema.get,
-      path.pathWithoutScheme
+      path.pathWithoutScheme,
+      context.sparkSQLSession.sparkSession
     )
   }
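The readJson change is the reason the build now distinguishes readjsondataframe from readjsondataset: Spark 2.2 deprecates DataFrameReader.json(RDD[String]) in favour of json(Dataset[String]). DriverFiles now mixes in JsonReader and delegates to readJsonFromRdd, so each feature module can use whichever API its Spark version prefers. Below is a hedged sketch of what the Dataset-based (Spark 2.2) implementation plausibly looks like; the trait name, package, and method name come from this diff, while the body is an assumption.

// A minimal sketch, assuming the readjsondataset module wraps the raw JSON lines
// in a Dataset[String] and lets Spark infer the schema (the non-deprecated 2.2 path).
package ai.deepsense.deeplang.readjsondataset

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Encoders, SparkSession}

trait JsonReader {
  def readJsonFromRdd(rdd: RDD[String], sparkSession: SparkSession): DataFrame =
    sparkSession.read.json(sparkSession.createDataset(rdd)(Encoders.STRING))
}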

seahorse-workflow-executor/docker/spark-standalone-cluster-manage.sh

+1 -1

@@ -55,7 +55,7 @@ function networkRm {
 ACTION=$1
 SPARK_VERSION=$2
 export SPARK_VERSION=${SPARK_VERSION}
-if [ "$SPARK_VERSION" == "2.1.0" ] || [ "$SPARK_VERSION" == "2.1.1" ]; then
+if [ "$SPARK_VERSION" == "2.1.0" ] || [ "$SPARK_VERSION" == "2.1.1" ] || [ "$SPARK_VERSION" == "2.2.0" ]; then
     export HADOOP_VERSION="2.7"
     # We use 2.7.1 for Spark 2.1.x despite the fact that the latter depends on 2.7.3, but 2.7.3
     # doesn't have docker image released yet.

seahorse-workflow-executor/project/Dependencies.scala

+1

@@ -20,6 +20,7 @@ object Version {

   val spark = sys.props.getOrElse("SPARK_VERSION", "2.1.1")
   val (scala, java, hadoop, akka, apacheCommons) = spark match {
+    case "2.2.0" => ("2.11.8", "1.8", "2.7.3", "2.4.9", "3.5")
     case "2.1.0" | "2.1.1" => ("2.11.8", "1.8", "2.7.3", "2.4.9", "3.5")
     case "2.0.0" | "2.0.1" | "2.0.2" => ("2.11.8", "1.8", "2.7.1", "2.4.9", "3.3.+")
   }

seahorse-workflow-executor/python/pyexecutor/code_executor.py

+1 -1

@@ -91,7 +91,7 @@ def _run_custom_code(self, workflow_id, node_id, custom_operation_code):

         new_sql_context = None
         spark_version = self.spark_context.version
-        if spark_version in ["2.0.0", "2.0.1", "2.0.2", "2.1.0", "2.1.1"]:
+        if spark_version in ["2.0.0", "2.0.1", "2.0.2", "2.1.0", "2.1.1", "2.2.0"]:
             new_sql_context = SQLContext(self.spark_context, new_spark_session)
         else:
             raise ValueError("Spark version {} is not supported".format(spark_version))

seahorse-workflow-executor/python/pyexecutor/pyexecutor.py

+1 -1

@@ -71,7 +71,7 @@ def _initialize_spark_contexts(gateway):
     java_spark_sql_session = gateway.entry_point.getSparkSQLSession()
     spark_version = spark_context.version
     spark_sql_session = None
-    if spark_version in ["2.0.0", "2.0.1", "2.0.2", "2.1.0", "2.1.1"]:
+    if spark_version in ["2.0.0", "2.0.1", "2.0.2", "2.1.0", "2.1.1", "2.2.0"]:
         from pyspark.sql import SparkSession
         java_spark_session = java_spark_sql_session.getSparkSession()
         spark_sql_session = SparkSession(spark_context, java_spark_session)

seahorse-workflow-executor/scalastyle-config.xml

+1 -1

@@ -22,7 +22,7 @@
   <parameters>
     <parameter name="regex">true</parameter>
     <parameter name="header"><![CDATA[/\*\*
-\* Copyright 201[567] deepsense.ai \(CodiLime, Inc\)
+\* Copyright 201[56789] deepsense.ai \(CodiLime, Inc\)
 \*
 \* Licensed under the Apache License, Version 2.0 \(the \"License\"\);
 \* you may not use this file except in compliance with the License\.

seahorse-workflow-executor/sparkutils2.1.x/src/main/scala/ai/deepsense/sparkutils/readwritedataframe/SparkCsvReader.scala

+6 -7

@@ -14,19 +14,18 @@
  * limitations under the License.
  */

+
 package org.apache.spark.sql.execution.datasources.csv

 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql._
-import org.apache.spark.sql.execution.LogicalRDD
-import org.apache.spark.sql.types.{StructType, _}
+

 object SparkCsvReader {
   def create(options: CSVOptions): CsvReader = new CsvReader(options)
   def univocityTokenizer(
-      rdd: RDD[String],
-      header: Array[String],
-      firstLine: String,
-      options: CSVOptions): RDD[Array[String]] =
+    rdd: RDD[String],
+    header: Array[String],
+    firstLine: String,
+    options: CSVOptions): RDD[Array[String]] =
     CSVRelation.univocityTokenizer(rdd, firstLine, options)
 }
New file (path not shown in this view)

@@ -0,0 +1,19 @@
+/**
+ * Copyright 2017 deepsense.ai (CodiLime, Inc)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+name := "seahorse-executor-sparkutils2.2.x"
+
+libraryDependencies ++= Dependencies.sparkutils(Version.spark)
New file (path not shown in this view)

@@ -0,0 +1,44 @@
+/**
+ * Copyright 2016 deepsense.ai (CodiLime, Inc)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.api.r
+
+
+class SparkRBackend {
+
+  private val backend: RBackend = new RBackend()
+  private val backendThread: Thread = new Thread("SparkRBackend") {
+    override def run(): Unit = backend.run()
+  }
+
+  private var portNumber: Int = _
+  private var entryPointTrackingId: String = _
+
+  def start(entryPoint: Object): Unit = {
+    entryPointTrackingId = backend.jvmObjectTracker.addAndGetId(entryPoint).id
+    portNumber = backend.init()
+    backendThread.start()
+  }
+
+  def close(): Unit = {
+    backend.close()
+    backendThread.join()
+  }
+
+  def port: Int = portNumber
+
+  def entryPointId: String = entryPointTrackingId
+}
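This wrapper targets Spark 2.2's RBackend API, where JVM-object tracking hangs off the backend instance (backend.jvmObjectTracker) and addAndGetId returns a JVMObjectId whose id string the R side uses to look the entry point up; it is presumably the 2.2 counterpart of the same wrapper in the older sparkutils modules. A hedged usage sketch follows; only the SparkRBackend members come from this file, and names such as customCodeEntryPoint are hypothetical.

// Hedged usage sketch; the real call site in the executor is not part of this visible diff.
import org.apache.spark.api.r.SparkRBackend

object SparkRBackendUsageSketch {
  def run(customCodeEntryPoint: Object): Unit = {   // customCodeEntryPoint is hypothetical
    val rBackend = new SparkRBackend()
    rBackend.start(customCodeEntryPoint)      // registers the entry point with the 2.2 jvmObjectTracker
    val port = rBackend.port                  // handed to the launched R process so it can connect back
    val entryPointId = rBackend.entryPointId  // lets R code retrieve the tracked entry point
    try {
      // ... launch the R process, passing `port` and `entryPointId` ...
      println(s"R backend listening on port $port, entry point id $entryPointId")
    } finally {
      rBackend.close()                        // stops the backend and joins its thread
    }
  }
}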
New file (path not shown in this view)

@@ -0,0 +1,42 @@
+/**
+ * Copyright 2016 deepsense.ai (CodiLime, Inc)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources.csv
+
+import ai.deepsense.sparkutils.readwritedataframe.ManagedResource
+import org.apache.spark.sql.types._
+import org.apache.spark.sql.{DataFrame, SparkSession}
+
+object DataframeToDriverCsvFileWriter {
+
+  def write(
+      dataFrame: DataFrame,
+      options: Map[String, String],
+      dataSchema: StructType,
+      pathWithoutScheme: String,
+      sparkSession: SparkSession): Unit = {
+    val data = dataFrame.rdd.collect()
+    val params = MapToCsvOptions(options, sparkSession.sessionState.conf)
+    ManagedResource(
+      new LocalCsvOutputWriter(dataSchema, params, pathWithoutScheme)
+    ) { writer =>
+      data.foreach(row => {
+        writer.write(row.toSeq.map(_.asInstanceOf[String]))
+      })
+    }
+  }
+
+}
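Note the design: write collects the whole DataFrame to the driver and streams the rows through a LocalCsvOutputWriter, matching the driver-file scheme handled by DriverFiles above rather than a distributed write, and ManagedResource wraps the writer in a loan pattern. A minimal sketch of such a helper is below, assuming the real ai.deepsense.sparkutils version behaves roughly like this (it may accept any type with a close() method rather than AutoCloseable).

// Hedged sketch of a loan-pattern helper like ManagedResource (illustrative, not the actual source).
object ManagedResourceSketch {
  // Runs `block` with `resource`, always closing the resource afterwards.
  def apply[T <: AutoCloseable, R](resource: T)(block: T => R): R =
    try block(resource) finally resource.close()
}

// Usage mirrors the write method above, e.g.:
// ManagedResourceSketch(new java.io.PrintWriter("/tmp/out.csv")) { w => w.println("a,b,c") }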
