Databricks tests #83

Open · wants to merge 113 commits into base: 3.14.0-release-candidate
Commits
113 commits
cdb1b09
Added jenkinsfile
mykolamelnykml May 23, 2022
f063637
Updated jenkinsfile
mykolamelnykml May 23, 2022
8f0b3ee
Updated jenkinsfile
mykolamelnykml May 23, 2022
918de40
Updated jenkinsfile
mykolamelnykml May 23, 2022
e4dd7e0
Updated jenkinsfile
mykolamelnykml May 23, 2022
09a7171
Updated jenkinsfile
mykolamelnykml May 23, 2022
b3d93a1
Updated jenkinsfile
mykolamelnykml May 24, 2022
917788c
Updated jenkinsfile
mykolamelnykml May 24, 2022
9998301
Updated jenkinsfile
mykolamelnykml May 24, 2022
3535cdb
Updated jenkinsfile
mykolamelnykml May 24, 2022
9953f9a
Updated jenkinsfile
mykolamelnykml May 24, 2022
9e15fba
Updated jenkinsfile
mykolamelnykml May 24, 2022
fd5cb03
Updated jenkinsfile
mykolamelnykml May 24, 2022
3c970d2
Updated jenkinsfile
mykolamelnykml May 24, 2022
2e101c6
Updated jenkinsfile
mykolamelnykml May 24, 2022
8793722
Updated jenkinsfile
mykolamelnykml May 24, 2022
8a0932d
Updated jenkinsfile
mykolamelnykml May 24, 2022
792c604
Updated jenkinsfile
mykolamelnykml May 24, 2022
971ea63
Updated jenkinsfile
mykolamelnykml May 24, 2022
7675238
Updated jenkinsfile
mykolamelnykml May 24, 2022
8e407bb
Updated jenkinsfile
mykolamelnykml May 24, 2022
6c70af8
Updated jenkinsfile
mykolamelnykml May 25, 2022
e9bfb60
Updated jenkinsfile
mykolamelnykml May 25, 2022
6e322a6
Updated jenkinsfile
mykolamelnykml May 25, 2022
0ab14dc
Updated jenkinsfile
mykolamelnykml May 25, 2022
5a91bb2
Updated jenkinsfile
mykolamelnykml May 25, 2022
09fd219
Updated Jenkinsfile
mykolamelnykml May 26, 2022
c1e440a
Updated Jenkinsfile
mykolamelnykml May 26, 2022
b3bc04c
Updated Jenkinsfile
mykolamelnykml May 26, 2022
4cf1321
Updated Jenkinsfile
mykolamelnykml May 26, 2022
494c5c4
Updated Jenkinsfile
mykolamelnykml May 26, 2022
724a17a
Updated Jenkinsfile
mykolamelnykml May 26, 2022
57ac9ff
Updated Jenkinsfile
mykolamelnykml May 26, 2022
219c7cc
Updated Jenkinsfile
mykolamelnykml May 26, 2022
3c6203e
Updated Jenkinsfile
mykolamelnykml May 26, 2022
541e194
Updated Jenkinsfile
mykolamelnykml May 26, 2022
4670a6f
Updated Jenkinsfile
mykolamelnykml May 26, 2022
f33194d
Updated Jenkinsfile
mykolamelnykml May 26, 2022
abdd369
Updated Jenkinsfile
mykolamelnykml May 26, 2022
c375813
Updated Jenkinsfile
mykolamelnykml May 26, 2022
8a86470
Updated Jenkinsfile
mykolamelnykml May 27, 2022
6ae741c
Updated Jenkinsfile
mykolamelnykml May 27, 2022
13a108f
Updated Jenkinsfile
mykolamelnykml May 27, 2022
b81c853
Updated Jenkinsfile
mykolamelnykml May 27, 2022
482e8c6
Updated Jenkinsfile
mykolamelnykml May 27, 2022
bffb2fc
Updated Jenkinsfile
mykolamelnykml May 27, 2022
b94cfeb
Updated Jenkinsfile
mykolamelnykml May 27, 2022
003953a
Updated Jenkinsfile
mykolamelnykml May 27, 2022
eb79641
Updated Jenkinsfile
mykolamelnykml May 27, 2022
dae45d7
Updated Jenkinsfile
mykolamelnykml May 28, 2022
5e3dacc
Updated Jenkinsfile
mykolamelnykml May 28, 2022
3fd5ff5
Updated Jenkinsfile
mykolamelnykml May 28, 2022
d3b2667
Updated jenkinsfile
mykolamelnykml May 31, 2022
05e8700
Updated jenkinsfile
mykolamelnykml May 31, 2022
2ccd7c0
Updated jenkinsfile
mykolamelnykml May 31, 2022
772cbcf
Updated jenkinsfile
mykolamelnykml May 31, 2022
06e069a
Updated jenkinsfile
mykolamelnykml May 31, 2022
44f8eba
Updated jenkinsfile
mykolamelnykml May 31, 2022
7e03afd
Updated jenkinsfile
mykolamelnykml May 31, 2022
bc97e15
Updated jenkinsfile
mykolamelnykml May 31, 2022
da861ef
Updated jenkinsfile
mykolamelnykml May 31, 2022
0b213a3
Updated jenkinsfile
mykolamelnykml May 31, 2022
c0c3059
Updated jenkinsfile
mykolamelnykml May 31, 2022
6d4258d
Updated jenkinsfile
mykolamelnykml May 31, 2022
ea41871
Updated jenkinsfile
mykolamelnykml Jun 1, 2022
cd957ff
Updated jenkinsfile
mykolamelnykml Jun 1, 2022
21c5815
Updated jenkinsfile
mykolamelnykml Jun 1, 2022
c53b92b
Updated jenkinsfile
mykolamelnykml Jun 1, 2022
4d62a3f
Updated jenkinsfile
mykolamelnykml Jun 1, 2022
b1b376b
Updated jenkinsfile
mykolamelnykml Jun 1, 2022
dcc1719
Updated jenkinsfile
mykolamelnykml Jun 1, 2022
b534247
Updated jenkinsfile
mykolamelnykml Jun 1, 2022
d88234e
Updated jenkinsfile
mykolamelnykml Jun 1, 2022
f4db299
Updated jenkinsfile
mykolamelnykml Jun 1, 2022
8f52b6e
Updated jenkinsfile
mykolamelnykml Jun 1, 2022
2066289
Updated jenkinsfile
mykolamelnykml Jun 1, 2022
1900ad2
Updated jenkinsfile
mykolamelnykml Jun 1, 2022
f99d57e
Updated jenkinsfile
mykolamelnykml Jun 1, 2022
3735bce
Updated jenkinsfile
mykolamelnykml Jun 1, 2022
cb93a6e
Updated jenkinsfile
mykolamelnykml Jun 1, 2022
b472d9d
Updated jenkinsfile
mykolamelnykml Jun 1, 2022
776a83a
Updated jenkinsfile
mykolamelnykml Jun 1, 2022
2a1e5b4
Updated jenkinsfile
mykolamelnykml Jun 1, 2022
4eac3ac
Updated jenkinsfile
mykolamelnykml Jun 1, 2022
576922f
Updated jenkinsfile
mykolamelnykml Jun 1, 2022
efde4e3
Updated jenkinsfile
mykolamelnykml Jun 1, 2022
8cd879a
Updated jenkinsfile
mykolamelnykml Jun 2, 2022
c5f802a
Updated jenkinsfile
mykolamelnykml Jun 2, 2022
ecd1dab
Updated jenkinsfile
mykolamelnykml Jun 2, 2022
bc18891
Updated jenkinsfile
mykolamelnykml Jun 2, 2022
19f80e2
Updated jenkinsfile
mykolamelnykml Jun 2, 2022
4975e8d
Updated jenkinsfile
mykolamelnykml Jun 2, 2022
117b151
Updated jenkinsfile
mykolamelnykml Jun 2, 2022
29d1bf0
Updated jenkinsfile
mykolamelnykml Jun 2, 2022
c066e76
Updated jenkinsfile
mykolamelnykml Jun 2, 2022
6c1491e
Updated jenkinsfile
mykolamelnykml Jun 2, 2022
b31f407
Updated jenkinsfile
mykolamelnykml Jun 2, 2022
6b258e5
Updated jenkinsfile
mykolamelnykml Jun 2, 2022
c1b2fb4
Updated jenkinsfile
mykolamelnykml Jun 2, 2022
a72afce
Updated jenkinsfile
mykolamelnykml Jun 2, 2022
3f54d87
Updated jenkinsfile
mykolamelnykml Jun 2, 2022
61a3114
Updated jenkinsfile
mykolamelnykml Jun 2, 2022
348f021
Updated jenkinsfile
mykolamelnykml Jun 2, 2022
0136be3
Updated jenkinsfile
mykolamelnykml Jun 2, 2022
b1dd392
Updated jenkinsfile
mykolamelnykml Jun 2, 2022
6f2e931
Updated jenkinsfile
mykolamelnykml Jun 3, 2022
e55cca0
Updated jenkinsfile
mykolamelnykml Jun 3, 2022
4e0bb93
Updated jenkinsfile
mykolamelnykml Jun 3, 2022
7b9b0b1
Updated jenkinsfile
mykolamelnykml Jun 3, 2022
b68cee2
Updated jenkinsfile
mykolamelnykml Jun 3, 2022
ee9a3a8
Updated jenkinsfile
mykolamelnykml Jun 3, 2022
0a9004b
Updated jenkinsfile
mykolamelnykml Jun 3, 2022
7fa5bd1
Updated jenkinsfile
mykolamelnykml Jun 3, 2022
6 changes: 6 additions & 0 deletions .ci/Dockerfile.build
@@ -0,0 +1,6 @@
# Build set_ambient
FROM python:3.7-alpine

ENV LC_ALL=C

RUN pip install databricks-cli requests pytest
50 changes: 50 additions & 0 deletions .ci/evaluatenotebookruns.py
@@ -0,0 +1,50 @@
# evaluatenotebookruns.py
import unittest
import json
import glob
import os
import logging


class TestJobOutput(unittest.TestCase):

    # Placeholder rewritten by the pipeline (see the sed call in the Jenkinsfile)
    test_output_path = '#ENV#'

    # def test_performance(self):
    #     path = self.test_output_path
    #     statuses = []
    #
    #     for filename in glob.glob(os.path.join(path, '*.json')):
    #         print('Evaluating: ' + filename)
    #         with open(filename) as f:
    #             data = json.load(f)
    #         duration = data['execution_duration']
    #         if duration > 100000:
    #             status = 'FAILED'
    #         else:
    #             status = 'SUCCESS'
    #
    #         statuses.append(status)
    #
    #     self.assertFalse('FAILED' in statuses)

    def test_job_run(self):
        path = self.test_output_path
        statuses = []

        for filename in glob.glob(os.path.join(path, '*.json')):
            logging.info('Evaluating: %s', filename)
            print('Evaluating: ' + filename)
            with open(filename) as f:
                data = json.load(f)
            print(data)
            if data['state']['life_cycle_state'] == 'RUNNING':
                statuses.append('NOT_COMPLETED')
            else:
                statuses.append(data['state']['result_state'])

        self.assertNotIn('FAILED', statuses)
        self.assertNotIn('NOT_COMPLETED', statuses)


if __name__ == '__main__':
    unittest.main()
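For context, a minimal sketch of the run-status JSON this test consumes and how a run gets classified. The field names match the Databricks Runs API payloads that executenotebook.py writes out; the `classify` helper below is hypothetical, added only to illustrate the branching in `test_job_run`:

```python
import json


def classify(run: dict) -> str:
    # Mirrors test_job_run: a run still in RUNNING state counts as
    # NOT_COMPLETED; otherwise the terminal result_state is used.
    if run['state']['life_cycle_state'] == 'RUNNING':
        return 'NOT_COMPLETED'
    return run['state']['result_state']


finished = json.loads(
    '{"run_id": 42, "state": {"life_cycle_state": "TERMINATED", "result_state": "SUCCESS"}}')
running = json.loads(
    '{"run_id": 43, "state": {"life_cycle_state": "RUNNING"}}')

print(classify(finished))  # SUCCESS
print(classify(running))   # NOT_COMPLETED
```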
116 changes: 116 additions & 0 deletions .ci/executenotebook.py
@@ -0,0 +1,116 @@
#!/usr/bin/python3
# executenotebook.py
import json
import requests
import os
import sys
import getopt
import time
import logging


def main():
    workspace = ''
    token = ''
    clusterid = ''
    localpath = ''
    workspacepath = ''
    outfilepath = ''
    ignore = ''

    try:
        # Every value-taking short option needs a trailing ':' in the optstring.
        opts, args = getopt.getopt(sys.argv[1:], 'hs:t:c:l:w:o:i:',
                                   ['workspace=', 'token=', 'clusterid=', 'localpath=',
                                    'workspacepath=', 'outfilepath=', 'ignore='])
    except getopt.GetoptError:
        print('executenotebook.py -s <workspace> -t <token> -c <clusterid> '
              '-l <localpath> -w <workspacepath> -o <outfilepath>')
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print('executenotebook.py -s <workspace> -t <token> -c <clusterid> '
                  '-l <localpath> -w <workspacepath> -o <outfilepath>')
            sys.exit()
        elif opt in ('-s', '--workspace'):
            workspace = arg
        elif opt in ('-t', '--token'):
            token = arg
        elif opt in ('-c', '--clusterid'):
            clusterid = arg
        elif opt in ('-l', '--localpath'):
            localpath = arg
        elif opt in ('-w', '--workspacepath'):
            workspacepath = arg
        elif opt in ('-o', '--outfilepath'):
            outfilepath = arg
        elif opt in ('-i', '--ignore'):
            ignore = arg

    # deliberately not echoing the token into build logs
    print('-s is ' + workspace)
    print('-c is ' + clusterid)
    print('-l is ' + localpath)
    print('-w is ' + workspacepath)
    print('-o is ' + outfilepath)
    print('-i is ' + ignore)

    ignore = ignore.split(',')

    # Generate the list of notebooks by walking the local path
    notebooks = []
    for path, subdirs, files in os.walk(localpath):
        for name in files:
            if name in ignore:
                logging.warning('Ignoring %s', name)
                continue
            fullpath = path + '/' + name
            # strip the local repo prefix but keep the workspace path
            fullworkspacepath = workspacepath + path.replace(localpath, '')

            name, file_extension = os.path.splitext(fullpath)
            if file_extension.lower() == '.ipynb':
                notebooks.append([fullpath, fullworkspacepath, 1])

    # submit a run for each notebook and poll it until completion
    for notebook in notebooks:
        nameonly = os.path.basename(notebook[0])
        workspacepath = notebook[1]

        name, file_extension = os.path.splitext(nameonly)

        # the workspace path drops the file extension
        fullworkspacepath = workspacepath + '/' + name

        print('Running job for: ' + fullworkspacepath)
        values = {'run_name': name,
                  'existing_cluster_id': clusterid,
                  'timeout_seconds': 3600,
                  'notebook_task': {'notebook_path': fullworkspacepath}}

        resp = requests.post(workspace + '/api/2.0/jobs/runs/submit',
                             data=json.dumps(values), auth=('token', token))
        runjson = resp.text
        print('runjson: ' + runjson)
        d = json.loads(runjson)
        runid = d['run_id']

        # poll every 20 seconds, for at most ~8 minutes per notebook
        i = 0
        while True:
            time.sleep(20)
            jobresp = requests.get(workspace + '/api/2.0/jobs/runs/get?run_id=' + str(runid),
                                   auth=('token', token))
            jobjson = jobresp.text
            print('jobjson: ' + jobjson)
            j = json.loads(jobjson)
            current_state = j['state']['life_cycle_state']
            runid = j['run_id']
            if current_state in ['TERMINATED', 'INTERNAL_ERROR', 'SKIPPED'] or i >= 24:
                break
            i = i + 1

        if outfilepath != '':
            with open(outfilepath + '/' + str(runid) + '.json', 'w') as file:
                file.write(json.dumps(j))


if __name__ == '__main__':
    main()
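The path mapping above (drop the local repo prefix and the `.ipynb` extension, keep the workspace prefix) can be sketched in isolation. The `submit_payload` helper and the example paths below are hypothetical, added only to illustrate how a local notebook becomes a `runs/submit` payload:

```python
import os


def submit_payload(fullpath, localpath, workspacepath, clusterid):
    # Replicate the mapping in main(): strip the local prefix from the
    # directory, strip the extension from the name, join under the
    # workspace prefix.
    relative_dir = os.path.dirname(fullpath).replace(localpath, '')
    name, _ = os.path.splitext(os.path.basename(fullpath))
    notebook_path = workspacepath + relative_dir + '/' + name
    return {'run_name': name,
            'existing_cluster_id': clusterid,
            'timeout_seconds': 3600,
            'notebook_task': {'notebook_path': notebook_path}}


payload = submit_payload('/repo/databricks/python/ocr/1. Demo.ipynb',
                         '/repo/databricks/python',
                         '/Shared/Spark OCR/tests',
                         '0523-abc123')
print(payload['notebook_task']['notebook_path'])
# /Shared/Spark OCR/tests/ocr/1. Demo
```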
213 changes: 213 additions & 0 deletions Jenkinsfile
@@ -0,0 +1,213 @@
@Library('jenkinslib')_

cluster_id = ""
ocr_versions = ""
nlp_versions = ""
nlp_healthcare_versions = ""
databricks_versions = ""
nlp_version_prefix = ""

def DBTOKEN = "DATABRICKS_TOKEN"
def DBURL = "https://dbc-6ca13d9d-74bb.cloud.databricks.com"
def SCRIPTPATH = "./.ci"
def NOTEBOOKPATH = "./databricks/python"
def WORKSPACEPATH = "/Shared/Spark OCR/tests"
def OUTFILEPATH = "."
def TESTRESULTPATH = "./reports/junit"
def IGNORE = "3. Compare CPU and GPU image processing with Spark OCR.ipynb"

def SPARK_NLP_VERSION = params.nlp_version
def SPARK_NLP_HEALTHCARE_VERSION = params.nlp_healthcare_version
def SPARK_OCR_VERSION = params.ocr_version

def PYPI_REPO_HEALTHCARE_SECRET = sparknlp_helpers.spark_nlp_healthcare_secret(SPARK_NLP_HEALTHCARE_VERSION)
def PYPI_REPO_OCR_SECRET = sparknlp_helpers.spark_ocr_secret(SPARK_OCR_VERSION)

def DATABRICKS_RUNTIME_VERSION = params.databricks_runtime == null ? '7.3.x-scala2.12' : params.databricks_runtime.tokenize('|')[1]
def SPARK_VERSION = params.spark_version == null ? 'spark30' : params.spark_version

switch (SPARK_VERSION) {
    case 'spark24':
        nlp_version_prefix = "-spark24"
        break
    case 'spark23':
        nlp_version_prefix = "-spark23"
        break
    case 'spark30':
        nlp_version_prefix = ""
        break
    case 'spark32':
        nlp_version_prefix = "-spark32"
}

def String get_releases(repo) {
    def versionsString = sh(returnStdout: true, script: """gh api --paginate -H "Accept: application/vnd.github.v3+json" /repos/${repo}/releases""")
    def versionsJson = readJSON text: versionsString
    return versionsJson.collect { it['tag_name'] }.join("\n")
}

node {
    withCredentials([usernamePassword(credentialsId: '55e7e818-4ccf-4d23-b54c-fd97c21081ba',
                                      usernameVariable: 'GITHUB_USER',
                                      passwordVariable: 'GITHUB_TOKEN')]) {
        ocr_versions = get_releases("johnsnowlabs/spark-ocr")
        nlp_versions = get_releases("johnsnowlabs/spark-nlp")
        nlp_healthcare_versions = get_releases("johnsnowlabs/spark-nlp-internal")
    }
    withCredentials([string(credentialsId: DBTOKEN, variable: 'TOKEN')]) {
        def databricksVersionsString = sh(returnStdout: true, script: 'curl --header "Authorization: Bearer $TOKEN" -X GET https://dbc-6ca13d9d-74bb.cloud.databricks.com/api/2.0/clusters/spark-versions')
        def databricksVersionsJson = readJSON text: databricksVersionsString
        databricks_versions = databricksVersionsJson['versions'].collect { it['name'] + " |" + it['key'] }.sort().join("\n")
    }
}

pipeline {
    agent {
        dockerfile {
            filename '.ci/Dockerfile.build'
        }
    }
    environment {
        DATABRICKS_CONFIG_FILE = ".databricks.cfg"
        GITHUB_CREDS = credentials('55e7e818-4ccf-4d23-b54c-fd97c21081ba')
    }
    parameters {
        choice(
            name: 'databricks_runtime',
            choices: '7.3 LTS Spark 3.0.1 |7.3.x-scala2.12\n' + databricks_versions,
            description: 'Databricks runtime version'
        )
        choice(
            name: 'ocr_version',
            choices: ocr_versions,
            description: 'Spark OCR version'
        )
        choice(
            name: 'spark_version',
            choices: 'spark30\nspark32\nspark24\nspark23',
            description: 'Spark version'
        )
        choice(
            name: 'nlp_version',
            choices: nlp_versions,
            description: 'Spark NLP version'
        )
        choice(
            name: 'nlp_healthcare_version',
            choices: nlp_healthcare_versions,
            description: 'Spark NLP for Healthcare version'
        )
    }
    stages {
        stage('Setup') {
            steps {
                script {
                    withCredentials([string(credentialsId: DBTOKEN, variable: 'TOKEN')]) {
                        sh('echo "${TOKEN}" > secret.txt')
                        sh("databricks configure --token-file secret.txt --host ${DBURL}")
                    }
                }
            }
        }
        stage('Copy notebooks to Databricks') {
            steps {
                script {
                    sh("databricks workspace import_dir -o '${NOTEBOOKPATH}' '${WORKSPACEPATH}'")
                }
            }
        }
        stage('Create Cluster') {
            steps {
                script {
                    withCredentials([string(credentialsId: 'TEST_SPARK_NLP_LICENSE', variable: 'SPARK_OCR_LICENSE'), [
                        $class: 'AmazonWebServicesCredentialsBinding',
                        credentialsId: 'a4362e3b-808e-45e0-b7d2-1c62b0572df4',
                        accessKeyVariable: 'AWS_ACCESS_KEY_ID',
                        secretKeyVariable: 'AWS_SECRET_ACCESS_KEY']]) {
                        def jsonCluster = """
                        {
                            "num_workers": 1,
                            "cluster_name": "Spark Ocr Notebook Test",
                            "spark_version": "${DATABRICKS_RUNTIME_VERSION}",
                            "spark_conf": {
                                "spark.sql.legacy.allowUntypedScalaUDF": "true"
                            },
                            "aws_attributes": {
                                "first_on_demand": 1,
                                "availability": "SPOT_WITH_FALLBACK",
                                "zone_id": "us-west-2a",
                                "spot_bid_price_percent": 100,
                                "ebs_volume_count": 0
                            },
                            "node_type_id": "i3.xlarge",
                            "driver_node_type_id": "i3.xlarge",
                            "spark_env_vars": {
                                "JSL_OCR_LICENSE": "${SPARK_OCR_LICENSE}",
                                "AWS_ACCESS_KEY_ID": "${AWS_ACCESS_KEY_ID}",
                                "AWS_SECRET_ACCESS_KEY": "${AWS_SECRET_ACCESS_KEY}"
                            },
                            "autotermination_minutes": 20
                        }
                        """
                        writeFile file: 'cluster.json', text: jsonCluster
                        def clusterRespString = sh(returnStdout: true, script: "databricks clusters create --json-file cluster.json")
                        def clusterRespJson = readJSON text: clusterRespString
                        cluster_id = clusterRespJson['cluster_id']
                        sh "rm cluster.json"
                    }
                }
            }
        }
        stage('Install deps to Cluster') {
            steps {
                script {
                    sh("databricks libraries install --cluster-id ${cluster_id} --jar s3://pypi.johnsnowlabs.com/${PYPI_REPO_OCR_SECRET}/jars/spark-ocr-assembly-${SPARK_OCR_VERSION}-${SPARK_VERSION}.jar")
                    sh("databricks libraries install --cluster-id ${cluster_id} --jar s3://pypi.johnsnowlabs.com/${PYPI_REPO_HEALTHCARE_SECRET}/spark-nlp-jsl-${SPARK_NLP_HEALTHCARE_VERSION}${nlp_version_prefix}.jar")
                    sh("databricks libraries install --cluster-id ${cluster_id} --maven-coordinates com.johnsnowlabs.nlp:spark-nlp${nlp_version_prefix}_2.12:${SPARK_NLP_VERSION}")
                    sh("databricks libraries install --cluster-id ${cluster_id} --whl s3://pypi.johnsnowlabs.com/${PYPI_REPO_OCR_SECRET}/spark-ocr/spark_ocr-${SPARK_OCR_VERSION}+${SPARK_VERSION}-py3-none-any.whl")
                    sh("databricks libraries install --cluster-id ${cluster_id} --whl s3://pypi.johnsnowlabs.com/${PYPI_REPO_HEALTHCARE_SECRET}/spark-nlp-jsl/spark_nlp_jsl-${SPARK_NLP_VERSION}-py3-none-any.whl")
                    sh("databricks libraries install --cluster-id ${cluster_id} --pypi-package spark-nlp==${SPARK_NLP_VERSION}")
                    timeout(10) {
                        waitUntil {
                            script {
                                def respStringWaitLib = sh script: "databricks libraries cluster-status --cluster-id ${cluster_id}", returnStdout: true
                                def respJsonWaitLib = readJSON text: respStringWaitLib
                                return respJsonWaitLib['library_statuses'].every { it['status'] == 'INSTALLED' }
                            }
                        }
                    }
                }
            }
        }
        stage('Run Notebook Tests') {
            steps {
                script {
                    withCredentials([string(credentialsId: DBTOKEN, variable: 'TOKEN')]) {
                        sh """python3 $SCRIPTPATH/executenotebook.py --workspace=$DBURL \
                            --token=$TOKEN \
                            --clusterid=${cluster_id} \
                            --localpath=${NOTEBOOKPATH} \
                            --workspacepath='${WORKSPACEPATH}' \
                            --outfilepath='${OUTFILEPATH}' \
                            --ignore='${IGNORE}'
                        """
                        sh """sed -i -e 's #ENV# ${OUTFILEPATH} g' ${SCRIPTPATH}/evaluatenotebookruns.py
                            python3 -m pytest -s --junit-xml=${TESTRESULTPATH}/TEST-notebookout.xml ${SCRIPTPATH}/evaluatenotebookruns.py
                        """
                    }
                }
            }
        }
    }
    post {
        always {
            sh "databricks clusters permanent-delete --cluster-id ${cluster_id}"
            sh "find ${OUTFILEPATH} -name '*.json' -exec rm {} +"
            junit allowEmptyResults: true, testResults: "**/reports/junit/*.xml"
        }
    }
}
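The 'Install deps to Cluster' stage blocks inside `waitUntil` until every library on the cluster reports INSTALLED. That predicate can be sketched in Python; the sample payloads below are illustrative, shaped like the JSON that `databricks libraries cluster-status` returns:

```python
import json


def all_installed(status_json: str) -> bool:
    # True once every library on the cluster reports status INSTALLED,
    # matching the waitUntil predicate in the Jenkinsfile.
    statuses = json.loads(status_json)['library_statuses']
    return all(lib['status'] == 'INSTALLED' for lib in statuses)


pending = '{"library_statuses": [{"status": "INSTALLED"}, {"status": "PENDING"}]}'
done = '{"library_statuses": [{"status": "INSTALLED"}, {"status": "INSTALLED"}]}'

print(all_installed(pending))  # False
print(all_installed(done))     # True
```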