From be9469af73bdf633ef0b34e6dca4efdcf9d0e956 Mon Sep 17 00:00:00 2001 From: Puneet Pruthi Date: Mon, 13 Jun 2022 13:26:18 -0700 Subject: [PATCH 01/11] remove test exclusions: draft --- build.sbt | 2 +- .../synapse/ml/nbtest/SynapseTests.scala | 26 ++++++++------- .../synapse/ml/nbtest/SynapseUtilities.scala | 18 +++++++++-- ...est - Multivariate Anomaly Detection.ipynb | 31 ++++++++++-------- ...g - BiLSTM Medical Entity Extraction.ipynb | 13 +++++--- ...rpretability - Explanation Dashboard.ipynb | 4 +++ pipeline.yaml | 32 +++++++++---------- 7 files changed, 76 insertions(+), 50 deletions(-) diff --git a/build.sbt b/build.sbt index a3a3f72b44..d65dd94e4a 100644 --- a/build.sbt +++ b/build.sbt @@ -33,7 +33,7 @@ val extraDependencies = Seq( "com.jcraft" % "jsch" % "0.1.54", "org.apache.httpcomponents" % "httpclient" % "4.5.6", "org.apache.httpcomponents" % "httpmime" % "4.5.6", - "com.linkedin.isolation-forest" %% "isolation-forest_3.2.0" % "2.0.8" + "com.linkedin.isolation-forest" %% "isolation-forest_3.2.0" % "3.0.0" ).map(d => d excludeAll (excludes: _*)) val dependencies = coreDependencies ++ extraDependencies diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala index 559431eeac..2c0f6c03dc 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala @@ -46,7 +46,6 @@ class SynapseTests extends TestBase { val resourcesDirectory = new File(getClass.getResource("/").toURI) val notebooksDir = new File(resourcesDirectory, "generated-notebooks") - println(s"Notebooks dir: $notebooksDir") FileUtils.deleteDirectory(notebooksDir) assert(notebooksDir.mkdirs()) @@ -103,23 +102,25 @@ class SynapseTests extends TestBase { runCmd(activateCondaEnv ++ Seq("jupyter", "nbconvert", "--to", "python", "*.ipynb"), notebooksDir) +// println(notebo oksDir) val selectedPythonFiles: Array[File] = FileUtilities.recursiveListFiles(notebooksDir) .filter(_.getAbsolutePath.endsWith(".py")) - .filterNot(_.getAbsolutePath.contains("HyperParameterTuning")) - .filterNot(_.getAbsolutePath.contains("CyberML")) - .filterNot(_.getAbsolutePath.contains("VowpalWabbitOverview")) - .filterNot(_.getAbsolutePath.contains("IsolationForest")) - .filterNot(_.getAbsolutePath.contains("ExplanationDashboard")) - .filterNot(_.getAbsolutePath.contains("DeepLearning")) - .filterNot(_.getAbsolutePath.contains("InterpretabilitySnowLeopardDetection")) +// .filter(_.getAbsolutePath.contains("HyperParameterTuning")) +// .filter(_.getAbsolutePath.contains("CyberML")) +// .filter(_.getAbsolutePath.contains("VowpalWabbitOverview")) + .filter(_.getAbsolutePath.contains("IsolationForest")) +// .filter(_.getAbsolutePath.contains("DeepLearning")) +// .filter(_.getAbsolutePath.contains("BiLSTM")) +// .filter(_.getAbsolutePath.contains("InterpretabilitySnowLeopardDetection")) .sortBy(_.getAbsolutePath) - selectedPythonFiles.foreach(println) - assert(selectedPythonFiles.length > 1) - val expectedPoolCount: Int = selectedPythonFiles.length - println("SynapseTests E2E Test Suite starting...") + assert(expectedPoolCount >= 1) + println(s"SynapseTests E2E Test Suite starting on ${expectedPoolCount} notebook(s)...") + selectedPythonFiles.foreach(println) + + // Cleanup old stray spark pools lying around due to ungraceful test shutdown tryDeleteOldSparkPools() println(s"Creating $expectedPoolCount Spark Pools...") @@ -155,6 +156,7 @@ 
class SynapseTests extends TestBase { failures.foreach(failure => println(failure.failed.get.getMessage)) } + //FileUtils.deleteDirectory(notebooksDir) super.afterAll() } } diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala index fa3e08888a..cc63cb7f8e 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala @@ -123,7 +123,7 @@ object SynapseUtilities { lazy val ArmToken: String = getAccessToken("https://management.azure.com/") val LineSeparator: String = sys.props("line.separator").toLowerCase // Platform agnostic (\r\n:windows, \n:linux) - val Folder = s"build_${BuildInfo.version}/scripts" + val Folder = s"build_0.9.5-113-3519038c-SNAPSHOT/scripts" //s"build_${BuildInfo.version}/scripts" val TimeoutInMillis: Int = 30 * 60 * 1000 // 30 minutes val StorageAccount: String = "mmlsparkbuildsynapse" val StorageContainer: String = "synapse" @@ -197,6 +197,10 @@ object SynapseUtilities { "org.scalactic:scalactic_2.12", "org.scalatest:scalatest_2.12", "org.slf4j:slf4j-api").mkString(",") + val packages: String = Seq( + "com.microsoft.azure:synapseml_2.12:0.9.5-113-3519038c-SNAPSHOT", + //"com.microsoft.azure:synapseml_2.12:${BuildInfo.version}" + "org.apache.spark:spark-avro_2.12:3.2.0").mkString(",") val runName = abfssPath.split('/').last.replace(".py", "") val livyPayload: String = s""" @@ -210,7 +214,7 @@ object SynapseUtilities { | "numExecutors" : 2, | "conf" : | { - | "spark.jars.packages" : "com.microsoft.azure:synapseml_2.12:${BuildInfo.version}", + | "spark.jars.packages" : "$packages", | "spark.jars.repositories" : "https://mmlspark.azureedge.net/maven", | "spark.jars.excludes": "$excludes", | "spark.driver.userClassPathFirst": "true", @@ -238,13 +242,21 @@ object SynapseUtilities { poolLocation: String, poolNodeSize: String, createdAtTime: String): String = { + var foobar: String = "notfound" + if (sys.env.getOrElse("ppruthienv", "asdf") != "asdf") + foobar = "found" + val buildId: String = sys.env.getOrElse("Build.BuildId", "unknown") + val buildNumber: String = sys.env.getOrElse("Build.BuildNumber", "unknown") s""" |{ | "name": "$bigDataPoolName", | "location": "$poolLocation", | "tags": { | "createdBy": "SynapseE2E Tests", - | "createdAt": "$createdAtTime" + | "createdAt": "$createdAtTime", + | "buildId": "$buildId", + | "buildNumber": "$buildNumber", + | "foobar": "$foobar" | }, | "properties": { | "autoPause": { diff --git a/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb b/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb index 5d8baa97e9..460d1f30f4 100644 --- a/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb +++ b/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb @@ -45,6 +45,8 @@ "outputs": [], "source": [ "import os\n", + "from IPython import get_ipython\n", + "from IPython.terminal.interactiveshell import TerminalInteractiveShell\n", "import uuid\n", "import mlflow\n", "import matplotlib.pyplot as plt\n", @@ -72,6 +74,8 @@ }, "outputs": [], "source": [ + "shell = TerminalInteractiveShell.instance()\n", + "shell.define_macro('foo', \"\"\"a,b=10,20\"\"\")\n", "%matplotlib inline" ] }, @@ -83,7 +87,8 @@ "if os.environ.get(\"AZURE_SERVICE\", None) == 
\"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", " spark = SparkSession.builder.getOrCreate()\n", - " from notebookutils.visualization import display" + " from notebookutils.visualization import display\n", + " import org.apache.spark.sql.avro" ], "metadata": { "collapsed": false, @@ -327,11 +332,20 @@ } }, "source": [ - "Next, we create an ML pipeline to train the Isolation Forest model. We also demonstrate how to create an MLFlow experiement and register the trained model.\n", + "Next, we create an ML pipeline to train the Isolation Forest model. We also demonstrate how to create an MLFlow experiment and register the trained model.\n", "\n", "Note that MLFlow model registration is strictly only required if accessing the trained model at a later time. For training the model, and performing inferencing in the same notebook, the model object model is sufficient." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --upgrade sqlparse" + ] + }, { "cell_type": "code", "execution_count": null, @@ -350,7 +364,7 @@ " va = VectorAssembler(inputCols=inputCols, outputCol=\"features\")\n", " pipeline = Pipeline(stages=[va, isolationForest])\n", " model = pipeline.fit(df_train)\n", - " mlflow.spark.log_model(model, artifact_path=artifact_path,registered_model_name=model_name)" + " mlflow.spark.log_model(model, artifact_path=artifact_path, registered_model_name=model_name)" ] }, { @@ -608,16 +622,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install --upgrade raiwidgets" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install --upgrade interpret-community" + "!pip install --upgrade raiwidgets interpret-community" ] }, { diff --git a/notebooks/features/other/DeepLearning - BiLSTM Medical Entity Extraction.ipynb b/notebooks/features/other/DeepLearning - BiLSTM Medical Entity Extraction.ipynb index 289b06f569..e6d36481a4 100644 --- a/notebooks/features/other/DeepLearning - BiLSTM Medical Entity Extraction.ipynb +++ b/notebooks/features/other/DeepLearning - BiLSTM Medical Entity Extraction.ipynb @@ -40,7 +40,6 @@ "import os, tarfile, pickle\n", "import urllib.request\n", "import nltk\n", - "\n", "import os\n", "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", @@ -74,9 +73,10 @@ " dataDir = \"/dbfs/nltkdata\"\n", "\n", "d = ModelDownloader(spark, modelDir)\n", - "modelSchema = d.downloadByName(modelName)\n", + "rmodelSchema = d.downloadByName(modelName)\n", "nltk.download(\"punkt\", dataDir)\n", - "nltk.data.path.append(dataDir)" + "nltk.data.path.append(dataDir)\n", + "print(\"ppruthi hdinsight ends, dataDir\" + dataDir)" ] }, { @@ -88,12 +88,15 @@ }, "outputs": [], "source": [ + "print(\"ppruthi local starts\")\n", "modelName = \"BiLSTM\"\n", "modelDir = abspath(\"models\")\n", "if not os.path.exists(modelDir): os.makedirs(modelDir)\n", + "print(\"ppruthi:\" + modelDir)\n", "d = ModelDownloader(spark, \"file://\" + modelDir)\n", "modelSchema = d.downloadByName(modelName)\n", - "nltk.download(\"punkt\")" + "nltk.download(\"punkt\")\n", + "print(\"ppruthi local ends\")" ] }, { @@ -177,7 +180,7 @@ "source": [ "# Add the tokenizers to all worker nodes\n", "def prepNLTK(partition):\n", - " nltk.data.path.append(\"/dbfs/nltkdata\")\n", + " nltk.data.path.append(dataDir)\n", " return partition\n", "\n", "df = df.rdd.mapPartitions(prepNLTK).toDF()" diff --git 
a/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb b/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb index be834f9716..22847d94b3 100644 --- a/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb +++ b/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb @@ -33,6 +33,8 @@ "outputs": [], "source": [ "import pyspark\n", + "from IPython import get_ipython\n", + "from IPython.terminal.interactiveshell import TerminalInteractiveShell\n", "from synapse.ml.explainers import *\n", "from pyspark.ml import Pipeline\n", "from pyspark.ml.classification import LogisticRegression\n", @@ -299,6 +301,8 @@ }, "outputs": [], "source": [ + "shell = TerminalInteractiveShell.instance()\n", + "shell.define_macro('foo', \"\"\"a,b=10,20\"\"\")\n", "!pip install --upgrade raiwidgets\n", "!pip install itsdangerous==2.0.1" ] diff --git a/pipeline.yaml b/pipeline.yaml index 0d4360f9f4..76f67cdf0b 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -35,7 +35,7 @@ jobs: pool: vmImage: ubuntu-18.04 steps: - - task: AzureCLI@1 + - task: AzureCLI@2 displayName: 'Scala Style Check' inputs: azureSubscription: 'MMLSpark Build' @@ -111,7 +111,7 @@ jobs: PGP-PRIVATE: $(pgp-private) PGP-PUBLIC: $(pgp-public) PGP-PW: $(pgp-pw) - - task: AzureCLI@1 + - task: AzureCLI@2 displayName: 'E2E' inputs: azureSubscription: 'MMLSpark Build' @@ -148,14 +148,14 @@ jobs: PGP-PRIVATE: $(pgp-private) PGP-PUBLIC: $(pgp-public) PGP-PW: $(pgp-pw) - - task: AzureCLI@1 + - task: AzureCLI@2 displayName: 'E2E' inputs: azureSubscription: 'MMLSpark Build' scriptLocation: inlineScript inlineScript: | source activate synapseml - sbt "testOnly com.microsoft.azure.synapse.ml.nbtest.SynapseTests" + ppruthienv=$(Build.BuildId) sbt "testOnly com.microsoft.azure.synapse.ml.nbtest.SynapseTests" condition: and(succeeded(), eq(variables.runTests, 'True')) - task: PublishTestResults@2 displayName: 'Publish Test Results' @@ -169,7 +169,7 @@ jobs: pool: vmImage: ubuntu-18.04 steps: - - task: AzureCLI@1 + - task: AzureCLI@2 displayName: 'Get Docker Tag + Version' inputs: azureSubscription: 'MMLSpark Build' @@ -341,7 +341,7 @@ jobs: - template: templates/update_cli.yml - template: templates/conda.yml - template: templates/kv.yml - - task: AzureCLI@1 + - task: AzureCLI@2 displayName: 'Install Pip Package' timeoutInMinutes: 10 inputs: @@ -351,7 +351,7 @@ jobs: source activate synapseml (timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup) sbt installPipPackage - - task: AzureCLI@1 + - task: AzureCLI@2 displayName: 'Test Python Code' timeoutInMinutes: 40 inputs: @@ -366,7 +366,7 @@ jobs: testResultsFiles: '**/python-test-*.xml' failTaskOnFailedTests: true condition: succeededOrFailed() - - task: AzureCLI@1 + - task: AzureCLI@2 displayName: 'Generate Codecov report' inputs: azureSubscription: 'MMLSpark Build' @@ -389,7 +389,7 @@ jobs: - template: templates/kv.yml - bash: curl https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz -o spark-3.2.0-bin-hadoop3.2.tgz displayName: Download spark - - task: AzureCLI@1 + - task: AzureCLI@2 displayName: 'Test R Code' timeoutInMinutes: 30 inputs: @@ -405,7 +405,7 @@ jobs: testResultsFiles: '**/r-test-*.xml' failTaskOnFailedTests: true condition: succeededOrFailed() - - task: AzureCLI@1 + - task: AzureCLI@2 displayName: 'Generate Codecov report' inputs: azureSubscription: 'MMLSpark Build' @@ -425,7 +425,7 @@ jobs: - template: 
templates/update_cli.yml - template: templates/conda.yml - template: templates/kv.yml - - task: AzureCLI@1 + - task: AzureCLI@2 displayName: 'Test Website Samples' timeoutInMinutes: 30 inputs: @@ -444,7 +444,7 @@ jobs: testResultsFiles: '**/website-test-result.xml' failTaskOnFailedTests: true condition: succeededOrFailed() - - task: AzureCLI@1 + - task: AzureCLI@2 displayName: 'Generate Codecov report' inputs: azureSubscription: 'MMLSpark Build' @@ -469,7 +469,7 @@ jobs: inputs: versionSpec: '16.x' displayName: 'Install Node.js' - - task: AzureCLI@1 + - task: AzureCLI@2 displayName: 'Convert notebooks to markdowns' inputs: azureSubscription: 'MMLSpark Build' @@ -579,7 +579,7 @@ jobs: PACKAGE: "vw" steps: #- template: templates/ivy_cache.yml - - task: AzureCLI@1 + - task: AzureCLI@2 displayName: 'Setup repo' inputs: azureSubscription: 'MMLSpark Build' @@ -587,7 +587,7 @@ jobs: inlineScript: | (timeout 30s pip install requests) || (echo "retrying" && timeout 30s pip install requests) (timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup) - - task: AzureCLI@1 + - task: AzureCLI@2 displayName: 'Unit Test' timeoutInMinutes: 90 inputs: @@ -608,7 +608,7 @@ jobs: testResultsFiles: '**/test-reports/TEST-*.xml' failTaskOnFailedTests: true condition: succeededOrFailed() - - task: AzureCLI@1 + - task: AzureCLI@2 displayName: 'Generate Codecov report' inputs: azureSubscription: 'MMLSpark Build' From 85166321ade5e0f3b82ea36b2505a6967432d1e6 Mon Sep 17 00:00:00 2001 From: Puneet Pruthi Date: Wed, 15 Jun 2022 13:53:29 -0700 Subject: [PATCH 02/11] fix nits --- .../synapse/ml/nbtest/SynapseUtilities.scala | 17 +++++------------ ...orest - Multivariate Anomaly Detection.ipynb | 3 +-- pipeline.yaml | 2 +- 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala index cc63cb7f8e..4f49a266e2 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala @@ -123,7 +123,7 @@ object SynapseUtilities { lazy val ArmToken: String = getAccessToken("https://management.azure.com/") val LineSeparator: String = sys.props("line.separator").toLowerCase // Platform agnostic (\r\n:windows, \n:linux) - val Folder = s"build_0.9.5-113-3519038c-SNAPSHOT/scripts" //s"build_${BuildInfo.version}/scripts" + val Folder = s"build_${BuildInfo.version}/scripts" val TimeoutInMillis: Int = 30 * 60 * 1000 // 30 minutes val StorageAccount: String = "mmlsparkbuildsynapse" val StorageContainer: String = "synapse" @@ -187,7 +187,7 @@ object SynapseUtilities { val dest = s"$Folder/${notebook.getName}" exec(s"az storage fs file upload " + s" -s ${notebook.getAbsolutePath} -p $dest -f $StorageContainer " + - s" --overwrite true " + + " --overwrite true " + s" --account-name $StorageAccount --account-key ${Secrets.SynapseStorageKey}") val abfssPath = s"abfss://$StorageContainer@$StorageAccount.dfs.core.windows.net/$dest" @@ -197,10 +197,7 @@ object SynapseUtilities { "org.scalactic:scalactic_2.12", "org.scalatest:scalatest_2.12", "org.slf4j:slf4j-api").mkString(",") - val packages: String = Seq( - "com.microsoft.azure:synapseml_2.12:0.9.5-113-3519038c-SNAPSHOT", - //"com.microsoft.azure:synapseml_2.12:${BuildInfo.version}" - "org.apache.spark:spark-avro_2.12:3.2.0").mkString(",") + val packages: String = 
"com.microsoft.azure:synapseml_2.12:${BuildInfo.version}" val runName = abfssPath.split('/').last.replace(".py", "") val livyPayload: String = s""" @@ -242,11 +239,8 @@ object SynapseUtilities { poolLocation: String, poolNodeSize: String, createdAtTime: String): String = { - var foobar: String = "notfound" - if (sys.env.getOrElse("ppruthienv", "asdf") != "asdf") - foobar = "found" - val buildId: String = sys.env.getOrElse("Build.BuildId", "unknown") - val buildNumber: String = sys.env.getOrElse("Build.BuildNumber", "unknown") + val buildId: String = sys.env.getOrElse("AdoBuildId", "unknown") + val buildNumber: String = sys.env.getOrElse("AdoBuildNumber", "unknown") s""" |{ | "name": "$bigDataPoolName", @@ -256,7 +250,6 @@ object SynapseUtilities { | "createdAt": "$createdAtTime", | "buildId": "$buildId", | "buildNumber": "$buildNumber", - | "foobar": "$foobar" | }, | "properties": { | "autoPause": { diff --git a/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb b/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb index 656da51b1f..55db6a7dbf 100644 --- a/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb +++ b/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb @@ -88,8 +88,7 @@ " from pyspark.sql import SparkSession\n", "\n", " spark = SparkSession.builder.getOrCreate()\n", - " from notebookutils.visualization import display\n", - " import org.apache.spark.sql.avro" + " from notebookutils.visualization import display" ], "metadata": { "collapsed": false, diff --git a/pipeline.yaml b/pipeline.yaml index 76f67cdf0b..9fb22d5a20 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -155,7 +155,7 @@ jobs: scriptLocation: inlineScript inlineScript: | source activate synapseml - ppruthienv=$(Build.BuildId) sbt "testOnly com.microsoft.azure.synapse.ml.nbtest.SynapseTests" + AdoBuildId=$(Build.BuildId) AdoBuildNumber=$(Build.BuildId) sbt "testOnly com.microsoft.azure.synapse.ml.nbtest.SynapseTests" condition: and(succeeded(), eq(variables.runTests, 'True')) - task: PublishTestResults@2 displayName: 'Publish Test Results' From bd8d70705992fd667002dcd67f4c523d5901bf4d Mon Sep 17 00:00:00 2001 From: Puneet Pruthi Date: Wed, 15 Jun 2022 14:00:19 -0700 Subject: [PATCH 03/11] cleanup for PR --- .../azure/synapse/ml/nbtest/SynapseTests.scala | 16 +++++++--------- ...ning - BiLSTM Medical Entity Extraction.ipynb | 13 +++++-------- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala index 2c0f6c03dc..cd7f02fa10 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala @@ -102,16 +102,14 @@ class SynapseTests extends TestBase { runCmd(activateCondaEnv ++ Seq("jupyter", "nbconvert", "--to", "python", "*.ipynb"), notebooksDir) -// println(notebo oksDir) val selectedPythonFiles: Array[File] = FileUtilities.recursiveListFiles(notebooksDir) .filter(_.getAbsolutePath.endsWith(".py")) -// .filter(_.getAbsolutePath.contains("HyperParameterTuning")) -// .filter(_.getAbsolutePath.contains("CyberML")) -// .filter(_.getAbsolutePath.contains("VowpalWabbitOverview")) - .filter(_.getAbsolutePath.contains("IsolationForest")) -// .filter(_.getAbsolutePath.contains("DeepLearning")) -// 
.filter(_.getAbsolutePath.contains("BiLSTM")) -// .filter(_.getAbsolutePath.contains("InterpretabilitySnowLeopardDetection")) + .filterNot(_.getAbsolutePath.contains("HyperParameterTuning")) + .filterNot(_.getAbsolutePath.contains("CyberML")) + .filterNot(_.getAbsolutePath.contains("VowpalWabbitOverview")) + .filterNot(_.getAbsolutePath.contains("IsolationForest")) + .filterNot(_.getAbsolutePath.contains("DeepLearning")) + .filterNot(_.getAbsolutePath.contains("InterpretabilitySnowLeopardDetection")) .sortBy(_.getAbsolutePath) val expectedPoolCount: Int = selectedPythonFiles.length @@ -156,7 +154,7 @@ class SynapseTests extends TestBase { failures.foreach(failure => println(failure.failed.get.getMessage)) } - //FileUtils.deleteDirectory(notebooksDir) + FileUtils.deleteDirectory(notebooksDir) super.afterAll() } } diff --git a/notebooks/features/other/DeepLearning - BiLSTM Medical Entity Extraction.ipynb b/notebooks/features/other/DeepLearning - BiLSTM Medical Entity Extraction.ipynb index dd36848a6d..a0b48da08d 100644 --- a/notebooks/features/other/DeepLearning - BiLSTM Medical Entity Extraction.ipynb +++ b/notebooks/features/other/DeepLearning - BiLSTM Medical Entity Extraction.ipynb @@ -40,6 +40,7 @@ "import os, tarfile, pickle\n", "import urllib.request\n", "import nltk\n", + "\n", "import os\n", "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", @@ -74,10 +75,9 @@ " dataDir = \"/dbfs/nltkdata\"\n", "\n", "d = ModelDownloader(spark, modelDir)\n", - "rmodelSchema = d.downloadByName(modelName)\n", + "modelSchema = d.downloadByName(modelName)\n", "nltk.download(\"punkt\", dataDir)\n", - "nltk.data.path.append(dataDir)\n", - "print(\"ppruthi hdinsight ends, dataDir\" + dataDir)" + "nltk.data.path.append(dataDir)" ] }, { @@ -89,16 +89,13 @@ }, "outputs": [], "source": [ - "print(\"ppruthi local starts\")\n", "modelName = \"BiLSTM\"\n", "modelDir = abspath(\"models\")\n", "if not os.path.exists(modelDir):\n", " os.makedirs(modelDir)\n", - "print(\"ppruthi:\" + modelDir)\n", "d = ModelDownloader(spark, \"file://\" + modelDir)\n", "modelSchema = d.downloadByName(modelName)\n", - "nltk.download(\"punkt\")\n", - "print(\"ppruthi local ends\")" + "nltk.download(\"punkt\")" ] }, { @@ -184,7 +181,7 @@ "source": [ "# Add the tokenizers to all worker nodes\n", "def prepNLTK(partition):\n", - " nltk.data.path.append(dataDir)\n", + " nltk.data.path.append(\"/dbfs/nltkdata\")\n", " return partition\n", "\n", "\n", From 1b2e60f4e70e907f476630240de1ad3ef0af2364 Mon Sep 17 00:00:00 2001 From: Puneet Pruthi Date: Wed, 15 Jun 2022 14:02:34 -0700 Subject: [PATCH 04/11] nit --- pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.yaml b/pipeline.yaml index 9fb22d5a20..23160135e6 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -155,7 +155,7 @@ jobs: scriptLocation: inlineScript inlineScript: | source activate synapseml - AdoBuildId=$(Build.BuildId) AdoBuildNumber=$(Build.BuildId) sbt "testOnly com.microsoft.azure.synapse.ml.nbtest.SynapseTests" + AdoBuildId=$(Build.BuildId) AdoBuildNumber=$(Build.BuildNumber) sbt "testOnly com.microsoft.azure.synapse.ml.nbtest.SynapseTests" condition: and(succeeded(), eq(variables.runTests, 'True')) - task: PublishTestResults@2 displayName: 'Publish Test Results' From af8b49ba9792515b22d7d56d331642b63fbf090f Mon Sep 17 00:00:00 2001 From: Puneet Pruthi Date: Wed, 15 Jun 2022 14:14:34 -0700 Subject: [PATCH 05/11] Add required argument to pipeline's AzureCLI tasks --- pipeline.yaml | 15 +++++++++++++++ 1 file 
changed, 15 insertions(+) diff --git a/pipeline.yaml b/pipeline.yaml index 23160135e6..07aecbbe01 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -40,6 +40,7 @@ jobs: inputs: azureSubscription: 'MMLSpark Build' scriptLocation: inlineScript + scriptType: bash inlineScript: 'sbt scalastyle test:scalastyle' - task: UsePythonVersion@0 inputs: @@ -116,6 +117,7 @@ jobs: inputs: azureSubscription: 'MMLSpark Build' scriptLocation: inlineScript + scriptType: bash inlineScript: 'sbt "testOnly com.microsoft.azure.synapse.ml.nbtest.DatabricksTests"' condition: and(succeeded(), eq(variables.runTests, 'True')) - task: PublishTestResults@2 @@ -153,6 +155,7 @@ jobs: inputs: azureSubscription: 'MMLSpark Build' scriptLocation: inlineScript + scriptType: bash inlineScript: | source activate synapseml AdoBuildId=$(Build.BuildId) AdoBuildNumber=$(Build.BuildNumber) sbt "testOnly com.microsoft.azure.synapse.ml.nbtest.SynapseTests" @@ -174,6 +177,7 @@ jobs: inputs: azureSubscription: 'MMLSpark Build' scriptLocation: inlineScript + scriptType: bash inlineScript: | VERSION=$(sbt "core/version" | tail -1 | cut -d' ' -f2 | sed 's/\x1b\[[0-9;]*m//g') echo '##vso[task.setvariable variable=version]'$VERSION @@ -347,6 +351,7 @@ jobs: inputs: azureSubscription: 'MMLSpark Build' scriptLocation: inlineScript + scriptType: bash inlineScript: | source activate synapseml (timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup) @@ -357,6 +362,7 @@ jobs: inputs: azureSubscription: 'MMLSpark Build' scriptLocation: inlineScript + scriptType: bash inlineScript: | source activate synapseml (sbt "project $(PACKAGE)" coverage testPython) || (sbt "project $(PACKAGE)" coverage testPython) || (sbt "project $(PACKAGE)" coverage testPython) @@ -371,6 +377,7 @@ jobs: inputs: azureSubscription: 'MMLSpark Build' scriptLocation: inlineScript + scriptType: bash inlineScript: 'sbt coverageReport' condition: succeededOrFailed() - template: templates/codecov.yml @@ -395,6 +402,7 @@ jobs: inputs: azureSubscription: 'MMLSpark Build' scriptLocation: inlineScript + scriptType: bash inlineScript: | source activate synapseml (timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup) @@ -410,6 +418,7 @@ jobs: inputs: azureSubscription: 'MMLSpark Build' scriptLocation: inlineScript + scriptType: bash inlineScript: 'sbt coverageReport' condition: succeededOrFailed() - template: templates/codecov.yml @@ -431,6 +440,7 @@ jobs: inputs: azureSubscription: 'MMLSpark Build' scriptLocation: inlineScript + scriptType: bash inlineScript: | set -e source activate synapseml @@ -449,6 +459,7 @@ jobs: inputs: azureSubscription: 'MMLSpark Build' scriptLocation: inlineScript + scriptType: bash inlineScript: 'sbt coverageReport' condition: succeededOrFailed() - template: templates/codecov.yml @@ -474,6 +485,7 @@ jobs: inputs: azureSubscription: 'MMLSpark Build' scriptLocation: inlineScript + scriptType: bash inlineScript: | source activate synapseml sbt convertNotebooks @@ -584,6 +596,7 @@ jobs: inputs: azureSubscription: 'MMLSpark Build' scriptLocation: inlineScript + scriptType: bash inlineScript: | (timeout 30s pip install requests) || (echo "retrying" && timeout 30s pip install requests) (timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup) @@ -593,6 +606,7 @@ jobs: inputs: azureSubscription: 'MMLSpark Build' scriptLocation: inlineScript + scriptType: bash inlineScript: | (${FFMPEG:-false} && sudo 
add-apt-repository ppa:jonathonf/ffmpeg-4 -y && \ sudo apt-get update && sudo apt-get install ffmpeg libgstreamer1.0-0 \ @@ -613,6 +627,7 @@ jobs: inputs: azureSubscription: 'MMLSpark Build' scriptLocation: inlineScript + scriptType: bash inlineScript: 'sbt coverageReport' condition: succeededOrFailed() - template: templates/kv.yml From 8de59cf9f26699da47eaca736e9cafc8e9de0d78 Mon Sep 17 00:00:00 2001 From: Puneet Pruthi Date: Wed, 15 Jun 2022 15:09:47 -0700 Subject: [PATCH 06/11] fix bugs --- .../microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala | 2 +- .../IsolationForest - Multivariate Anomaly Detection.ipynb | 2 +- .../Interpretability - Explanation Dashboard.ipynb | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala index 4f49a266e2..4ee51da4d6 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala @@ -197,7 +197,7 @@ object SynapseUtilities { "org.scalactic:scalactic_2.12", "org.scalatest:scalatest_2.12", "org.slf4j:slf4j-api").mkString(",") - val packages: String = "com.microsoft.azure:synapseml_2.12:${BuildInfo.version}" + val packages: String = s"com.microsoft.azure:synapseml_2.12:${BuildInfo.version}" val runName = abfssPath.split('/').last.replace(".py", "") val livyPayload: String = s""" diff --git a/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb b/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb index 55db6a7dbf..69459a27c3 100644 --- a/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb +++ b/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb @@ -75,7 +75,7 @@ "outputs": [], "source": [ "shell = TerminalInteractiveShell.instance()\n", - "shell.define_macro('foo', \"\"\"a,b=10,20\"\"\")\n", + "shell.define_macro(\"foo\", \"\"\"a,b=10,20\"\"\")\n", "%matplotlib inline" ] }, diff --git a/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb b/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb index c255a6ac9d..5190d54e8b 100644 --- a/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb +++ b/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb @@ -323,7 +323,7 @@ "outputs": [], "source": [ "shell = TerminalInteractiveShell.instance()\n", - "shell.define_macro('foo', \"\"\"a,b=10,20\"\"\")\n", + "shell.define_macro(\"foo\", \"\"\"a,b=10,20\"\"\")\n", "!pip install --upgrade raiwidgets\n", "!pip install itsdangerous==2.0.1" ] From 77379e1a5d0bd8b9db5f40c817fcaafdbdecac64 Mon Sep 17 00:00:00 2001 From: Puneet Pruthi Date: Wed, 15 Jun 2022 15:59:03 -0700 Subject: [PATCH 07/11] do interactive shell stuff only for Azure Spark Machine runtime --- .../IsolationForest - Multivariate Anomaly Detection.ipynb | 7 +++---- .../Interpretability - Explanation Dashboard.ipynb | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb b/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb index 69459a27c3..a5521ad879 100644 --- 
a/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb +++ b/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb @@ -74,8 +74,6 @@ }, "outputs": [], "source": [ - "shell = TerminalInteractiveShell.instance()\n", - "shell.define_macro(\"foo\", \"\"\"a,b=10,20\"\"\")\n", "%matplotlib inline" ] }, @@ -86,8 +84,9 @@ "source": [ "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", - "\n", " spark = SparkSession.builder.getOrCreate()\n", + " shell = TerminalInteractiveShell.instance()\n", + " shell.define_macro(\"foo\", \"\"\"a,b=10,20\"\"\")\n", " from notebookutils.visualization import display" ], "metadata": { @@ -1008,4 +1007,4 @@ }, "nbformat": 4, "nbformat_minor": 1 -} \ No newline at end of file +} diff --git a/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb b/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb index 5190d54e8b..18fbac8415 100644 --- a/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb +++ b/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb @@ -46,8 +46,9 @@ "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", - "\n", " spark = SparkSession.builder.getOrCreate()\n", + " shell = TerminalInteractiveShell.instance()\n", + " shell.define_macro('foo', \"\"\"a,b=10,20\"\"\")\n", " from notebookutils.visualization import display\n", "\n", "\n", @@ -322,8 +323,6 @@ }, "outputs": [], "source": [ - "shell = TerminalInteractiveShell.instance()\n", - "shell.define_macro(\"foo\", \"\"\"a,b=10,20\"\"\")\n", "!pip install --upgrade raiwidgets\n", "!pip install itsdangerous==2.0.1" ] @@ -478,4 +477,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} From 9a1f0503191cb8e6bacc4cc042ff8b9841f9db55 Mon Sep 17 00:00:00 2001 From: Puneet Pruthi Date: Wed, 15 Jun 2022 16:05:28 -0700 Subject: [PATCH 08/11] fix style check --- .../IsolationForest - Multivariate Anomaly Detection.ipynb | 1 + .../Interpretability - Explanation Dashboard.ipynb | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb b/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb index a5521ad879..98999b476b 100644 --- a/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb +++ b/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb @@ -84,6 +84,7 @@ "source": [ "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", " shell = TerminalInteractiveShell.instance()\n", " shell.define_macro(\"foo\", \"\"\"a,b=10,20\"\"\")\n", diff --git a/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb b/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb index 18fbac8415..70b3d91da0 100644 --- a/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb +++ b/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb @@ -46,9 +46,10 @@ "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql 
import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", " shell = TerminalInteractiveShell.instance()\n", - " shell.define_macro('foo', \"\"\"a,b=10,20\"\"\")\n", + " shell.define_macro(\"foo\", \"\"\"a,b=10,20\"\"\")\n", " from notebookutils.visualization import display\n", "\n", "\n", From d2ff80b553d8b309ebf470fbd06e88708ec4a0cb Mon Sep 17 00:00:00 2001 From: Puneet Pruthi Date: Thu, 16 Jun 2022 11:28:52 -0700 Subject: [PATCH 09/11] fix isolation forest notebook --- .../azure/synapse/ml/nbtest/SynapseUtilities.scala | 1 + ...tionForest - Multivariate Anomaly Detection.ipynb | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala index 4ee51da4d6..d8a02de3e3 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala @@ -294,6 +294,7 @@ object SynapseUtilities { sparkPools.foreach(sparkPool => { val name = sparkPool.name.stripPrefix(s"$WorkspaceName/") if (sparkPool.tags.contains("createdAt") && sparkPool.tags.contains("createdBy")) { + assert(name.stripPrefix(ClusterPrefix).length == dayAgoTsInMillis.toString.length) val creationTime = name.stripPrefix(ClusterPrefix).toLong if (creationTime <= dayAgoTsInMillis) { try { diff --git a/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb b/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb index 98999b476b..fb1f1635ae 100644 --- a/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb +++ b/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb @@ -365,11 +365,17 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, "outputs": [], "source": [ - "!pip install --upgrade sqlparse" - ] + "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", + " !pip install --upgrade sqlparse\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } }, { "cell_type": "code", From 0826b617813172d28ee1c220648d1bbe3e2a46da Mon Sep 17 00:00:00 2001 From: Puneet Pruthi Date: Thu, 16 Jun 2022 11:35:58 -0700 Subject: [PATCH 10/11] fix style checks --- build.sbt | 2 +- .../IsolationForest - Multivariate Anomaly Detection.ipynb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build.sbt b/build.sbt index d65dd94e4a..a3a3f72b44 100644 --- a/build.sbt +++ b/build.sbt @@ -33,7 +33,7 @@ val extraDependencies = Seq( "com.jcraft" % "jsch" % "0.1.54", "org.apache.httpcomponents" % "httpclient" % "4.5.6", "org.apache.httpcomponents" % "httpmime" % "4.5.6", - "com.linkedin.isolation-forest" %% "isolation-forest_3.2.0" % "3.0.0" + "com.linkedin.isolation-forest" %% "isolation-forest_3.2.0" % "2.0.8" ).map(d => d excludeAll (excludes: _*)) val dependencies = coreDependencies ++ extraDependencies diff --git a/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb b/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb index fb1f1635ae..e1a6fcb448 100644 --- a/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb +++ b/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb @@ 
-368,7 +368,7 @@ "outputs": [], "source": [ "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", - " !pip install --upgrade sqlparse\n" + " !pip install --upgrade sqlparse" ], "metadata": { "collapsed": false, From c94b2643be411867e8b67ebe4d746002375cd442 Mon Sep 17 00:00:00 2001 From: Mark Hamilton Date: Thu, 16 Jun 2022 16:14:10 -0400 Subject: [PATCH 11/11] Update core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala --- .../com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala index cd7f02fa10..9fe5ed49e4 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala @@ -154,7 +154,6 @@ class SynapseTests extends TestBase { failures.foreach(failure => println(failure.failed.get.getMessage)) } - FileUtils.deleteDirectory(notebooksDir) super.afterAll() } }