From d38c82455b28585cb35f6f4386a508ff4026f1d3 Mon Sep 17 00:00:00 2001
From: Puneet Pruthi <33642858+ppruthi@users.noreply.github.com>
Date: Thu, 16 Jun 2022 13:15:37 -0700
Subject: [PATCH] fix: remove exclusion for ExplanationDashboard notebook
 (#1531)

* remove test exclusions: draft

* fix nits

* cleanup for PR

* nit

* Add required argument to pipeline's AzureCLI tasks

* fix bugs

* do interactive shell stuff only for Azure Spark Machine runtime

* fix style check

* fix isolation forest notebook

* fix style checks

* Update core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala

Co-authored-by: Puneet Pruthi
---
 .../synapse/ml/nbtest/SynapseTests.scala      | 11 ++---
 .../synapse/ml/nbtest/SynapseUtilities.scala  | 12 +++--
 ...est - Multivariate Anomaly Detection.ipynb | 34 +++++++++-----
 ...rpretability - Explanation Dashboard.ipynb |  6 ++-
 pipeline.yaml                                 | 47 ++++++++++++-------
 5 files changed, 72 insertions(+), 38 deletions(-)

diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala
index 559431eeac..9fe5ed49e4 100644
--- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala
+++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala
@@ -46,7 +46,6 @@ class SynapseTests extends TestBase {

   val resourcesDirectory = new File(getClass.getResource("/").toURI)
   val notebooksDir = new File(resourcesDirectory, "generated-notebooks")
-  println(s"Notebooks dir: $notebooksDir")
   FileUtils.deleteDirectory(notebooksDir)
   assert(notebooksDir.mkdirs())

@@ -109,17 +108,17 @@ class SynapseTests extends TestBase {
     .filterNot(_.getAbsolutePath.contains("CyberML"))
     .filterNot(_.getAbsolutePath.contains("VowpalWabbitOverview"))
     .filterNot(_.getAbsolutePath.contains("IsolationForest"))
-    .filterNot(_.getAbsolutePath.contains("ExplanationDashboard"))
     .filterNot(_.getAbsolutePath.contains("DeepLearning"))
     .filterNot(_.getAbsolutePath.contains("InterpretabilitySnowLeopardDetection"))
     .sortBy(_.getAbsolutePath)

-  selectedPythonFiles.foreach(println)
-  assert(selectedPythonFiles.length > 1)
-
   val expectedPoolCount: Int = selectedPythonFiles.length

-  println("SynapseTests E2E Test Suite starting...")
+  assert(expectedPoolCount >= 1)
+  println(s"SynapseTests E2E Test Suite starting on ${expectedPoolCount} notebook(s)...")
+  selectedPythonFiles.foreach(println)
+
+  // Cleanup old stray spark pools lying around due to ungraceful test shutdown
   tryDeleteOldSparkPools()

   println(s"Creating $expectedPoolCount Spark Pools...")
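For reference, the selection logic above could be driven by a single denylist rather than a chain of filterNot calls. This is a minimal sketch only, not part of the patch; `selectNotebooks` and `allNotebooks` are hypothetical names standing in for the suite's own wiring:

    import java.io.File

    // Hypothetical refactoring sketch: one denylist instead of chained filterNot calls.
    val exclusions = Seq("CyberML", "VowpalWabbitOverview", "IsolationForest",
      "DeepLearning", "InterpretabilitySnowLeopardDetection")

    def selectNotebooks(allNotebooks: Seq[File]): Seq[File] =
      allNotebooks
        .filterNot(f => exclusions.exists(f.getAbsolutePath.contains))
        .sortBy(_.getAbsolutePath)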
diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala
index fa3e08888a..d8a02de3e3 100644
--- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala
+++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala
@@ -187,7 +187,7 @@ object SynapseUtilities {
     val dest = s"$Folder/${notebook.getName}"
     exec(s"az storage fs file upload " +
       s" -s ${notebook.getAbsolutePath} -p $dest -f $StorageContainer " +
-      s" --overwrite true " +
+      " --overwrite true " +
       s" --account-name $StorageAccount --account-key ${Secrets.SynapseStorageKey}")

     val abfssPath = s"abfss://$StorageContainer@$StorageAccount.dfs.core.windows.net/$dest"
@@ -197,6 +197,7 @@ object SynapseUtilities {
       "org.scalactic:scalactic_2.12",
       "org.scalatest:scalatest_2.12",
       "org.slf4j:slf4j-api").mkString(",")
+    val packages: String = s"com.microsoft.azure:synapseml_2.12:${BuildInfo.version}"
     val runName = abfssPath.split('/').last.replace(".py", "")
     val livyPayload: String =
       s"""
@@ -210,7 +211,7 @@ object SynapseUtilities {
        | "numExecutors" : 2,
        | "conf" :
        |   {
-       |     "spark.jars.packages" : "com.microsoft.azure:synapseml_2.12:${BuildInfo.version}",
+       |     "spark.jars.packages" : "$packages",
        |     "spark.jars.repositories" : "https://mmlspark.azureedge.net/maven",
        |     "spark.jars.excludes": "$excludes",
        |     "spark.driver.userClassPathFirst": "true",
@@ -238,13 +239,17 @@ object SynapseUtilities {
                          poolLocation: String,
                          poolNodeSize: String,
                          createdAtTime: String): String = {
+    val buildId: String = sys.env.getOrElse("AdoBuildId", "unknown")
+    val buildNumber: String = sys.env.getOrElse("AdoBuildNumber", "unknown")
     s"""
       |{
       | "name": "$bigDataPoolName",
       | "location": "$poolLocation",
       | "tags": {
       |   "createdBy": "SynapseE2E Tests",
-      |   "createdAt": "$createdAtTime"
+      |   "createdAt": "$createdAtTime",
+      |   "buildId": "$buildId",
+      |   "buildNumber": "$buildNumber"
       | },
       | "properties": {
       |   "autoPause": {
@@ -289,6 +294,7 @@ object SynapseUtilities {
     sparkPools.foreach(sparkPool => {
       val name = sparkPool.name.stripPrefix(s"$WorkspaceName/")
       if (sparkPool.tags.contains("createdAt") && sparkPool.tags.contains("createdBy")) {
+        assert(name.stripPrefix(ClusterPrefix).length == dayAgoTsInMillis.toString.length)
         val creationTime = name.stripPrefix(ClusterPrefix).toLong
         if (creationTime <= dayAgoTsInMillis) {
           try {
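The assert added in the last hunk guards the `toLong` parse: pool names are expected to be the cluster prefix followed by an epoch-millis creation timestamp, and a suffix of any other length would either fail the parse or match the wrong pools. A minimal sketch of that guard as a standalone predicate, assuming the same naming convention; `isStalePool` and the prefix value shown are hypothetical, not part of the patch:

    // Hypothetical standalone version of the stale-pool check; the real suite
    // derives ClusterPrefix and dayAgoTsInMillis from its own configuration.
    val ClusterPrefix = "tc-" // stand-in value

    def isStalePool(poolName: String, dayAgoTsInMillis: Long): Boolean = {
      val suffix = poolName.stripPrefix(ClusterPrefix)
      // Same idea as the assert above: only trust suffixes that look like an
      // epoch-millis timestamp before calling toLong.
      suffix.length == dayAgoTsInMillis.toString.length &&
        suffix.forall(_.isDigit) &&
        suffix.toLong <= dayAgoTsInMillis
    }

    // e.g. isStalePool("tc-1655400000000", System.currentTimeMillis() - 24L * 60 * 60 * 1000)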
diff --git a/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb b/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb
index 780ce7725f..e1a6fcb448 100644
--- a/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb
+++ b/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb
@@ -45,6 +45,8 @@
    "outputs": [],
    "source": [
     "import os\n",
+    "from IPython import get_ipython\n",
+    "from IPython.terminal.interactiveshell import TerminalInteractiveShell\n",
     "import uuid\n",
     "import mlflow\n",
     "import matplotlib.pyplot as plt\n",
@@ -84,6 +86,8 @@
     "    from pyspark.sql import SparkSession\n",
     "\n",
     "    spark = SparkSession.builder.getOrCreate()\n",
+    "    shell = TerminalInteractiveShell.instance()\n",
+    "    shell.define_macro(\"foo\", \"\"\"a,b=10,20\"\"\")\n",
     "    from notebookutils.visualization import display"
    ],
    "metadata": {
@@ -353,11 +357,26 @@
    }
   },
   "source": [
-    "Next, we create an ML pipeline to train the Isolation Forest model. We also demonstrate how to create an MLFlow experiement and register the trained model.\n",
+    "Next, we create an ML pipeline to train the Isolation Forest model. We also demonstrate how to create an MLFlow experiment and register the trained model.\n",
    "\n",
    "Note that MLFlow model registration is strictly only required if accessing the trained model at a later time. For training the model, and performing inferencing in the same notebook, the model object is sufficient."
   ]
  },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "outputs": [],
+  "source": [
+   "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
+   "    !pip install --upgrade sqlparse"
+  ],
+  "metadata": {
+   "collapsed": false,
+   "pycharm": {
+    "name": "#%%\n"
+   }
+  }
+ },
 {
  "cell_type": "code",
  "execution_count": null,
@@ -637,16 +656,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-   "!pip install --upgrade raiwidgets"
-  ]
- },
- {
-  "cell_type": "code",
-  "execution_count": null,
-  "metadata": {},
-  "outputs": [],
-  "source": [
-   "!pip install --upgrade interpret-community"
+   "!pip install --upgrade raiwidgets interpret-community"
   ]
  },
@@ -1004,4 +1014,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 1
-}
\ No newline at end of file
+}
diff --git a/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb b/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb
index 7b1519b4d0..70b3d91da0 100644
--- a/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb
+++ b/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb
@@ -33,6 +33,8 @@
    "outputs": [],
    "source": [
     "import pyspark\n",
+    "from IPython import get_ipython\n",
+    "from IPython.terminal.interactiveshell import TerminalInteractiveShell\n",
     "from synapse.ml.explainers import *\n",
     "from pyspark.ml import Pipeline\n",
     "from pyspark.ml.classification import LogisticRegression\n",
@@ -46,6 +48,8 @@
     "    from pyspark.sql import SparkSession\n",
     "\n",
     "    spark = SparkSession.builder.getOrCreate()\n",
+    "    shell = TerminalInteractiveShell.instance()\n",
+    "    shell.define_macro(\"foo\", \"\"\"a,b=10,20\"\"\")\n",
     "    from notebookutils.visualization import display\n",
     "\n",
     "\n",
@@ -474,4 +478,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
\ No newline at end of file
+}
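Both notebooks gate the interactive-shell setup on the `AZURE_SERVICE` environment variable, which the Synapse (Project Arcadia) runtime sets to `Microsoft.ProjectArcadia`. For reference, the same runtime check expressed in Scala; this is a sketch only, and the `runningOnSynapse` name is hypothetical:

    // True only on the Azure Synapse (Project Arcadia) runtime, mirroring the
    // Python check: os.environ.get("AZURE_SERVICE", None) == "Microsoft.ProjectArcadia"
    val runningOnSynapse: Boolean =
      sys.env.get("AZURE_SERVICE").contains("Microsoft.ProjectArcadia")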
diff --git a/pipeline.yaml b/pipeline.yaml
index 17c1e73527..a0dc65c7cd 100644
--- a/pipeline.yaml
+++ b/pipeline.yaml
@@ -35,11 +35,12 @@ jobs:
   pool:
     vmImage: ubuntu-18.04
   steps:
-  - task: AzureCLI@1
+  - task: AzureCLI@2
     displayName: 'Scala Style Check'
     inputs:
       azureSubscription: 'MMLSpark Build'
       scriptLocation: inlineScript
+      scriptType: bash
       inlineScript: 'sbt scalastyle test:scalastyle'
   - task: UsePythonVersion@0
     inputs:
@@ -111,11 +112,12 @@ jobs:
       PGP-PRIVATE: $(pgp-private)
       PGP-PUBLIC: $(pgp-public)
       PGP-PW: $(pgp-pw)
-  - task: AzureCLI@1
+  - task: AzureCLI@2
     displayName: 'E2E'
     inputs:
       azureSubscription: 'MMLSpark Build'
       scriptLocation: inlineScript
+      scriptType: bash
       inlineScript: 'sbt "testOnly com.microsoft.azure.synapse.ml.nbtest.DatabricksTests"'
     condition: and(succeeded(), eq(variables.runTests, 'True'))
   - task: PublishTestResults@2
@@ -148,14 +150,15 @@ jobs:
       PGP-PRIVATE: $(pgp-private)
       PGP-PUBLIC: $(pgp-public)
       PGP-PW: $(pgp-pw)
-  - task: AzureCLI@1
+  - task: AzureCLI@2
     displayName: 'E2E'
     inputs:
       azureSubscription: 'MMLSpark Build'
       scriptLocation: inlineScript
+      scriptType: bash
       inlineScript: |
         source activate synapseml
-        sbt "testOnly com.microsoft.azure.synapse.ml.nbtest.SynapseTests"
+        AdoBuildId=$(Build.BuildId) AdoBuildNumber=$(Build.BuildNumber) sbt "testOnly com.microsoft.azure.synapse.ml.nbtest.SynapseTests"
     condition: and(succeeded(), eq(variables.runTests, 'True'))
   - task: PublishTestResults@2
     displayName: 'Publish Test Results'
@@ -169,11 +172,12 @@ jobs:
   pool:
     vmImage: ubuntu-18.04
   steps:
-  - task: AzureCLI@1
+  - task: AzureCLI@2
     displayName: 'Get Docker Tag + Version'
    inputs:
       azureSubscription: 'MMLSpark Build'
       scriptLocation: inlineScript
+      scriptType: bash
       inlineScript: |
         VERSION=$(sbt "core/version" | tail -1 | cut -d' ' -f2 | sed 's/\x1b\[[0-9;]*m//g')
         echo '##vso[task.setvariable variable=version]'$VERSION
@@ -341,22 +345,24 @@ jobs:
   - template: templates/update_cli.yml
   - template: templates/conda.yml
   - template: templates/kv.yml
-  - task: AzureCLI@1
+  - task: AzureCLI@2
     displayName: 'Install Pip Package'
     timeoutInMinutes: 10
     inputs:
       azureSubscription: 'MMLSpark Build'
       scriptLocation: inlineScript
+      scriptType: bash
       inlineScript: |
         source activate synapseml
         (timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup)
         sbt installPipPackage
-  - task: AzureCLI@1
+  - task: AzureCLI@2
     displayName: 'Test Python Code'
     timeoutInMinutes: 40
     inputs:
       azureSubscription: 'MMLSpark Build'
       scriptLocation: inlineScript
+      scriptType: bash
       inlineScript: |
         source activate synapseml
         (sbt "project $(PACKAGE)" coverage testPython) || (sbt "project $(PACKAGE)" coverage testPython) || (sbt "project $(PACKAGE)" coverage testPython)
@@ -366,11 +372,12 @@ jobs:
       testResultsFiles: '**/python-test-*.xml'
       failTaskOnFailedTests: true
     condition: succeededOrFailed()
-  - task: AzureCLI@1
+  - task: AzureCLI@2
     displayName: 'Generate Codecov report'
     inputs:
       azureSubscription: 'MMLSpark Build'
       scriptLocation: inlineScript
+      scriptType: bash
       inlineScript: 'sbt coverageReport'
     condition: succeededOrFailed()
   - template: templates/codecov.yml
@@ -389,12 +396,13 @@ jobs:
   - template: templates/kv.yml
   - bash: curl https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz -o spark-3.2.0-bin-hadoop3.2.tgz
     displayName: Download spark
-  - task: AzureCLI@1
+  - task: AzureCLI@2
     displayName: 'Test R Code'
     timeoutInMinutes: 30
     inputs:
       azureSubscription: 'MMLSpark Build'
       scriptLocation: inlineScript
+      scriptType: bash
       inlineScript: |
         source activate synapseml
         (timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup)
@@ -405,11 +413,12 @@ jobs:
       testResultsFiles: '**/r-test-*.xml'
       failTaskOnFailedTests: true
     condition: succeededOrFailed()
-  - task: AzureCLI@1
+  - task: AzureCLI@2
     displayName: 'Generate Codecov report'
     inputs:
       azureSubscription: 'MMLSpark Build'
       scriptLocation: inlineScript
+      scriptType: bash
       inlineScript: 'sbt coverageReport'
     condition: succeededOrFailed()
   - template: templates/codecov.yml
@@ -425,12 +434,13 @@ jobs:
   - template: templates/update_cli.yml
   - template: templates/conda.yml
   - template: templates/kv.yml
-  - task: AzureCLI@1
+  - task: AzureCLI@2
     displayName: 'Test Website Samples'
     timeoutInMinutes: 30
     inputs:
       azureSubscription: 'MMLSpark Build'
       scriptLocation: inlineScript
+      scriptType: bash
       inlineScript: |
         set -e
         source activate synapseml
@@ -444,11 +454,12 @@ jobs:
       testResultsFiles: '**/website-test-result.xml'
       failTaskOnFailedTests: true
     condition: succeededOrFailed()
-  - task: AzureCLI@1
+  - task: AzureCLI@2
     displayName: 'Generate Codecov report'
     inputs:
       azureSubscription: 'MMLSpark Build'
       scriptLocation: inlineScript
+      scriptType: bash
       inlineScript: 'sbt coverageReport'
     condition: succeededOrFailed()
   - template: templates/codecov.yml
@@ -469,11 +480,12 @@ jobs:
     inputs:
       versionSpec: '16.x'
     displayName: 'Install Node.js'
-  - task: AzureCLI@1
+  - task: AzureCLI@2
     displayName: 'Convert notebooks to markdowns'
     inputs:
       azureSubscription: 'MMLSpark Build'
       scriptLocation: inlineScript
+      scriptType: bash
       inlineScript: |
         source activate synapseml
         sbt convertNotebooks
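The Synapse E2E step above now exports `AdoBuildId` and `AdoBuildNumber` as inline environment variables on the `sbt` invocation, which is how the `sys.env.getOrElse` calls added to SynapseUtilities.scala pick them up inside the JVM. A sketch of the receiving side, assuming only those variable names; the println is illustrative, not from the patch:

    // Inline env assignments from the pipeline step:
    //   AdoBuildId=$(Build.BuildId) AdoBuildNumber=$(Build.BuildNumber) sbt ...
    // default to "unknown" when running outside Azure DevOps.
    val buildId: String = sys.env.getOrElse("AdoBuildId", "unknown")
    val buildNumber: String = sys.env.getOrElse("AdoBuildNumber", "unknown")
    println(s"Tagging Spark pools with buildId=$buildId, buildNumber=$buildNumber")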
@@ -579,20 +591,22 @@ jobs:
         PACKAGE: "vw"
   steps:
   #- template: templates/ivy_cache.yml
-  - task: AzureCLI@1
+  - task: AzureCLI@2
     displayName: 'Setup repo'
     inputs:
       azureSubscription: 'MMLSpark Build'
       scriptLocation: inlineScript
+      scriptType: bash
       inlineScript: |
         (timeout 30s pip install requests) || (echo "retrying" && timeout 30s pip install requests)
         (timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup) || (echo "retrying" && timeout 5m sbt setup)
-  - task: AzureCLI@1
+  - task: AzureCLI@2
     displayName: 'Unit Test'
     timeoutInMinutes: 90
     inputs:
       azureSubscription: 'MMLSpark Build'
       scriptLocation: inlineScript
+      scriptType: bash
       inlineScript: |
         (${FFMPEG:-false} && sudo add-apt-repository ppa:jonathonf/ffmpeg-4 -y && \
         sudo apt-get update && sudo apt-get install ffmpeg libgstreamer1.0-0 \
@@ -608,11 +622,12 @@ jobs:
       testResultsFiles: '**/test-reports/TEST-*.xml'
       failTaskOnFailedTests: true
     condition: succeededOrFailed()
-  - task: AzureCLI@1
+  - task: AzureCLI@2
     displayName: 'Generate Codecov report'
     inputs:
       azureSubscription: 'MMLSpark Build'
       scriptLocation: inlineScript
+      scriptType: bash
       inlineScript: 'sbt coverageReport'
     condition: succeededOrFailed()
   - template: templates/kv.yml
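Several steps above absorb transient sbt and pip failures with the bash idiom `(cmd) || (echo "retrying" && cmd) || ...`. For reference, a sketch of the equivalent pattern inside the Scala harness, assuming nothing beyond the standard library; `Retry` is a hypothetical helper and is not part of this patch:

    import scala.util.{Failure, Success, Try}

    object Retry {
      // Run `body` up to `attempts` times, logging and retrying on failure,
      // mirroring the pipeline's `(cmd) || (echo "retrying" && cmd)` chains.
      @scala.annotation.tailrec
      def retry[T](attempts: Int)(body: => T): T = Try(body) match {
        case Success(value) => value
        case Failure(e) if attempts > 1 =>
          println(s"retrying after: ${e.getMessage}")
          retry(attempts - 1)(body)
        case Failure(e) => throw e
      }
    }

    // e.g. Retry.retry(3)(exec("sbt setup")), with exec as used elsewhere in the suite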