diff --git a/.gitignore b/.gitignore index 8fd3247cec1..ec1a9ac15e9 100644 --- a/.gitignore +++ b/.gitignore @@ -47,4 +47,7 @@ node_modules/ .Rproj.user # R output -*.Rout \ No newline at end of file +*.Rout + +# Misc +.bsp diff --git a/build.sbt b/build.sbt index 0d10df561f6..001a2e02916 100644 --- a/build.sbt +++ b/build.sbt @@ -2,21 +2,16 @@ import java.io.{File, PrintWriter} import java.net.URL import org.apache.commons.io.FileUtils import sbt.ExclusionRule -import sbt.internal.util.ManagedLogger - import scala.xml.{Node => XmlNode, NodeSeq => XmlNodeSeq, _} import scala.xml.transform.{RewriteRule, RuleTransformer} -import scala.sys.process.Process import BuildUtils._ +import CodegenPlugin.autoImport.publishPython +import xerial.sbt.Sonatype._ -val condaEnvName = "mmlspark" -name := "mmlspark" -organization := "com.microsoft.ml.spark" -scalaVersion := "2.12.10" +ThisBuild / organization := "com.microsoft.ml.spark" +ThisBuild / scalaVersion := "2.12.10" val sparkVersion = "3.0.1" -//val scalaMajorVersion = settingKey[String]("scalaMajorVersion") -//scalaMajorVersion := {scalaVersion.value.split(".".toCharArray).dropRight(0).mkString(".")} val scalaMajorVersion = 2.12 val excludes = Seq( @@ -24,42 +19,28 @@ val excludes = Seq( ExclusionRule("org.scalatest") ) -libraryDependencies ++= Seq( +val coreDependencies = Seq( "org.apache.spark" %% "spark-core" % sparkVersion % "compile", "org.apache.spark" %% "spark-mllib" % sparkVersion % "compile", "org.apache.spark" %% "spark-avro" % sparkVersion % "provided", "org.apache.spark" %% "spark-tags" % sparkVersion % "test", "org.scalatest" %% "scalatest" % "3.0.5" % "test") - -libraryDependencies ++= Seq( +val extraDependencies = Seq( "org.scalactic" %% "scalactic" % "3.0.5", "io.spray" %% "spray-json" % "1.3.2", - "com.microsoft.cntk" % "cntk" % "2.4", - "org.openpnp" % "opencv" % "3.2.0-1", "com.jcraft" % "jsch" % "0.1.54", - "com.microsoft.cognitiveservices.speech" % "client-sdk" % "1.14.0", "org.apache.httpcomponents" % 
"httpclient" % "4.5.6", "org.apache.httpcomponents" % "httpmime" % "4.5.6", - "com.microsoft.ml.lightgbm" % "lightgbmlib" % "3.2.110", - "com.github.vowpalwabbit" % "vw-jni" % "8.9.1", "com.linkedin.isolation-forest" %% "isolation-forest_3.0.0" % "1.0.1", ).map(d => d excludeAll (excludes: _*)) +val dependencies = coreDependencies ++ extraDependencies def txt(e: Elem, label: String): String = "\"" + e.child.filter(_.label == label).flatMap(_.text).mkString + "\"" -def activateCondaEnv: Seq[String] = { - if (sys.props("os.name").toLowerCase.contains("windows")) { - osPrefix ++ Seq("activate", condaEnvName, "&&") - } else { - Seq() - //TODO figure out why this doesent work - //Seq("/bin/bash", "-l", "-c", "source activate " + condaEnvName, "&&") - } -} - val omittedDeps = Set(s"spark-core_${scalaMajorVersion}", s"spark-mllib_${scalaMajorVersion}", "org.scala-lang") // skip dependency elements with a scope -pomPostProcess := { (node: XmlNode) => + +def pomPostFunc(node: XmlNode): scala.xml.Node = { new RuleTransformer(new RewriteRule { override def transform(node: XmlNode): XmlNodeSeq = node match { case e: Elem if e.label == "dependency" @@ -77,191 +58,17 @@ pomPostProcess := { (node: XmlNode) => }).transform(node).head } -resolvers += "Speech" at "https://mmlspark.blob.core.windows.net/maven/" - -val createCondaEnvTask = TaskKey[Unit]("createCondaEnv", "create conda env") -createCondaEnvTask := { - val s = streams.value - val hasEnv = Process("conda env list").lineStream.toList - .map(_.split("\\s+").head).contains(condaEnvName) - if (!hasEnv) { - runCmd(Seq("conda", "env", "create", "-f", "environment.yaml")) - } else { - println("Found conda env " + condaEnvName) - } -} - -val condaEnvLocation = TaskKey[String]("condaEnvLocation", "get install location of conda env") -condaEnvLocation := { - val s = streams.value - createCondaEnvTask.value - Process("conda env list").lineStream.toList - .map(_.split("\\s+")) - .map(l => (l.head, l.reverse.head)) - .filter(p => 
p._1 == condaEnvName) - .head._2 -} - - -val cleanCondaEnvTask = TaskKey[Unit]("cleanCondaEnv", "create conda env") -cleanCondaEnvTask := { - runCmd(Seq("conda", "env", "remove", "--name", condaEnvName, "-y")) -} - -val codegenTask = TaskKey[Unit]("codegen", "Generate Code") -codegenTask := { - (runMain in Test).toTask(" com.microsoft.ml.spark.codegen.CodeGen").value -} - -val testgenTask = TaskKey[Unit]("testgen", "Generate Tests") -testgenTask := { - (runMain in Test).toTask(" com.microsoft.ml.spark.codegen.TestGen").value -} - -val genDir = join("target", s"scala-${scalaMajorVersion}", "generated") -val unidocDir = join("target", s"scala-${scalaMajorVersion}", "unidoc") -val pythonSrcDir = join(genDir.toString, "src", "python") -val unifiedDocDir = join(genDir.toString, "doc") -val pythonDocDir = join(unifiedDocDir.toString, "pyspark") -val pythonPackageDir = join(genDir.toString, "package", "python") -val pythonTestDir = join(genDir.toString, "test", "python") -val rSrcDir = join(genDir.toString, "src", "R", "mmlspark") -val rPackageDir = join(genDir.toString, "package", "R") - -val pythonizedVersion = settingKey[String]("Pythonized version") -pythonizedVersion := { - if (version.value.contains("-")) { - version.value.split("-".head).head + ".dev1" - } else { - version.value - } -} - -val rVersion = settingKey[String]("R version") -rVersion := { - if (version.value.contains("-")) { - version.value.split("-".head).head - } else { - version.value - } -} - -def rCmd(cmd: Seq[String], wd: File, libPath: String): Unit = { - runCmd(activateCondaEnv ++ cmd, wd, Map("R_LIBS" -> libPath, "R_USER_LIBS" -> libPath)) -} - -val packageR = TaskKey[Unit]("packageR", "Generate roxygen docs and zip R package") -packageR := { - createCondaEnvTask.value - codegenTask.value - val libPath = join(condaEnvLocation.value, "Lib", "R", "library").toString - rCmd(Seq("R", "-q", "-e", "roxygen2::roxygenise()"), rSrcDir, libPath) - rPackageDir.mkdirs() - zipFolder(rSrcDir, new 
File(rPackageDir, s"mmlspark-${version.value}.zip")) -} - -val testR = TaskKey[Unit]("testR", "Run testthat on R tests") -testR := { - packageR.value - publishLocal.value - val libPath = join(condaEnvLocation.value, "Lib", "R", "library").toString - rCmd(Seq("R", "CMD", "INSTALL", "--no-multiarch", "--with-keep.source", "mmlspark"), rSrcDir.getParentFile, libPath) - val testRunner = join("tools", "tests", "run_r_tests.R").getAbsolutePath - rCmd(Seq("Rscript", testRunner), rSrcDir, libPath) -} - -val publishR = TaskKey[Unit]("publishR", "publish R package to blob") -publishR := { - codegenTask.value - packageR.value - val rPackage = rPackageDir.listFiles().head - singleUploadToBlob(rPackage.toString, rPackage.getName, "rrr") -} - -val packagePythonTask = TaskKey[Unit]("packagePython", "Package python sdk") -packagePythonTask := { - codegenTask.value - createCondaEnvTask.value - val destPyDir = join("target", s"scala-${scalaMajorVersion}", "classes", "mmlspark") - if (destPyDir.exists()) FileUtils.forceDelete(destPyDir) - FileUtils.copyDirectory(join(pythonSrcDir.getAbsolutePath, "mmlspark"), destPyDir) - runCmd( - activateCondaEnv ++ - Seq(s"python", "setup.py", "bdist_wheel", "--universal", "-d", s"${pythonPackageDir.absolutePath}"), - pythonSrcDir) -} - -val installPipPackageTask = TaskKey[Unit]("installPipPackage", "install python sdk") -installPipPackageTask := { - packagePythonTask.value - publishLocal.value - runCmd( - activateCondaEnv ++ Seq("pip", "install", "-I", - s"mmlspark-${pythonizedVersion.value}-py2.py3-none-any.whl"), - pythonPackageDir) -} - -val generatePythonDoc = TaskKey[Unit]("generatePythonDoc", "Generate sphinx docs for python") -generatePythonDoc := { - installPipPackageTask.value - runCmd(activateCondaEnv ++ Seq("sphinx-apidoc", "-f", "-o", "doc", "."), - join(pythonSrcDir.toString, "mmlspark")) - runCmd(activateCondaEnv ++ Seq("sphinx-build", "-b", "html", "doc", "../../../doc/pyspark"), - join(pythonSrcDir.toString, "mmlspark")) -} - -val 
publishDocs = TaskKey[Unit]("publishDocs", "publish docs for scala and python") -publishDocs := { - generatePythonDoc.value - (Compile / unidoc).value - val html = - """ - |
-      |pyspark/
-      |scala/
-      |
- """.stripMargin - val scalaDir = join(unifiedDocDir.toString, "scala") - if (scalaDir.exists()) FileUtils.forceDelete(scalaDir) - FileUtils.copyDirectory(unidocDir, scalaDir) - FileUtils.writeStringToFile(join(unifiedDocDir.toString, "index.html"), html, "utf-8") - uploadToBlob(unifiedDocDir.toString, version.value, "docs") -} - -val publishPython = TaskKey[Unit]("publishPython", "publish python wheel") -publishPython := { - publishLocal.value - packagePythonTask.value - singleUploadToBlob( - join(pythonPackageDir.toString, s"mmlspark-${pythonizedVersion.value}-py2.py3-none-any.whl").toString, - version.value + s"/mmlspark-${pythonizedVersion.value}-py2.py3-none-any.whl", - "pip") -} +pomPostProcess := pomPostFunc -val testPythonTask = TaskKey[Unit]("testPython", "test python sdk") - -testPythonTask := { - installPipPackageTask.value - testgenTask.value - runCmd( - activateCondaEnv ++ Seq("python", - "-m", - "pytest", - "--cov=mmlspark", - "--junitxml=../../../../python-test-results.xml", - "--cov-report=xml", - "mmlsparktest" - ), - new File(s"target/scala-${scalaMajorVersion}/generated/test/python/") - ) -} +val speechResolver = "Speech" at "https://mmlspark.blob.core.windows.net/maven/" val getDatasetsTask = TaskKey[Unit]("getDatasets", "download datasets used for testing") val datasetName = "datasets-2020-08-27.tgz" val datasetUrl = new URL(s"https://mmlspark.blob.core.windows.net/installers/$datasetName") val datasetDir = settingKey[File]("The directory that holds the dataset") -datasetDir := { - join(target.value.toString, s"scala-${scalaMajorVersion}", "datasets", datasetName.split(".".toCharArray.head).head) +ThisBuild / datasetDir := { + join(artifactPath.in(packageBin).in(Compile).value.getParentFile, + "datasets", datasetName.split(".".toCharArray.head).head) } getDatasetsTask := { @@ -276,49 +83,40 @@ getDatasetsTask := { val genBuildInfo = TaskKey[Unit]("genBuildInfo", "generate a build info file") genBuildInfo := { - val buildInfo = + val docInfo = 
s""" - |MMLSpark Build and Release Information - |--------------- - | - |### Maven Coordinates - | `${organization.value}:${name.value}_${scalaMajorVersion}:${version.value}` - | - |### Maven Resolver - | `https://mmlspark.azureedge.net/maven` | |### Documentation Pages: |[Scala Documentation](https://mmlspark.blob.core.windows.net/docs/${version.value}/scala/index.html) |[Python Documentation](https://mmlspark.blob.core.windows.net/docs/${version.value}/pyspark/index.html) | """.stripMargin + val buildInfo = (root / blobArtifactInfo).value + docInfo val infoFile = join("target", "Build.md") if (infoFile.exists()) FileUtils.forceDelete(infoFile) FileUtils.writeStringToFile(infoFile, buildInfo, "utf-8") } -val setupTask = TaskKey[Unit]("setup", "set up library for intellij") -setupTask := { - (Compile / compile).toTask.value - (Test / compile).toTask.value - getDatasetsTask.value +val unidocDir = join("target", s"scala-${scalaMajorVersion}", "unidoc") +val publishDocs = TaskKey[Unit]("publishDocs", "publish docs for scala and python") +publishDocs := { + (root / generatePythonDoc).value + (root / Compile / unidoc).value + val html = + """ + |
+      |pyspark/
+      |scala/
+      |
+ """.stripMargin + val unifiedDocDir = join((root / codegenDir).value, "doc") + val scalaDir = join(unifiedDocDir.toString, "scala") + if (scalaDir.exists()) FileUtils.forceDelete(scalaDir) + FileUtils.copyDirectory(unidocDir, scalaDir) + FileUtils.writeStringToFile(join(unifiedDocDir.toString, "index.html"), html, "utf-8") + uploadToBlob(unifiedDocDir.toString, version.value, "docs") } -val publishBlob = TaskKey[Unit]("publishBlob", "publish the library to mmlspark blob") -publishBlob := { - publishM2.value - val scalaVersionSuffix = scalaVersion.value.split(".".toCharArray.head).dropRight(1).mkString(".") - val nameAndScalaVersion = s"${name.value}_$scalaVersionSuffix" - - val localPackageFolder = join( - Seq(new File(new URI(Resolver.mavenLocal.root)).getAbsolutePath) - ++ organization.value.split(".".toCharArray.head) - ++ Seq(nameAndScalaVersion, version.value): _*).toString - - val blobMavenFolder = organization.value.replace(".", "/") + - s"/$nameAndScalaVersion/${version.value}" - uploadToBlob(localPackageFolder, blobMavenFolder, "maven") -} val release = TaskKey[Unit]("release", "publish the library to mmlspark blob") release := Def.taskDyn { @@ -355,11 +153,8 @@ publishBadges := { } val settings = Seq( - (scalastyleConfig in Test) := baseDirectory.value / "scalastyle-test-config.xml", + (scalastyleConfig in Test) := (ThisBuild / baseDirectory).value / "scalastyle-test-config.xml", logBuffered in Test := false, - buildInfoKeys := Seq[BuildInfoKey]( - name, version, scalaVersion, sbtVersion, - baseDirectory, datasetDir, pythonizedVersion, rVersion), parallelExecution in Test := false, test in assembly := {}, assemblyMergeStrategy in assembly := { @@ -367,14 +162,88 @@ val settings = Seq( case x => MergeStrategy.first }, assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false), - buildInfoPackage := "com.microsoft.ml.spark.build") - -lazy val mmlspark = (project in file(".")) - .enablePlugins(BuildInfoPlugin) - 
.enablePlugins(ScalaUnidocPlugin) - .settings(settings: _*) + buildInfoPackage := "com.microsoft.ml.spark.build", + autoAPIMappings := true, + publishMavenStyle := true, + pomPostProcess := pomPostFunc +) +ThisBuild / publishMavenStyle := true + +lazy val core = (project in file("core")) + .enablePlugins(BuildInfoPlugin && SbtPlugin) + .settings((settings ++ Seq( + libraryDependencies := dependencies, + buildInfoKeys += datasetDir, + name := "mmlspark-core" + )): _*) + +lazy val deepLearning = (project in file("deep-learning")) + .enablePlugins(BuildInfoPlugin && SbtPlugin) + .dependsOn(core % "test->test;compile->compile") + .settings((settings ++ Seq( + buildInfoKeys += datasetDir, + libraryDependencies := Seq("com.microsoft.cntk" % "cntk" % "2.4"), + name := "mmlspark-deep-learning" + )): _*) + +lazy val lightgbm = (project in file("lightgbm")) + .enablePlugins(BuildInfoPlugin && SbtPlugin) + .dependsOn(core % "test->test;compile->compile") + .settings((settings ++ Seq( + buildInfoKeys += datasetDir, + libraryDependencies := Seq("com.microsoft.ml.lightgbm" % "lightgbmlib" % "3.2.110"), + name := "mmlspark-lightgbm" + )): _*) + +lazy val vw = (project in file("vw")) + .enablePlugins(BuildInfoPlugin && SbtPlugin) + .dependsOn(core % "test->test;compile->compile") + .settings((settings ++ Seq( + buildInfoKeys += datasetDir, + libraryDependencies := Seq("com.github.vowpalwabbit" % "vw-jni" % "8.9.1"), + name := "mmlspark-vw" + )): _*) + +lazy val cognitive = (project in file("cognitive")) + .enablePlugins(BuildInfoPlugin && SbtPlugin) + .dependsOn(core % "test->test;compile->compile") + .settings((settings ++ Seq( + buildInfoKeys += datasetDir, + libraryDependencies := Seq("com.microsoft.cognitiveservices.speech" % "client-sdk" % "1.14.0"), + resolvers += speechResolver, + name := "mmlspark-cognitive" + )): _*) + +lazy val opencv = (project in file("opencv")) + .enablePlugins(BuildInfoPlugin && SbtPlugin) + .dependsOn(core % "test->test;compile->compile") + 
.settings((settings ++ Seq( + buildInfoKeys += datasetDir, + libraryDependencies := Seq("org.openpnp" % "opencv" % "3.2.0-1"), + name := "mmlspark-opencv" + )): _*) + +lazy val root = (project in file(".")) + .aggregate(core, deepLearning, cognitive, vw, lightgbm, opencv) + .dependsOn(core, deepLearning, cognitive, vw, lightgbm, opencv) + .enablePlugins(BuildInfoPlugin && ScalaUnidocPlugin && SbtPlugin) + .settings(settings ++ Seq( + name := "mmlspark", + buildInfoKeys += datasetDir, + genJarName := None, + publishPython / aggregate := false, + testPython / aggregate := false, + publishR / aggregate := false, + installPipPackage / aggregate := false, + )) -import xerial.sbt.Sonatype._ +val setupTask = TaskKey[Unit]("setup", "set up library for intellij") +setupTask := { + compile.all(ScopeFilter( + inProjects(core, deepLearning, cognitive, vw, lightgbm, opencv), + inConfigurations(Compile, Test))).value + getDatasetsTask.value +} sonatypeProjectHosting := Some( GitHubHosting("Azure", "MMLSpark", "mmlspark-support@microsot.com")) @@ -389,7 +258,6 @@ developers := List( ) licenses += ("MIT", url("https://github.com/Azure/mmlspark/blob/master/LICENSE")) -publishMavenStyle := true credentials += Credentials("Sonatype Nexus Repository Manager", "oss.sonatype.org", @@ -416,6 +284,4 @@ pgpPublicRing := { dynverSonatypeSnapshots in ThisBuild := true dynverSeparator in ThisBuild := "-" -publishTo := sonatypePublishToBundle.value - -// Break Cache - 1 +publishTo := sonatypePublishToBundle.value \ No newline at end of file diff --git a/src/main/python/mmlspark/cognitive/AzureSearchWriter.py b/cognitive/src/main/python/mmlspark/cognitive/AzureSearchWriter.py similarity index 100% rename from src/main/python/mmlspark/cognitive/AzureSearchWriter.py rename to cognitive/src/main/python/mmlspark/cognitive/AzureSearchWriter.py diff --git a/src/main/python/mmlspark/cognitive/BingImageSearch.py b/cognitive/src/main/python/mmlspark/cognitive/BingImageSearch.py similarity index 100% 
rename from src/main/python/mmlspark/cognitive/BingImageSearch.py rename to cognitive/src/main/python/mmlspark/cognitive/BingImageSearch.py diff --git a/src/__init__.py b/cognitive/src/main/python/mmlspark/cognitive/__init__.py similarity index 100% rename from src/__init__.py rename to cognitive/src/main/python/mmlspark/cognitive/__init__.py diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AnamolyDetection.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AnamolyDetection.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AnamolyDetection.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AnamolyDetection.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AnomalyDetectorSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AnomalyDetectorSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AnomalyDetectorSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AnomalyDetectorSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AudioStreams.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AudioStreams.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AudioStreams.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AudioStreams.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala similarity index 99% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala index 96024a68b63..b405bb13b09 100644 --- a/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala +++ b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala @@ -143,7 +143,8 @@ 
object AzureSearchWriter extends IndexParser with SLogging { val Logger: Logger = LogManager.getRootLogger - private def checkForErrors(fatal: Boolean)(errorRow: Row, inputRow: Row): Option[Row] = { + private def checkForErrors( + fatal: Boolean)(errorRow: Row, inputRow: Row): Option[Row] = { Option(errorRow).map { r => val message = s"Service Exception:\n\t ${r.toString()} \n for input:\n\t ${inputRow.toString()}" if (fatal) { diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchAPI.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchAPI.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchAPI.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchAPI.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/BingImageSearch.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/BingImageSearch.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/BingImageSearch.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/BingImageSearch.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceBase.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceBase.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceBase.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceBase.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceSchemas.scala 
b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVision.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVision.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVision.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVision.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVisionSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVisionSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVisionSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVisionSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/Face.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/Face.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/Face.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/Face.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/FaceSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/FaceSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/FaceSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/FaceSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/ImageSearchSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ImageSearchSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/ImageSearchSchemas.scala rename to 
cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ImageSearchSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/OCRSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/OCRSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/OCRSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/OCRSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/RESTHelpers.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/RESTHelpers.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/RESTHelpers.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/RESTHelpers.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechAPI.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechAPI.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/SpeechAPI.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechAPI.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/SpeechSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToText.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToText.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToText.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToText.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala similarity index 98% rename from 
src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala index 51a965b0d08..45447ac5f2d 100644 --- a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala +++ b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala @@ -8,15 +8,17 @@ import java.lang.ProcessBuilder.Redirect import java.net.{URI, URL} import java.util.UUID import java.util.concurrent.{LinkedBlockingQueue, TimeUnit} + import com.microsoft.cognitiveservices.speech._ import com.microsoft.cognitiveservices.speech.audio._ -import com.microsoft.cognitiveservices.speech.transcription.{Conversation, ConversationTranscriber, - ConversationTranscriptionEventArgs, Participant} +import com.microsoft.cognitiveservices.speech.transcription.{ + Conversation, ConversationTranscriber, ConversationTranscriptionEventArgs, Participant} import com.microsoft.cognitiveservices.speech.util.EventHandler import com.microsoft.ml.spark.build.BuildInfo import com.microsoft.ml.spark.cognitive.SpeechFormat._ import com.microsoft.ml.spark.core.contracts.HasOutputCol import com.microsoft.ml.spark.core.schema.{DatasetExtensions, SparkBindings} +import com.microsoft.ml.spark.core.utils.OsUtils import com.microsoft.ml.spark.io.http.HasURL import com.microsoft.ml.spark.logging.BasicLogging import com.microsoft.ml.spark.{CompressedStream, WavStream} @@ -36,10 +38,6 @@ import spray.json._ import scala.concurrent.{ExecutionContext, Future, blocking} import scala.language.existentials -object OsUtils { - val IsWindows: Boolean = System.getProperty("os.name").toLowerCase().indexOf("win") >= 0 -} - object SpeechToTextSDK extends ComplexParamsReadable[SpeechToTextSDK] private[ml] class BlockingQueueIterator[T](lbq: LinkedBlockingQueue[Option[T]], diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalytics.scala 
b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalytics.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalytics.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalytics.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemasV2.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemasV2.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemasV2.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemasV2.scala diff --git a/src/main/__init__.py b/cognitive/src/test/python/mmlsparktest/cognitive/__init__.py similarity index 100% rename from src/main/__init__.py rename to cognitive/src/test/python/mmlsparktest/cognitive/__init__.py diff --git a/src/test/python/mmlsparktest/cognitive/test_simple.py b/cognitive/src/test/python/mmlsparktest/cognitive/test_simple.py similarity index 100% rename from src/test/python/mmlsparktest/cognitive/test_simple.py rename to cognitive/src/test/python/mmlsparktest/cognitive/test_simple.py diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/AnamolyDetectionSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/AnamolyDetectionSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/AnamolyDetectionSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/AnamolyDetectionSuite.scala diff --git 
a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala similarity index 99% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala index 11a75834a4f..6255d9462b4 100644 --- a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala +++ b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala @@ -9,12 +9,10 @@ import com.microsoft.ml.spark.core.test.base.{Flaky, TestBase} import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} import org.apache.spark.ml.NamespaceInjections.pipelineModel import org.apache.spark.ml.util.MLReadable -import org.apache.spark.sql.functions.{corr, typedLit} +import org.apache.spark.sql.functions.typedLit import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.scalactic.Equality -import org.scalatest.Assertion import com.microsoft.ml.spark.FluentAPI._ -import com.microsoft.ml.spark.featurize.text.PageSplitter trait CognitiveKey { lazy val cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", Secrets.CognitiveApiKey) diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceAPI.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceAPI.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceAPI.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceAPI.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceSuite.scala rename to 
cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ImageSearchSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ImageSearchSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/ImageSearchSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ImageSearchSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/TextAnalyticsSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/TextAnalyticsSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/TextAnalyticsSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/TextAnalyticsSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SearchWriterSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SearchWriterSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split2/SearchWriterSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SearchWriterSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SpeechToTextSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SpeechToTextSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split2/SpeechToTextSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SpeechToTextSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split3/SpeechToTextSDKSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split3/SpeechToTextSDKSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split3/SpeechToTextSDKSuite.scala rename to 
cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split3/SpeechToTextSDKSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/utils/ModelEqualitySuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/core/utils/utils/ModelEqualitySuite.scala similarity index 94% rename from src/test/scala/com/microsoft/ml/spark/core/utils/ModelEqualitySuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/core/utils/utils/ModelEqualitySuite.scala index 620ab98aa28..d88d70d63af 100644 --- a/src/test/scala/com/microsoft/ml/spark/core/utils/ModelEqualitySuite.scala +++ b/cognitive/src/test/scala/com/microsoft/ml/spark/core/utils/utils/ModelEqualitySuite.scala @@ -1,11 +1,12 @@ // Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. -package com.microsoft.ml.spark.core.utils +package com.microsoft.ml.spark.core.utils.utils import com.microsoft.ml.spark.cognitive.TextSentiment import com.microsoft.ml.spark.core.env.FileUtilities.join import com.microsoft.ml.spark.core.test.base.TestBase +import com.microsoft.ml.spark.core.utils.ModelEquality import com.microsoft.ml.spark.stages.DropColumns class ModelEqualitySuite extends TestBase { diff --git a/src/main/python/LICENSE.txt b/core/src/main/python/LICENSE.txt similarity index 100% rename from src/main/python/LICENSE.txt rename to core/src/main/python/LICENSE.txt diff --git a/src/main/python/MANIFEST.in b/core/src/main/python/MANIFEST.in similarity index 100% rename from src/main/python/MANIFEST.in rename to core/src/main/python/MANIFEST.in diff --git a/src/main/python/__init__.py b/core/src/main/python/__init__.py similarity index 100% rename from src/main/python/__init__.py rename to core/src/main/python/__init__.py diff --git a/src/main/python/mmlspark/README.txt b/core/src/main/python/mmlspark/README.txt similarity index 100% rename from src/main/python/mmlspark/README.txt rename to 
core/src/main/python/mmlspark/README.txt diff --git a/src/main/python/mmlspark/__init__.py b/core/src/main/python/mmlspark/__init__.py similarity index 100% rename from src/main/python/mmlspark/__init__.py rename to core/src/main/python/mmlspark/__init__.py diff --git a/src/main/python/mmlspark/automl/BestModel.py b/core/src/main/python/mmlspark/automl/BestModel.py similarity index 100% rename from src/main/python/mmlspark/automl/BestModel.py rename to core/src/main/python/mmlspark/automl/BestModel.py diff --git a/src/main/python/mmlspark/automl/HyperparamBuilder.py b/core/src/main/python/mmlspark/automl/HyperparamBuilder.py similarity index 100% rename from src/main/python/mmlspark/automl/HyperparamBuilder.py rename to core/src/main/python/mmlspark/automl/HyperparamBuilder.py diff --git a/src/main/python/mmlspark/automl/TuneHyperparametersModel.py b/core/src/main/python/mmlspark/automl/TuneHyperparametersModel.py similarity index 100% rename from src/main/python/mmlspark/automl/TuneHyperparametersModel.py rename to core/src/main/python/mmlspark/automl/TuneHyperparametersModel.py diff --git a/src/main/python/mmlspark/automl/__init__.py b/core/src/main/python/mmlspark/automl/__init__.py similarity index 100% rename from src/main/python/mmlspark/automl/__init__.py rename to core/src/main/python/mmlspark/automl/__init__.py diff --git a/src/main/python/mmlspark/cntk/__init__.py b/core/src/main/python/mmlspark/core/__init__.py similarity index 100% rename from src/main/python/mmlspark/cntk/__init__.py rename to core/src/main/python/mmlspark/core/__init__.py diff --git a/src/main/python/mmlspark/core/schema/TypeConversionUtils.py b/core/src/main/python/mmlspark/core/schema/TypeConversionUtils.py similarity index 100% rename from src/main/python/mmlspark/core/schema/TypeConversionUtils.py rename to core/src/main/python/mmlspark/core/schema/TypeConversionUtils.py diff --git a/src/main/python/mmlspark/core/schema/Utils.py b/core/src/main/python/mmlspark/core/schema/Utils.py 
similarity index 100% rename from src/main/python/mmlspark/core/schema/Utils.py rename to core/src/main/python/mmlspark/core/schema/Utils.py diff --git a/src/main/python/mmlspark/cognitive/__init__.py b/core/src/main/python/mmlspark/core/schema/__init__.py similarity index 100% rename from src/main/python/mmlspark/cognitive/__init__.py rename to core/src/main/python/mmlspark/core/schema/__init__.py diff --git a/src/main/python/mmlspark/core/__init__.py b/core/src/main/python/mmlspark/core/serialize/__init__.py similarity index 100% rename from src/main/python/mmlspark/core/__init__.py rename to core/src/main/python/mmlspark/core/serialize/__init__.py diff --git a/src/main/python/mmlspark/core/serialize/java_params_patch.py b/core/src/main/python/mmlspark/core/serialize/java_params_patch.py similarity index 100% rename from src/main/python/mmlspark/core/serialize/java_params_patch.py rename to core/src/main/python/mmlspark/core/serialize/java_params_patch.py diff --git a/src/main/python/mmlspark/core/spark/FluentAPI.py b/core/src/main/python/mmlspark/core/spark/FluentAPI.py similarity index 100% rename from src/main/python/mmlspark/core/spark/FluentAPI.py rename to core/src/main/python/mmlspark/core/spark/FluentAPI.py diff --git a/src/main/python/mmlspark/core/schema/__init__.py b/core/src/main/python/mmlspark/core/spark/__init__.py similarity index 100% rename from src/main/python/mmlspark/core/schema/__init__.py rename to core/src/main/python/mmlspark/core/spark/__init__.py diff --git a/src/main/python/mmlspark/core/serialize/__init__.py b/core/src/main/python/mmlspark/cyber/__init__.py similarity index 100% rename from src/main/python/mmlspark/core/serialize/__init__.py rename to core/src/main/python/mmlspark/cyber/__init__.py diff --git a/src/main/python/mmlspark/core/spark/__init__.py b/core/src/main/python/mmlspark/cyber/anomaly/__init__.py similarity index 100% rename from src/main/python/mmlspark/core/spark/__init__.py rename to 
core/src/main/python/mmlspark/cyber/anomaly/__init__.py diff --git a/src/main/python/mmlspark/cyber/anomaly/collaborative_filtering.py b/core/src/main/python/mmlspark/cyber/anomaly/collaborative_filtering.py similarity index 100% rename from src/main/python/mmlspark/cyber/anomaly/collaborative_filtering.py rename to core/src/main/python/mmlspark/cyber/anomaly/collaborative_filtering.py diff --git a/src/main/python/mmlspark/cyber/anomaly/complement_access.py b/core/src/main/python/mmlspark/cyber/anomaly/complement_access.py similarity index 100% rename from src/main/python/mmlspark/cyber/anomaly/complement_access.py rename to core/src/main/python/mmlspark/cyber/anomaly/complement_access.py diff --git a/src/main/python/mmlspark/cyber/dataset.py b/core/src/main/python/mmlspark/cyber/dataset.py similarity index 100% rename from src/main/python/mmlspark/cyber/dataset.py rename to core/src/main/python/mmlspark/cyber/dataset.py diff --git a/src/main/python/mmlspark/cyber/__init__.py b/core/src/main/python/mmlspark/cyber/feature/__init__.py similarity index 100% rename from src/main/python/mmlspark/cyber/__init__.py rename to core/src/main/python/mmlspark/cyber/feature/__init__.py diff --git a/src/main/python/mmlspark/cyber/feature/indexers.py b/core/src/main/python/mmlspark/cyber/feature/indexers.py similarity index 100% rename from src/main/python/mmlspark/cyber/feature/indexers.py rename to core/src/main/python/mmlspark/cyber/feature/indexers.py diff --git a/src/main/python/mmlspark/cyber/feature/scalers.py b/core/src/main/python/mmlspark/cyber/feature/scalers.py similarity index 100% rename from src/main/python/mmlspark/cyber/feature/scalers.py rename to core/src/main/python/mmlspark/cyber/feature/scalers.py diff --git a/src/main/python/mmlspark/cyber/anomaly/__init__.py b/core/src/main/python/mmlspark/cyber/utils/__init__.py similarity index 100% rename from src/main/python/mmlspark/cyber/anomaly/__init__.py rename to 
core/src/main/python/mmlspark/cyber/utils/__init__.py diff --git a/src/main/python/mmlspark/cyber/utils/spark_utils.py b/core/src/main/python/mmlspark/cyber/utils/spark_utils.py similarity index 100% rename from src/main/python/mmlspark/cyber/utils/spark_utils.py rename to core/src/main/python/mmlspark/cyber/utils/spark_utils.py diff --git a/src/main/python/mmlspark/doc/conf.py b/core/src/main/python/mmlspark/doc/conf.py similarity index 100% rename from src/main/python/mmlspark/doc/conf.py rename to core/src/main/python/mmlspark/doc/conf.py diff --git a/src/main/python/mmlspark/doc/index.rst b/core/src/main/python/mmlspark/doc/index.rst similarity index 100% rename from src/main/python/mmlspark/doc/index.rst rename to core/src/main/python/mmlspark/doc/index.rst diff --git a/src/main/python/mmlspark/doc/scala.rst b/core/src/main/python/mmlspark/doc/scala.rst similarity index 100% rename from src/main/python/mmlspark/doc/scala.rst rename to core/src/main/python/mmlspark/doc/scala.rst diff --git a/src/main/python/mmlspark/downloader/ModelDownloader.py b/core/src/main/python/mmlspark/downloader/ModelDownloader.py similarity index 100% rename from src/main/python/mmlspark/downloader/ModelDownloader.py rename to core/src/main/python/mmlspark/downloader/ModelDownloader.py diff --git a/src/main/python/mmlspark/cyber/feature/__init__.py b/core/src/main/python/mmlspark/downloader/__init__.py similarity index 100% rename from src/main/python/mmlspark/cyber/feature/__init__.py rename to core/src/main/python/mmlspark/downloader/__init__.py diff --git a/src/main/python/mmlspark/io/IOImplicits.py b/core/src/main/python/mmlspark/io/IOImplicits.py similarity index 100% rename from src/main/python/mmlspark/io/IOImplicits.py rename to core/src/main/python/mmlspark/io/IOImplicits.py diff --git a/src/main/python/mmlspark/cyber/utils/__init__.py b/core/src/main/python/mmlspark/io/__init__.py similarity index 100% rename from src/main/python/mmlspark/cyber/utils/__init__.py rename to 
core/src/main/python/mmlspark/io/__init__.py diff --git a/src/main/python/mmlspark/io/binary/BinaryFileReader.py b/core/src/main/python/mmlspark/io/binary/BinaryFileReader.py similarity index 100% rename from src/main/python/mmlspark/io/binary/BinaryFileReader.py rename to core/src/main/python/mmlspark/io/binary/BinaryFileReader.py diff --git a/src/main/python/mmlspark/downloader/__init__.py b/core/src/main/python/mmlspark/io/binary/__init__.py similarity index 100% rename from src/main/python/mmlspark/downloader/__init__.py rename to core/src/main/python/mmlspark/io/binary/__init__.py diff --git a/src/main/python/mmlspark/io/http/HTTPFunctions.py b/core/src/main/python/mmlspark/io/http/HTTPFunctions.py similarity index 100% rename from src/main/python/mmlspark/io/http/HTTPFunctions.py rename to core/src/main/python/mmlspark/io/http/HTTPFunctions.py diff --git a/src/main/python/mmlspark/io/http/JSONOutputParser.py b/core/src/main/python/mmlspark/io/http/JSONOutputParser.py similarity index 100% rename from src/main/python/mmlspark/io/http/JSONOutputParser.py rename to core/src/main/python/mmlspark/io/http/JSONOutputParser.py diff --git a/src/main/python/mmlspark/io/http/ServingFunctions.py b/core/src/main/python/mmlspark/io/http/ServingFunctions.py similarity index 100% rename from src/main/python/mmlspark/io/http/ServingFunctions.py rename to core/src/main/python/mmlspark/io/http/ServingFunctions.py diff --git a/src/main/python/mmlspark/io/http/SimpleHTTPTransformer.py b/core/src/main/python/mmlspark/io/http/SimpleHTTPTransformer.py similarity index 100% rename from src/main/python/mmlspark/io/http/SimpleHTTPTransformer.py rename to core/src/main/python/mmlspark/io/http/SimpleHTTPTransformer.py diff --git a/src/main/python/mmlspark/image/__init__.py b/core/src/main/python/mmlspark/io/http/__init__.py similarity index 100% rename from src/main/python/mmlspark/image/__init__.py rename to core/src/main/python/mmlspark/io/http/__init__.py diff --git 
a/src/main/python/mmlspark/io/image/ImageUtils.py b/core/src/main/python/mmlspark/io/image/ImageUtils.py similarity index 100% rename from src/main/python/mmlspark/io/image/ImageUtils.py rename to core/src/main/python/mmlspark/io/image/ImageUtils.py diff --git a/src/main/python/mmlspark/io/__init__.py b/core/src/main/python/mmlspark/io/image/__init__.py similarity index 100% rename from src/main/python/mmlspark/io/__init__.py rename to core/src/main/python/mmlspark/io/image/__init__.py diff --git a/src/main/python/mmlspark/io/powerbi/PowerBIWriter.py b/core/src/main/python/mmlspark/io/powerbi/PowerBIWriter.py similarity index 100% rename from src/main/python/mmlspark/io/powerbi/PowerBIWriter.py rename to core/src/main/python/mmlspark/io/powerbi/PowerBIWriter.py diff --git a/src/main/python/mmlspark/io/binary/__init__.py b/core/src/main/python/mmlspark/io/powerbi/__init__.py similarity index 100% rename from src/main/python/mmlspark/io/binary/__init__.py rename to core/src/main/python/mmlspark/io/powerbi/__init__.py diff --git a/src/main/python/mmlspark/nn/ConditionalBallTree.py b/core/src/main/python/mmlspark/nn/ConditionalBallTree.py similarity index 100% rename from src/main/python/mmlspark/nn/ConditionalBallTree.py rename to core/src/main/python/mmlspark/nn/ConditionalBallTree.py diff --git a/src/main/python/mmlspark/io/http/__init__.py b/core/src/main/python/mmlspark/nn/__init__.py similarity index 100% rename from src/main/python/mmlspark/io/http/__init__.py rename to core/src/main/python/mmlspark/nn/__init__.py diff --git a/src/main/python/mmlspark/io/image/__init__.py b/core/src/main/python/mmlspark/plot/__init__.py similarity index 100% rename from src/main/python/mmlspark/io/image/__init__.py rename to core/src/main/python/mmlspark/plot/__init__.py diff --git a/src/main/python/mmlspark/plot/plot.py b/core/src/main/python/mmlspark/plot/plot.py similarity index 100% rename from src/main/python/mmlspark/plot/plot.py rename to 
core/src/main/python/mmlspark/plot/plot.py diff --git a/src/main/python/mmlspark/recommendation/RankingTrainValidationSplit.py b/core/src/main/python/mmlspark/recommendation/RankingTrainValidationSplit.py similarity index 100% rename from src/main/python/mmlspark/recommendation/RankingTrainValidationSplit.py rename to core/src/main/python/mmlspark/recommendation/RankingTrainValidationSplit.py diff --git a/src/main/python/mmlspark/recommendation/RankingTrainValidationSplitModel.py b/core/src/main/python/mmlspark/recommendation/RankingTrainValidationSplitModel.py similarity index 100% rename from src/main/python/mmlspark/recommendation/RankingTrainValidationSplitModel.py rename to core/src/main/python/mmlspark/recommendation/RankingTrainValidationSplitModel.py diff --git a/src/main/python/mmlspark/recommendation/SARModel.py b/core/src/main/python/mmlspark/recommendation/SARModel.py similarity index 100% rename from src/main/python/mmlspark/recommendation/SARModel.py rename to core/src/main/python/mmlspark/recommendation/SARModel.py diff --git a/src/main/python/mmlspark/recommendation/__init__.py b/core/src/main/python/mmlspark/recommendation/__init__.py similarity index 100% rename from src/main/python/mmlspark/recommendation/__init__.py rename to core/src/main/python/mmlspark/recommendation/__init__.py diff --git a/src/main/python/mmlspark/stages/UDFTransformer.py b/core/src/main/python/mmlspark/stages/UDFTransformer.py similarity index 100% rename from src/main/python/mmlspark/stages/UDFTransformer.py rename to core/src/main/python/mmlspark/stages/UDFTransformer.py diff --git a/src/main/python/mmlspark/io/powerbi/__init__.py b/core/src/main/python/mmlspark/stages/__init__.py similarity index 100% rename from src/main/python/mmlspark/io/powerbi/__init__.py rename to core/src/main/python/mmlspark/stages/__init__.py diff --git a/src/main/scala/com/microsoft/ml/spark/automl/DefaultHyperparams.scala 
b/core/src/main/scala/com/microsoft/ml/spark/automl/DefaultHyperparams.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/DefaultHyperparams.scala rename to core/src/main/scala/com/microsoft/ml/spark/automl/DefaultHyperparams.scala diff --git a/src/main/scala/com/microsoft/ml/spark/automl/EvaluationUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/automl/EvaluationUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/EvaluationUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/automl/EvaluationUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.scala b/core/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.scala rename to core/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.scala diff --git a/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.txt b/core/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.txt rename to core/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.txt diff --git a/src/main/scala/com/microsoft/ml/spark/automl/HyperparamBuilder.scala b/core/src/main/scala/com/microsoft/ml/spark/automl/HyperparamBuilder.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/HyperparamBuilder.scala rename to core/src/main/scala/com/microsoft/ml/spark/automl/HyperparamBuilder.scala diff --git a/src/main/scala/com/microsoft/ml/spark/automl/ParamSpace.scala b/core/src/main/scala/com/microsoft/ml/spark/automl/ParamSpace.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/ParamSpace.scala rename to core/src/main/scala/com/microsoft/ml/spark/automl/ParamSpace.scala diff --git a/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.scala 
b/core/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.scala rename to core/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.scala diff --git a/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.txt b/core/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.txt rename to core/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.txt diff --git a/src/test/scala/com/microsoft/ml/spark/codegen/CodeGen.scala b/core/src/main/scala/com/microsoft/ml/spark/codegen/CodeGen.scala similarity index 53% rename from src/test/scala/com/microsoft/ml/spark/codegen/CodeGen.scala rename to core/src/main/scala/com/microsoft/ml/spark/codegen/CodeGen.scala index 67d667e339e..f573880d07f 100644 --- a/src/test/scala/com/microsoft/ml/spark/codegen/CodeGen.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/codegen/CodeGen.scala @@ -4,14 +4,13 @@ package com.microsoft.ml.spark.codegen import java.io.File + import com.microsoft.ml.spark.build.BuildInfo import com.microsoft.ml.spark.codegen.Config._ import com.microsoft.ml.spark.core.env.FileUtilities._ -import com.microsoft.ml.spark.core.test.base.TestBase -import com.microsoft.ml.spark.core.test.fuzzing.PyTestFuzzing -import com.microsoft.ml.spark.core.utils.JarLoadingUtils.instantiateServices import org.apache.commons.io.FileUtils import org.apache.commons.io.FilenameUtils._ +import com.microsoft.ml.spark.core.utils.JarLoadingUtils.instantiateServices object CodeGenUtils { def clean(dir: File): Unit = if (dir.exists()) FileUtils.forceDelete(dir) @@ -23,14 +22,18 @@ object CodeGen { import CodeGenUtils._ - def generatePythonClasses(): Unit = { - instantiateServices[PythonWrappable].foreach { w => + def generatePythonClasses(jarName: Option[String]): Unit = { + val 
instantiatedClasses = instantiateServices[PythonWrappable](jarName) + instantiatedClasses.foreach { w => + println(w.getClass.getName) w.makePyFile() } } - def generateRClasses(): Unit = { - instantiateServices[RWrappable].foreach { w => + def generateRClasses(jarName: Option[String]): Unit = { + val instantiatedClasses = instantiateServices[RWrappable](jarName) + instantiatedClasses.foreach { w => + println(w.getClass.getName) w.makeRFile() } } @@ -57,7 +60,7 @@ object CodeGen { RSrcDir.mkdirs() writeFile(new File(RSrcDir.getParentFile, "DESCRIPTION"), - s"""|Package: mmlspark + s"""|Package: ${BuildInfo.name} |Title: Access to MMLSpark via R |Description: Provides an interface to MMLSpark. |Version: ${BuildInfo.rVersion} @@ -82,7 +85,7 @@ object CodeGen { | spark_dependency( | jars = c(), | packages = c( - | sprintf("com.microsoft.ml.spark:mmlspark_%s:${BuildInfo.version}", scala_version) + | sprintf("com.microsoft.ml.spark:${BuildInfo.name}_%s:${BuildInfo.version}", scala_version) | ), | repositories = c("https://mmlspark.azureedge.net/maven") | ) @@ -118,59 +121,90 @@ object CodeGen { } - def rGen(): Unit = { + //noinspection ScalaStyle + def generatePyPackageData(): Unit = { + if (!PySrcDir.exists()){ + PySrcDir.mkdir() + } + writeFile(join(PySrcDir, "setup.py"), + s""" + |# Copyright (C) Microsoft Corporation. All rights reserved. + |# Licensed under the MIT License. See LICENSE in project root for information. 
+ | + |import os + |from setuptools import setup, find_packages + |import codecs + |import os.path + | + | + |def read(rel_path): + | here = os.path.abspath(os.path.dirname(__file__)) + | with codecs.open(os.path.join(here, rel_path), "r") as fp: + | return fp.read() + | + | + |def get_version(rel_path): + | for line in read(rel_path).splitlines(): + | if line.startswith("__version__"): + | delim = '"' if '"' in line else "'" + | return line.split(delim)[1] + | return "0.0.0" + | + | + |setup( + | name="${BuildInfo.name}", + | version=get_version("mmlspark/__init__.py"), + | description="Microsoft ML for Spark", + | long_description="Microsoft ML for Apache Spark contains Microsoft's open source " + | + "contributions to the Apache Spark ecosystem", + | license="MIT", + | packages=find_packages(), + | url="https://github.com/Azure/mmlspark", + | author="Microsoft", + | author_email="mmlspark-support@microsoft.com", + | classifiers=[ + | "Development Status :: 3 - Alpha", + | "Intended Audience :: Developers", + | "Intended Audience :: Data Scientists", + | "Topic :: Software Development :: Datascience Tools", + | "License :: OSI Approved :: MIT License", + | "Programming Language :: Python :: 2", + | "Programming Language :: Python :: 3", + | ], + | zip_safe=True, + | package_data={"mmlspark": ["../LICENSE.txt", "../README.txt"]}, + |) + | + |""".stripMargin) + } + + + def rGen(jarName: Option[String]): Unit = { + println(s"Generating R for ${jarName}") clean(RSrcRoot) generateRPackageData() - generateRClasses() - FileUtils.copyDirectoryToDirectory(toDir(RSrcOverrideDir), toDir(RSrcDir)) - FileUtils.copyDirectoryToDirectory(toDir(RTestOverrideDir), toDir(RTestDir)) + generateRClasses(jarName) + if (RSrcOverrideDir.exists()) + FileUtils.copyDirectoryToDirectory(toDir(RSrcOverrideDir), toDir(RSrcDir)) + if (RTestOverrideDir.exists()) + FileUtils.copyDirectoryToDirectory(toDir(RTestOverrideDir), toDir(RTestDir)) } - def pyGen(): Unit = { + def pyGen(jarName: 
Option[String]): Unit = { + println(s"Generating python for ${jarName}") clean(PySrcDir) - generatePythonClasses() - TestBase.stopSparkSession() - FileUtils.copyDirectoryToDirectory(toDir(PySrcOverrideDir), toDir(PySrcDir)) + generatePyPackageData() + generatePythonClasses(jarName) + if (PySrcOverrideDir.exists()) + FileUtils.copyDirectoryToDirectory(toDir(PySrcOverrideDir), toDir(PySrcDir)) makeInitFiles() } def main(args: Array[String]): Unit = { clean(PackageDir) - rGen() - pyGen() + rGen(args.headOption) + pyGen(args.headOption) } } -object TestGen { - - import CodeGenUtils._ - - def generatePythonTests(): Unit = { - instantiateServices[PyTestFuzzing[_]].foreach { ltc => - try { - ltc.makePyTestFile() - } catch { - case _: NotImplementedError => - println(s"ERROR: Could not generate test for ${ltc.testClassName} because of Complex Parameters") - } - } - } - - private def makeInitFiles(packageFolder: String = ""): Unit = { - val dir = new File(new File(PyTestDir, "mmlsparktest"), packageFolder) - writeFile(new File(dir, "__init__.py"), "") - dir.listFiles().filter(_.isDirectory).foreach(f => - makeInitFiles(packageFolder + "/" + f.getName) - ) - } - - def main(args: Array[String]): Unit = { - clean(TestDataDir) - clean(PyTestDir) - generatePythonTests() - TestBase.stopSparkSession() - FileUtils.copyDirectoryToDirectory(toDir(PyTestOverrideDir), toDir(PyTestDir)) - makeInitFiles() - } -} diff --git a/src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala b/core/src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala similarity index 56% rename from src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala rename to core/src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala index 03785cbd8c9..c473aec0d0c 100644 --- a/src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala @@ -8,15 +8,14 @@ import java.io.File import 
com.microsoft.ml.spark.build.BuildInfo object Config { - val DebugMode = sys.env.getOrElse("DEBUGMODE", "").trim.toLowerCase == "true" + val DebugMode: Boolean = sys.env.getOrElse("DEBUGMODE", "").trim.toLowerCase == "true" - val TopDir = BuildInfo.baseDirectory - val Version = BuildInfo.version - val PackageName = BuildInfo.name - val TargetDir = new File(TopDir, s"target/scala-${BuildInfo.scalaVersion.slice(0,4)}") + val TopDir: File = BuildInfo.baseDirectory + val Version: String = BuildInfo.version + val PackageName: String = BuildInfo.name val ScalaSrcDir = "src/main/scala" - val GeneratedDir = new File(TargetDir, "generated") + val GeneratedDir = new File(BuildInfo.targetDir, "generated") val PackageDir = new File(GeneratedDir, "package") val SrcDir = new File(GeneratedDir, "src") val TestDir = new File(GeneratedDir, "test") @@ -42,9 +41,9 @@ object Config { //val rPackageFile = new File(rPackageDir, s"mmlspark-$mmlVer.zip") val InternalPrefix = "_" - val ScopeDepth = " " * 4 + val ScopeDepth: String = " " * 4 - val CopyrightLines = + val CopyrightLines: String = s"""|# Copyright (C) Microsoft Corporation. All rights reserved. |# Licensed under the MIT License. See LICENSE in project root for information. |""".stripMargin @@ -54,13 +53,19 @@ object Config { s"""|$CopyrightLines | |"\"" - |MicrosoftML is a library of Python classes to interface with the - |Microsoft scala APIs to utilize Apache Spark to create distibuted - |machine learning models. + |MMLSpark is an ecosystem of tools aimed towards expanding the distributed computing framework + |Apache Spark in several new directions. MMLSpark adds many deep learning and data science tools to the Spark + |ecosystem, including seamless integration of Spark Machine Learning pipelines with + |Microsoft Cognitive Toolkit (CNTK), LightGBM and OpenCV. These tools enable powerful and + |highly-scalable predictive and analytical models for a variety of datasources. 
| - |MicrosoftML simplifies training and scoring classifiers and - |regressors, as well as facilitating the creation of models using the - |CNTK library, images, and text. + |MMLSpark also brings new networking capabilities to the Spark Ecosystem. With the HTTP on Spark project, + |users can embed any web service into their SparkML models. In this vein, MMLSpark provides easy to use SparkML + |transformers for a wide variety of Microsoft Cognitive Services. For production grade deployment, + |the Spark Serving project enables high throughput, sub-millisecond latency web services, + |backed by your Spark cluster. + | + |MMLSpark requires Scala 2.12, Spark 3.0+, and Python 3.5+. |"\"" | |__version__ = "${BuildInfo.pythonizedVersion}" diff --git a/src/main/scala/com/microsoft/ml/spark/codegen/GenerationUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/codegen/GenerationUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/codegen/GenerationUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/codegen/GenerationUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/codegen/Wrappable.scala b/core/src/main/scala/com/microsoft/ml/spark/codegen/Wrappable.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/codegen/Wrappable.scala rename to core/src/main/scala/com/microsoft/ml/spark/codegen/Wrappable.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/contracts/Metrics.scala b/core/src/main/scala/com/microsoft/ml/spark/core/contracts/Metrics.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/contracts/Metrics.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/contracts/Metrics.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/contracts/Params.scala b/core/src/main/scala/com/microsoft/ml/spark/core/contracts/Params.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/contracts/Params.scala rename to 
core/src/main/scala/com/microsoft/ml/spark/core/contracts/Params.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/env/FileUtilities.scala b/core/src/main/scala/com/microsoft/ml/spark/core/env/FileUtilities.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/env/FileUtilities.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/env/FileUtilities.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/env/NativeLoader.java b/core/src/main/scala/com/microsoft/ml/spark/core/env/NativeLoader.java similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/env/NativeLoader.java rename to core/src/main/scala/com/microsoft/ml/spark/core/env/NativeLoader.java diff --git a/src/main/scala/com/microsoft/ml/spark/core/env/StreamUtilities.scala b/core/src/main/scala/com/microsoft/ml/spark/core/env/StreamUtilities.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/env/StreamUtilities.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/env/StreamUtilities.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/metrics/MetricConstants.scala b/core/src/main/scala/com/microsoft/ml/spark/core/metrics/MetricConstants.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/metrics/MetricConstants.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/metrics/MetricConstants.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/metrics/MetricUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/metrics/MetricUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/metrics/MetricUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/metrics/MetricUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/schema/BinaryFileSchema.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/BinaryFileSchema.scala similarity index 100% rename from 
src/main/scala/com/microsoft/ml/spark/core/schema/BinaryFileSchema.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/BinaryFileSchema.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/schema/Categoricals.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/Categoricals.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/Categoricals.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/Categoricals.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/schema/DatasetExtensions.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/DatasetExtensions.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/DatasetExtensions.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/DatasetExtensions.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/schema/ImageSchemaUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/ImageSchemaUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/ImageSchemaUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/ImageSchemaUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/schema/SchemaConstants.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/SchemaConstants.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/SchemaConstants.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/SchemaConstants.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/schema/SparkBindings.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/SparkBindings.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/SparkBindings.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/SparkBindings.scala diff --git 
a/src/main/scala/com/microsoft/ml/spark/core/schema/SparkSchema.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/SparkSchema.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/SparkSchema.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/SparkSchema.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/serialize/ComplexParam.scala b/core/src/main/scala/com/microsoft/ml/spark/core/serialize/ComplexParam.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/serialize/ComplexParam.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/serialize/ComplexParam.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/spark/FluentAPI.scala b/core/src/main/scala/com/microsoft/ml/spark/core/spark/FluentAPI.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/spark/FluentAPI.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/spark/FluentAPI.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/AsyncUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/AsyncUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/utils/AsyncUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/AsyncUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/CastUtilities.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/CastUtilities.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/utils/CastUtilities.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/CastUtilities.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala similarity index 99% rename from src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala rename to 
core/src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala index 297dba1de68..db8e39cd033 100644 --- a/src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala @@ -6,7 +6,7 @@ package com.microsoft.ml.spark.core.utils import java.net.InetAddress import org.apache.http.conn.util.InetAddressUtils -import org.apache.spark.lightgbm.BlockManagerUtils +import org.apache.spark.injections.BlockManagerUtils import org.apache.spark.sql.{Dataset, SparkSession} import org.slf4j.Logger diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/ContextObjectInputStream.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/ContextObjectInputStream.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/utils/ContextObjectInputStream.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/ContextObjectInputStream.scala diff --git a/core/src/main/scala/com/microsoft/ml/spark/core/utils/FaultToleranceUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/FaultToleranceUtils.scala new file mode 100644 index 00000000000..0afa793dcbd --- /dev/null +++ b/core/src/main/scala/com/microsoft/ml/spark/core/utils/FaultToleranceUtils.scala @@ -0,0 +1,33 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark.core.utils + +import scala.concurrent.duration.Duration +import scala.concurrent.{Await, ExecutionContext, Future} + +object FaultToleranceUtils { + def retryWithTimeout[T](times: Int, timeout: Duration)(f: => T): T ={ + try { + Await.result(Future(f)(ExecutionContext.global), timeout) + } catch { + case e: Exception if times >= 1 => + print(s"Received exception on call, retrying: $e") + retryWithTimeout(times-1, timeout)(f) + } + } + + val Backoffs: Seq[Int] = Seq(0, 100, 200, 500) + + def retryWithTimeout[T](times: Seq[Int] = Backoffs)(f: => T): T ={ + try { + f + } catch { + case e: Exception if times.nonEmpty => + println(s"Received exception on call, retrying: $e") + Thread.sleep(times.head) + retryWithTimeout(times.tail)(f) + } + } + +} \ No newline at end of file diff --git a/src/test/scala/com/microsoft/ml/spark/core/utils/JarLoadingUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/JarLoadingUtils.scala similarity index 70% rename from src/test/scala/com/microsoft/ml/spark/core/utils/JarLoadingUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/JarLoadingUtils.scala index dba98c4a595..3e906d66d7d 100644 --- a/src/test/scala/com/microsoft/ml/spark/core/utils/JarLoadingUtils.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/core/utils/JarLoadingUtils.scala @@ -1,15 +1,11 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - package com.microsoft.ml.spark.core.utils import java.lang.reflect.Modifier import com.microsoft.ml.spark.codegen.Wrappable import org.sparkproject.guava.reflect.ClassPath - import scala.collection.JavaConverters._ -import scala.reflect.{ClassTag, _} +import scala.reflect.{ClassTag, classTag} /** Contains logic for loading classes. 
*/ object JarLoadingUtils { @@ -41,22 +37,25 @@ object JarLoadingUtils { AllClasses.filter(classOf[Wrappable].isAssignableFrom(_)) } - def instantiateServices[T: ClassTag](instantiate: Class[_] => Any): List[T] = { + def instantiateServices[T: ClassTag](instantiate: Class[_] => Any, jarName: Option[String]): List[T] = { AllClasses .filter(classTag[T].runtimeClass.isAssignableFrom(_)) + .filter(c => jarName.forall(c.getResource(c.getSimpleName + ".class").toString.contains(_))) .filter(clazz => !Modifier.isAbstract(clazz.getModifiers)) .map(instantiate(_)).asInstanceOf[List[T]] } - def instantiateServices[T: ClassTag]: List[T] = instantiateServices[T] { + def instantiateServices[T: ClassTag](jarName: Option[String] = None): List[T] = instantiateServices[T]({ clazz: Class[_] => clazz.getConstructor().newInstance() - } + }, jarName) - def instantiateObjects[T: ClassTag]: List[T] = instantiateServices[T] { clazz: Class[_] => { - val cons = clazz.getDeclaredConstructors()(0) - cons.setAccessible(true) - cons.newInstance() - }} + def instantiateObjects[T: ClassTag](jarName: Option[String] = None): List[T] = instantiateServices[T]( + { clazz: Class[_] => { + val cons = clazz.getDeclaredConstructors()(0) + cons.setAccessible(true) + cons.newInstance() + } + }, + jarName) } - diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/ModelEquality.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/ModelEquality.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/utils/ModelEquality.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/ModelEquality.scala diff --git a/core/src/main/scala/com/microsoft/ml/spark/core/utils/OsUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/OsUtils.scala new file mode 100644 index 00000000000..4217eaa58a8 --- /dev/null +++ b/core/src/main/scala/com/microsoft/ml/spark/core/utils/OsUtils.scala @@ -0,0 +1,8 @@ +// Copyright (C) Microsoft Corporation. 
All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.core.utils + +object OsUtils { + val IsWindows: Boolean = System.getProperty("os.name").toLowerCase().indexOf("win") >= 0 +} \ No newline at end of file diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/RowUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/RowUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/utils/RowUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/RowUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/StopWatch.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/StopWatch.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/utils/StopWatch.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/StopWatch.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.txt diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/CountSelector.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/CountSelector.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/CountSelector.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/CountSelector.scala diff --git 
a/src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.txt diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/Featurize.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/Featurize.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/Featurize.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/Featurize.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/Featurize.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/Featurize.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/Featurize.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/Featurize.txt diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.txt diff --git 
a/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.txt diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexerModel.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexerModel.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexerModel.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexerModel.txt diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/text/MultiNGram.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/text/MultiNGram.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/text/MultiNGram.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/text/MultiNGram.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/text/PageSplitter.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/text/PageSplitter.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/text/PageSplitter.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/text/PageSplitter.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.scala rename to 
core/src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.txt diff --git a/src/main/scala/com/microsoft/ml/spark/image/ResizeImageTransformer.scala b/core/src/main/scala/com/microsoft/ml/spark/image/ResizeImageTransformer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/image/ResizeImageTransformer.scala rename to core/src/main/scala/com/microsoft/ml/spark/image/ResizeImageTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/image/UnrollImage.scala b/core/src/main/scala/com/microsoft/ml/spark/image/UnrollImage.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/image/UnrollImage.scala rename to core/src/main/scala/com/microsoft/ml/spark/image/UnrollImage.scala diff --git a/src/main/scala/com/microsoft/ml/spark/image/UnrollImage.txt b/core/src/main/scala/com/microsoft/ml/spark/image/UnrollImage.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/image/UnrollImage.txt rename to core/src/main/scala/com/microsoft/ml/spark/image/UnrollImage.txt diff --git a/src/main/scala/com/microsoft/ml/spark/io/IOImplicits.scala b/core/src/main/scala/com/microsoft/ml/spark/io/IOImplicits.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/IOImplicits.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/IOImplicits.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/binary/Binary.scala b/core/src/main/scala/com/microsoft/ml/spark/io/binary/Binary.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/binary/Binary.scala rename to 
core/src/main/scala/com/microsoft/ml/spark/io/binary/Binary.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileFormat.scala b/core/src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileFormat.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileFormat.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileFormat.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileReader.scala b/core/src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileReader.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileReader.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileReader.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/binary/KeyValueReaderIterator.scala b/core/src/main/scala/com/microsoft/ml/spark/io/binary/KeyValueReaderIterator.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/binary/KeyValueReaderIterator.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/binary/KeyValueReaderIterator.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/Clients.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/Clients.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/Clients.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/Clients.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/HTTPClients.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/HTTPClients.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/HTTPClients.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/HTTPClients.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/HTTPSchema.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/HTTPSchema.scala similarity index 100% rename from 
src/main/scala/com/microsoft/ml/spark/io/http/HTTPSchema.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/HTTPSchema.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/HTTPTransformer.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/HTTPTransformer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/HTTPTransformer.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/HTTPTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/Parsers.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/Parsers.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/Parsers.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/Parsers.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/PortForwarding.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/PortForwarding.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/PortForwarding.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/PortForwarding.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/SharedVariable.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/SharedVariable.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/SharedVariable.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/SharedVariable.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/SimpleHTTPTransformer.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/SimpleHTTPTransformer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/SimpleHTTPTransformer.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/SimpleHTTPTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/image/ImageUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/io/image/ImageUtils.scala similarity index 100% 
rename from src/main/scala/com/microsoft/ml/spark/io/image/ImageUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/image/ImageUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/powerbi/PowerBIWriter.scala b/core/src/main/scala/com/microsoft/ml/spark/io/powerbi/PowerBIWriter.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/powerbi/PowerBIWriter.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/powerbi/PowerBIWriter.scala diff --git a/src/main/scala/com/microsoft/ml/spark/isolationforest/IsolationForest.scala b/core/src/main/scala/com/microsoft/ml/spark/isolationforest/IsolationForest.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/isolationforest/IsolationForest.scala rename to core/src/main/scala/com/microsoft/ml/spark/isolationforest/IsolationForest.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lime/BreezeUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/lime/BreezeUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lime/BreezeUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/lime/BreezeUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lime/LIME.scala b/core/src/main/scala/com/microsoft/ml/spark/lime/LIME.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lime/LIME.scala rename to core/src/main/scala/com/microsoft/ml/spark/lime/LIME.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lime/Superpixel.scala b/core/src/main/scala/com/microsoft/ml/spark/lime/Superpixel.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lime/Superpixel.scala rename to core/src/main/scala/com/microsoft/ml/spark/lime/Superpixel.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lime/SuperpixelTransformer.scala b/core/src/main/scala/com/microsoft/ml/spark/lime/SuperpixelTransformer.scala similarity index 100% rename from 
src/main/scala/com/microsoft/ml/spark/lime/SuperpixelTransformer.scala rename to core/src/main/scala/com/microsoft/ml/spark/lime/SuperpixelTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lime/TextLIME.scala b/core/src/main/scala/com/microsoft/ml/spark/lime/TextLIME.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lime/TextLIME.scala rename to core/src/main/scala/com/microsoft/ml/spark/lime/TextLIME.scala diff --git a/src/main/scala/com/microsoft/ml/spark/logging/BasicLogging.scala b/core/src/main/scala/com/microsoft/ml/spark/logging/BasicLogging.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/logging/BasicLogging.scala rename to core/src/main/scala/com/microsoft/ml/spark/logging/BasicLogging.scala diff --git a/src/main/scala/com/microsoft/ml/spark/nn/BallTree.scala b/core/src/main/scala/com/microsoft/ml/spark/nn/BallTree.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/nn/BallTree.scala rename to core/src/main/scala/com/microsoft/ml/spark/nn/BallTree.scala diff --git a/src/main/scala/com/microsoft/ml/spark/nn/BoundedPriorityQueue.scala b/core/src/main/scala/com/microsoft/ml/spark/nn/BoundedPriorityQueue.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/nn/BoundedPriorityQueue.scala rename to core/src/main/scala/com/microsoft/ml/spark/nn/BoundedPriorityQueue.scala diff --git a/src/main/scala/com/microsoft/ml/spark/nn/ConditionalKNN.scala b/core/src/main/scala/com/microsoft/ml/spark/nn/ConditionalKNN.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/nn/ConditionalKNN.scala rename to core/src/main/scala/com/microsoft/ml/spark/nn/ConditionalKNN.scala diff --git a/src/main/scala/com/microsoft/ml/spark/nn/KNN.scala b/core/src/main/scala/com/microsoft/ml/spark/nn/KNN.scala similarity index 98% rename from src/main/scala/com/microsoft/ml/spark/nn/KNN.scala rename to 
core/src/main/scala/com/microsoft/ml/spark/nn/KNN.scala index a4c3973a794..2acde7942bf 100644 --- a/src/main/scala/com/microsoft/ml/spark/nn/KNN.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/nn/KNN.scala @@ -84,7 +84,7 @@ class KNNModel(val uid: String) extends Model[KNNModel] private var broadcastedModelOption: Option[Broadcast[BallTree[_]]] = None val ballTree = new BallTreeParam(this, "ballTree", - "the ballTree model used for perfoming queries", { _ => true }) + "the ballTree model used for performing queries", { _ => true }) def getBallTree: BallTree[_] = $(ballTree) diff --git a/src/main/scala/com/microsoft/ml/spark/nn/Schemas.scala b/core/src/main/scala/com/microsoft/ml/spark/nn/Schemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/nn/Schemas.scala rename to core/src/main/scala/com/microsoft/ml/spark/nn/Schemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/RankingAdapter.scala b/core/src/main/scala/com/microsoft/ml/spark/recommendation/RankingAdapter.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/RankingAdapter.scala rename to core/src/main/scala/com/microsoft/ml/spark/recommendation/RankingAdapter.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/RankingEvaluator.scala b/core/src/main/scala/com/microsoft/ml/spark/recommendation/RankingEvaluator.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/RankingEvaluator.scala rename to core/src/main/scala/com/microsoft/ml/spark/recommendation/RankingEvaluator.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSplit.scala b/core/src/main/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSplit.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSplit.scala rename to 
core/src/main/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSplit.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexer.scala b/core/src/main/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexer.scala rename to core/src/main/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/SAR.scala b/core/src/main/scala/com/microsoft/ml/spark/recommendation/SAR.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/SAR.scala rename to core/src/main/scala/com/microsoft/ml/spark/recommendation/SAR.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/SARModel.scala b/core/src/main/scala/com/microsoft/ml/spark/recommendation/SARModel.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/SARModel.scala rename to core/src/main/scala/com/microsoft/ml/spark/recommendation/SARModel.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/recommendation.txt b/core/src/main/scala/com/microsoft/ml/spark/recommendation/recommendation.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/recommendation.txt rename to core/src/main/scala/com/microsoft/ml/spark/recommendation/recommendation.txt diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Batchers.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/Batchers.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/Batchers.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/Batchers.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Cacher.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/Cacher.scala similarity index 100% rename from 
src/main/scala/com/microsoft/ml/spark/stages/Cacher.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/Cacher.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/ClassBalancer.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/ClassBalancer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/ClassBalancer.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/ClassBalancer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/DropColumns.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/DropColumns.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/DropColumns.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/DropColumns.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.txt b/core/src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.txt rename to core/src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.txt diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Explode.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/Explode.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/Explode.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/Explode.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Lambda.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/Lambda.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/Lambda.scala rename to 
core/src/main/scala/com/microsoft/ml/spark/stages/Lambda.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/MiniBatchTransformer.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/MiniBatchTransformer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/MiniBatchTransformer.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/MiniBatchTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.txt b/core/src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.txt rename to core/src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.txt diff --git a/src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala similarity index 96% rename from src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala index 0e05283c7ba..2e5d435bf14 100644 --- a/src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala @@ -1,6 +1,3 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. 
- package com.microsoft.ml.spark.stages import java.util.concurrent.LinkedBlockingQueue @@ -19,6 +16,35 @@ import scala.concurrent.blocking object PartitionConsolidator extends DefaultParamsReadable[PartitionConsolidator] +class PartitionConsolidator(val uid: String) + extends Transformer with HTTPParams with HasInputCol + with HasOutputCol + with ComplexParamsWritable with BasicLogging { + logClass() + + def this() = this(Identifiable.randomUID("PartitionConsolidator")) + + val consolidatorHolder = SharedSingleton { + new Consolidator[Row]() + } + + override def transform(dataset: Dataset[_]): DataFrame = { + logTransform[DataFrame]({ + dataset.toDF().mapPartitions { it => + if (it.hasNext) { + consolidatorHolder.get.registerAndReceive(it).flatten + } else { + Iterator() + } + }(RowEncoder(dataset.schema)) + }) + } + + override def copy(extra: ParamMap): Transformer = defaultCopy(extra) + + override def transformSchema(schema: StructType): StructType = schema +} + class Consolidator[T] { val buffer = new LinkedBlockingQueue[T]() @@ -108,36 +134,8 @@ class Consolidator[T] { } -class PartitionConsolidator(val uid: String) - extends Transformer with HTTPParams with HasInputCol - with HasOutputCol - with ComplexParamsWritable with BasicLogging { - logClass() - - def this() = this(Identifiable.randomUID("PartitionConsolidator")) - - val consolidatorHolder = SharedSingleton { - new Consolidator[Row]() - } - - override def transform(dataset: Dataset[_]): DataFrame = { - logTransform[DataFrame]({ - dataset.toDF().mapPartitions { it => - if (it.hasNext) { - consolidatorHolder.get.registerAndReceive(it).flatten - } else { - Iterator() - } - }(RowEncoder(dataset.schema)) - }) - } - - override def copy(extra: ParamMap): Transformer = defaultCopy(extra) - - override def transformSchema(schema: StructType): StructType = schema -} - trait LocalAggregator[T] { def prep(iter: Iterator[Row]): T + def merge(ts: Seq[T]): T } diff --git 
a/src/main/scala/com/microsoft/ml/spark/stages/RenameColumn.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/RenameColumn.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/RenameColumn.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/RenameColumn.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Repartition.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/Repartition.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/Repartition.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/Repartition.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Repartition.txt b/core/src/main/scala/com/microsoft/ml/spark/stages/Repartition.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/Repartition.txt rename to core/src/main/scala/com/microsoft/ml/spark/stages/Repartition.txt diff --git a/src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.txt b/core/src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.txt rename to core/src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.txt diff --git a/src/main/scala/com/microsoft/ml/spark/stages/StratifiedRepartition.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/StratifiedRepartition.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/StratifiedRepartition.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/StratifiedRepartition.scala diff --git 
a/src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.txt b/core/src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.txt rename to core/src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.txt diff --git a/src/main/scala/com/microsoft/ml/spark/stages/TextPreprocessor.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/TextPreprocessor.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/TextPreprocessor.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/TextPreprocessor.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Timer.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/Timer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/Timer.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/Timer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/UDFTransformer.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/UDFTransformer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/UDFTransformer.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/UDFTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/UnicodeNormalize.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/UnicodeNormalize.scala similarity index 97% rename from src/main/scala/com/microsoft/ml/spark/stages/UnicodeNormalize.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/UnicodeNormalize.scala index be12d2dcee8..889d1d85225 100644 --- 
a/src/main/scala/com/microsoft/ml/spark/stages/UnicodeNormalize.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/stages/UnicodeNormalize.scala @@ -1,79 +1,79 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package com.microsoft.ml.spark.stages - -import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} -import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap} -import org.apache.spark.ml.util.Identifiable -import org.apache.spark.sql.{DataFrame, Dataset} -import org.apache.spark.sql.functions.udf - -import java.text.Normalizer -import com.microsoft.ml.spark.codegen.Wrappable -import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasOutputCol} -import com.microsoft.ml.spark.logging.BasicLogging -import org.apache.spark.sql.types.{StringType, StructField, StructType} - -object UnicodeNormalize extends ComplexParamsReadable[UnicodeNormalize] - -/** UnicodeNormalize takes a dataframe and normalizes the unicode representation. 
- */ -class UnicodeNormalize(val uid: String) extends Transformer - with HasInputCol with HasOutputCol with Wrappable with ComplexParamsWritable with BasicLogging { - logClass() - - def this() = this(Identifiable.randomUID("UnicodeNormalize")) - - val form = new Param[String](this, "form", "Unicode normalization form: NFC, NFD, NFKC, NFKD") - - /** @group getParam */ - def getForm: String = get(form).getOrElse("NFKD") - - /** @group setParam */ - def setForm(value: String): this.type = { - // check input value - Normalizer.Form.valueOf(getForm) - - set("form", value) - } - - val lower = new BooleanParam(this, "lower", "Lowercase text") - - /** @group getParam */ - def getLower: Boolean = get(lower).getOrElse(true) - - /** @group setParam */ - def setLower(value: Boolean): this.type = set("lower", value) - - /** @param dataset - The input dataset, to be transformed - * @return The DataFrame that results from column selection - */ - override def transform(dataset: Dataset[_]): DataFrame = { - logTransform[DataFrame]({ - val inputIndex = dataset.columns.indexOf(getInputCol) - - require(inputIndex != -1, s"Input column $getInputCol does not exist") - - val normalizeFunc = (value: String) => - if (value == null) null - else Normalizer.normalize(value, Normalizer.Form.valueOf(getForm)) - - val f = if (getLower) - (value: String) => Option(value).map(s => normalizeFunc(s.toLowerCase)).orNull - else - normalizeFunc - - val textMapper = udf(f) - - dataset.withColumn(getOutputCol, textMapper(dataset(getInputCol)).as(getOutputCol)) - }) - } - - def transformSchema(schema: StructType): StructType = { - schema.add(StructField(getOutputCol, StringType)) - } - - def copy(extra: ParamMap): UnicodeNormalize = defaultCopy(extra) - -} +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark.stages + +import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} +import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap} +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.udf + +import java.text.Normalizer +import com.microsoft.ml.spark.codegen.Wrappable +import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasOutputCol} +import com.microsoft.ml.spark.logging.BasicLogging +import org.apache.spark.sql.types.{StringType, StructField, StructType} + +object UnicodeNormalize extends ComplexParamsReadable[UnicodeNormalize] + +/** UnicodeNormalize takes a dataframe and normalizes the unicode representation. + */ +class UnicodeNormalize(val uid: String) extends Transformer + with HasInputCol with HasOutputCol with Wrappable with ComplexParamsWritable with BasicLogging { + logClass() + + def this() = this(Identifiable.randomUID("UnicodeNormalize")) + + val form = new Param[String](this, "form", "Unicode normalization form: NFC, NFD, NFKC, NFKD") + + /** @group getParam */ + def getForm: String = get(form).getOrElse("NFKD") + + /** @group setParam */ + def setForm(value: String): this.type = { + // check input value + Normalizer.Form.valueOf(getForm) + + set("form", value) + } + + val lower = new BooleanParam(this, "lower", "Lowercase text") + + /** @group getParam */ + def getLower: Boolean = get(lower).getOrElse(true) + + /** @group setParam */ + def setLower(value: Boolean): this.type = set("lower", value) + + /** @param dataset - The input dataset, to be transformed + * @return The DataFrame that results from column selection + */ + override def transform(dataset: Dataset[_]): DataFrame = { + logTransform[DataFrame]({ + val inputIndex = dataset.columns.indexOf(getInputCol) + + require(inputIndex != -1, s"Input column $getInputCol does not exist") + + val normalizeFunc = (value: String) => + if (value == 
null) null + else Normalizer.normalize(value, Normalizer.Form.valueOf(getForm)) + + val f = if (getLower) + (value: String) => Option(value).map(s => normalizeFunc(s.toLowerCase)).orNull + else + normalizeFunc + + val textMapper = udf(f) + + dataset.withColumn(getOutputCol, textMapper(dataset(getInputCol)).as(getOutputCol)) + }) + } + + def transformSchema(schema: StructType): StructType = { + schema.add(StructField(getOutputCol, StringType)) + } + + def copy(extra: ParamMap): UnicodeNormalize = defaultCopy(extra) + +} diff --git a/src/main/scala/com/microsoft/ml/spark/stages/udfs.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/udfs.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/udfs.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/udfs.scala diff --git a/src/main/scala/com/microsoft/ml/spark/train/AutoTrainedModel.scala b/core/src/main/scala/com/microsoft/ml/spark/train/AutoTrainedModel.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/AutoTrainedModel.scala rename to core/src/main/scala/com/microsoft/ml/spark/train/AutoTrainedModel.scala diff --git a/src/main/scala/com/microsoft/ml/spark/train/AutoTrainer.scala b/core/src/main/scala/com/microsoft/ml/spark/train/AutoTrainer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/AutoTrainer.scala rename to core/src/main/scala/com/microsoft/ml/spark/train/AutoTrainer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.scala b/core/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.scala rename to core/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.scala diff --git a/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.txt b/core/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.txt 
similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.txt rename to core/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.txt diff --git a/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.scala b/core/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.scala rename to core/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.scala diff --git a/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.txt b/core/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.txt rename to core/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.txt diff --git a/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.scala b/core/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.scala rename to core/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.scala diff --git a/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.txt b/core/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.txt rename to core/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.txt diff --git a/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.scala b/core/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.scala rename to core/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.scala diff --git 
a/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.txt b/core/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.txt rename to core/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.txt diff --git a/src/main/scala/org/apache/spark/lightgbm/BlockManagerUtils.scala b/core/src/main/scala/org/apache/spark/injections/BlockManagerUtils.scala similarity index 69% rename from src/main/scala/org/apache/spark/lightgbm/BlockManagerUtils.scala rename to core/src/main/scala/org/apache/spark/injections/BlockManagerUtils.scala index ee0ba74dd41..d2d8e46bfcf 100644 --- a/src/main/scala/org/apache/spark/lightgbm/BlockManagerUtils.scala +++ b/core/src/main/scala/org/apache/spark/injections/BlockManagerUtils.scala @@ -1,13 +1,11 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package org.apache.spark.lightgbm +package org.apache.spark.injections import org.apache.spark.sql.Dataset import org.apache.spark.storage.BlockManager object BlockManagerUtils { /** Returns the block manager from the dataframe's spark context. + * * @param data The dataframe to get the block manager from. * @return The block manager. 
*/ diff --git a/src/main/scala/org/apache/spark/injections/RegressionUtils.scala b/core/src/main/scala/org/apache/spark/injections/RegressionUtils.scala similarity index 100% rename from src/main/scala/org/apache/spark/injections/RegressionUtils.scala rename to core/src/main/scala/org/apache/spark/injections/RegressionUtils.scala diff --git a/src/main/scala/org/apache/spark/injections/SConf.scala b/core/src/main/scala/org/apache/spark/injections/SConf.scala similarity index 100% rename from src/main/scala/org/apache/spark/injections/SConf.scala rename to core/src/main/scala/org/apache/spark/injections/SConf.scala diff --git a/src/main/scala/org/apache/spark/injections/UDFUtils.scala b/core/src/main/scala/org/apache/spark/injections/UDFUtils.scala similarity index 100% rename from src/main/scala/org/apache/spark/injections/UDFUtils.scala rename to core/src/main/scala/org/apache/spark/injections/UDFUtils.scala diff --git a/src/main/scala/org/apache/spark/ml/ComplexParamsSerializer.scala b/core/src/main/scala/org/apache/spark/ml/ComplexParamsSerializer.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/ComplexParamsSerializer.scala rename to core/src/main/scala/org/apache/spark/ml/ComplexParamsSerializer.scala diff --git a/src/main/scala/org/apache/spark/ml/LimeNamespaceInjections.scala b/core/src/main/scala/org/apache/spark/ml/LimeNamespaceInjections.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/LimeNamespaceInjections.scala rename to core/src/main/scala/org/apache/spark/ml/LimeNamespaceInjections.scala diff --git a/src/main/scala/org/apache/spark/ml/NamespaceInjections.scala b/core/src/main/scala/org/apache/spark/ml/NamespaceInjections.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/NamespaceInjections.scala rename to core/src/main/scala/org/apache/spark/ml/NamespaceInjections.scala diff --git a/src/main/scala/org/apache/spark/ml/Ranker.scala 
b/core/src/main/scala/org/apache/spark/ml/Ranker.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/Ranker.scala rename to core/src/main/scala/org/apache/spark/ml/Ranker.scala diff --git a/src/main/scala/org/apache/spark/ml/RegressorUtils.scala b/core/src/main/scala/org/apache/spark/ml/RegressorUtils.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/RegressorUtils.scala rename to core/src/main/scala/org/apache/spark/ml/RegressorUtils.scala diff --git a/src/main/scala/org/apache/spark/ml/Serializer.scala b/core/src/main/scala/org/apache/spark/ml/Serializer.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/Serializer.scala rename to core/src/main/scala/org/apache/spark/ml/Serializer.scala diff --git a/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.scala b/core/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.scala rename to core/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.scala diff --git a/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.txt b/core/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.txt similarity index 100% rename from src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.txt rename to core/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.txt diff --git a/src/main/scala/org/apache/spark/ml/param/ArrayMapParam.scala b/core/src/main/scala/org/apache/spark/ml/param/ArrayMapParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/ArrayMapParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/ArrayMapParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/ArrayParamMapParam.scala b/core/src/main/scala/org/apache/spark/ml/param/ArrayParamMapParam.scala similarity index 100% rename from 
src/main/scala/org/apache/spark/ml/param/ArrayParamMapParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/ArrayParamMapParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/BallTreeParam.scala b/core/src/main/scala/org/apache/spark/ml/param/BallTreeParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/BallTreeParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/BallTreeParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/ByteArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/ByteArrayParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/ByteArrayParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/ByteArrayParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/DataFrameParam.scala b/core/src/main/scala/org/apache/spark/ml/param/DataFrameParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/DataFrameParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/DataFrameParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/DataTypeParam.scala b/core/src/main/scala/org/apache/spark/ml/param/DataTypeParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/DataTypeParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/DataTypeParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/EstimatorArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/EstimatorArrayParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/EstimatorArrayParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/EstimatorArrayParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/EstimatorParam.scala b/core/src/main/scala/org/apache/spark/ml/param/EstimatorParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/EstimatorParam.scala rename to 
core/src/main/scala/org/apache/spark/ml/param/EstimatorParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/EvaluatorParam.scala b/core/src/main/scala/org/apache/spark/ml/param/EvaluatorParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/EvaluatorParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/EvaluatorParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/JsonEncodableParam.scala b/core/src/main/scala/org/apache/spark/ml/param/JsonEncodableParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/JsonEncodableParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/JsonEncodableParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/MapArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/MapArrayParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/MapArrayParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/MapArrayParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/MapParam.scala b/core/src/main/scala/org/apache/spark/ml/param/MapParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/MapParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/MapParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/ParamSpaceParam.scala b/core/src/main/scala/org/apache/spark/ml/param/ParamSpaceParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/ParamSpaceParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/ParamSpaceParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/PipelineStageParam.scala b/core/src/main/scala/org/apache/spark/ml/param/PipelineStageParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/PipelineStageParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/PipelineStageParam.scala diff --git 
a/src/main/scala/org/apache/spark/ml/param/PythonWrappableParam.scala b/core/src/main/scala/org/apache/spark/ml/param/PythonWrappableParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/PythonWrappableParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/PythonWrappableParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/RWrappableParam.scala b/core/src/main/scala/org/apache/spark/ml/param/RWrappableParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/RWrappableParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/RWrappableParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/TransformerArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/TransformerArrayParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/TransformerArrayParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/TransformerArrayParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/TransformerParam.scala b/core/src/main/scala/org/apache/spark/ml/param/TransformerParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/TransformerParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/TransformerParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/TypedArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/TypedArrayParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/TypedArrayParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/TypedArrayParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/UDFParam.scala b/core/src/main/scala/org/apache/spark/ml/param/UDFParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/UDFParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/UDFParam.scala diff --git 
a/src/main/scala/org/apache/spark/ml/param/UDPyFParam.scala b/core/src/main/scala/org/apache/spark/ml/param/UDPyFParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/UDPyFParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/UDPyFParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/UntypedArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/UntypedArrayParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/UntypedArrayParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/UntypedArrayParam.scala diff --git a/src/main/scala/org/apache/spark/ml/recommendation/RecommendationHelper.scala b/core/src/main/scala/org/apache/spark/ml/recommendation/RecommendationHelper.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/recommendation/RecommendationHelper.scala rename to core/src/main/scala/org/apache/spark/ml/recommendation/RecommendationHelper.scala diff --git a/src/main/scala/org/apache/spark/ml/source/image/PatchedImageFileFormat.scala b/core/src/main/scala/org/apache/spark/ml/source/image/PatchedImageFileFormat.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/source/image/PatchedImageFileFormat.scala rename to core/src/main/scala/org/apache/spark/ml/source/image/PatchedImageFileFormat.scala diff --git a/src/main/scala/org/apache/spark/sql/execution/streaming/DistributedHTTPSource.scala b/core/src/main/scala/org/apache/spark/sql/execution/streaming/DistributedHTTPSource.scala similarity index 100% rename from src/main/scala/org/apache/spark/sql/execution/streaming/DistributedHTTPSource.scala rename to core/src/main/scala/org/apache/spark/sql/execution/streaming/DistributedHTTPSource.scala diff --git a/src/main/scala/org/apache/spark/sql/execution/streaming/HTTPSource.scala b/core/src/main/scala/org/apache/spark/sql/execution/streaming/HTTPSource.scala similarity index 100% rename from 
src/main/scala/org/apache/spark/sql/execution/streaming/HTTPSource.scala rename to core/src/main/scala/org/apache/spark/sql/execution/streaming/HTTPSource.scala diff --git a/src/main/scala/org/apache/spark/sql/execution/streaming/ServingUDFs.scala b/core/src/main/scala/org/apache/spark/sql/execution/streaming/ServingUDFs.scala similarity index 100% rename from src/main/scala/org/apache/spark/sql/execution/streaming/ServingUDFs.scala rename to core/src/main/scala/org/apache/spark/sql/execution/streaming/ServingUDFs.scala diff --git a/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSinkV2.scala b/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSinkV2.scala similarity index 100% rename from src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSinkV2.scala rename to core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSinkV2.scala diff --git a/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSourceV2.scala b/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSourceV2.scala similarity index 100% rename from src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSourceV2.scala rename to core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSourceV2.scala diff --git a/src/main/scala/org/apache/spark/sql/types/injections/MetadataUtilities.scala b/core/src/main/scala/org/apache/spark/sql/types/injections/MetadataUtilities.scala similarity index 100% rename from src/main/scala/org/apache/spark/sql/types/injections/MetadataUtilities.scala rename to core/src/main/scala/org/apache/spark/sql/types/injections/MetadataUtilities.scala diff --git a/src/main/scala/org/apache/spark/sql/types/injections/OptimizedCKNNFitting.scala b/core/src/main/scala/org/apache/spark/sql/types/injections/OptimizedCKNNFitting.scala similarity index 100% rename from 
src/main/scala/org/apache/spark/sql/types/injections/OptimizedCKNNFitting.scala rename to core/src/main/scala/org/apache/spark/sql/types/injections/OptimizedCKNNFitting.scala diff --git a/src/test/R/testthat.R b/core/src/test/R/testthat.R similarity index 100% rename from src/test/R/testthat.R rename to core/src/test/R/testthat.R diff --git a/src/test/R/testthat/setup-spark.R b/core/src/test/R/testthat/setup-spark.R similarity index 100% rename from src/test/R/testthat/setup-spark.R rename to core/src/test/R/testthat/setup-spark.R diff --git a/src/test/R/testthat/test-basic.R b/core/src/test/R/testthat/test-basic.R similarity index 100% rename from src/test/R/testthat/test-basic.R rename to core/src/test/R/testthat/test-basic.R diff --git a/src/test/python/LICENSE.txt b/core/src/test/python/LICENSE.txt similarity index 100% rename from src/test/python/LICENSE.txt rename to core/src/test/python/LICENSE.txt diff --git a/src/test/python/MANIFEST.in b/core/src/test/python/MANIFEST.in similarity index 100% rename from src/test/python/MANIFEST.in rename to core/src/test/python/MANIFEST.in diff --git a/src/main/python/mmlspark/lightgbm/__init__.py b/core/src/test/python/__init__.py similarity index 100% rename from src/main/python/mmlspark/lightgbm/__init__.py rename to core/src/test/python/__init__.py diff --git a/src/main/python/mmlspark/nn/__init__.py b/core/src/test/python/mmlsparktest/__init__.py similarity index 100% rename from src/main/python/mmlspark/nn/__init__.py rename to core/src/test/python/mmlsparktest/__init__.py diff --git a/src/main/python/mmlspark/opencv/__init__.py b/core/src/test/python/mmlsparktest/cyber/__init__.py similarity index 100% rename from src/main/python/mmlspark/opencv/__init__.py rename to core/src/test/python/mmlsparktest/cyber/__init__.py diff --git a/src/main/python/mmlspark/plot/__init__.py b/core/src/test/python/mmlsparktest/cyber/anamoly/__init__.py similarity index 100% rename from src/main/python/mmlspark/plot/__init__.py rename 
to core/src/test/python/mmlsparktest/cyber/anamoly/__init__.py diff --git a/src/test/python/mmlsparktest/cyber/anamoly/test_collaborative_filtering.py b/core/src/test/python/mmlsparktest/cyber/anamoly/test_collaborative_filtering.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/anamoly/test_collaborative_filtering.py rename to core/src/test/python/mmlsparktest/cyber/anamoly/test_collaborative_filtering.py diff --git a/src/test/python/mmlsparktest/cyber/anamoly/test_complement_access.py b/core/src/test/python/mmlsparktest/cyber/anamoly/test_complement_access.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/anamoly/test_complement_access.py rename to core/src/test/python/mmlsparktest/cyber/anamoly/test_complement_access.py diff --git a/src/test/python/mmlsparktest/cyber/explain_tester.py b/core/src/test/python/mmlsparktest/cyber/explain_tester.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/explain_tester.py rename to core/src/test/python/mmlsparktest/cyber/explain_tester.py diff --git a/src/main/python/mmlspark/stages/__init__.py b/core/src/test/python/mmlsparktest/cyber/feature/__init__.py similarity index 100% rename from src/main/python/mmlspark/stages/__init__.py rename to core/src/test/python/mmlsparktest/cyber/feature/__init__.py diff --git a/src/test/python/mmlsparktest/cyber/feature/test_indexers.py b/core/src/test/python/mmlsparktest/cyber/feature/test_indexers.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/feature/test_indexers.py rename to core/src/test/python/mmlsparktest/cyber/feature/test_indexers.py diff --git a/src/test/python/mmlsparktest/cyber/feature/test_scalers.py b/core/src/test/python/mmlsparktest/cyber/feature/test_scalers.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/feature/test_scalers.py rename to core/src/test/python/mmlsparktest/cyber/feature/test_scalers.py diff --git a/src/main/python/mmlspark/vw/__init__.py 
b/core/src/test/python/mmlsparktest/cyber/utils/__init__.py similarity index 100% rename from src/main/python/mmlspark/vw/__init__.py rename to core/src/test/python/mmlsparktest/cyber/utils/__init__.py diff --git a/src/test/python/mmlsparktest/cyber/utils/test_spark_utils.py b/core/src/test/python/mmlsparktest/cyber/utils/test_spark_utils.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/utils/test_spark_utils.py rename to core/src/test/python/mmlsparktest/cyber/utils/test_spark_utils.py diff --git a/src/test/__init__.py b/core/src/test/python/mmlsparktest/nn/__init__.py similarity index 100% rename from src/test/__init__.py rename to core/src/test/python/mmlsparktest/nn/__init__.py diff --git a/src/test/python/mmlsparktest/nn/test_ball_tree.py b/core/src/test/python/mmlsparktest/nn/test_ball_tree.py similarity index 100% rename from src/test/python/mmlsparktest/nn/test_ball_tree.py rename to core/src/test/python/mmlsparktest/nn/test_ball_tree.py diff --git a/src/test/python/__init__.py b/core/src/test/python/mmlsparktest/recommendation/__init__.py similarity index 100% rename from src/test/python/__init__.py rename to core/src/test/python/mmlsparktest/recommendation/__init__.py diff --git a/src/test/python/mmlsparktest/recommendation/test_ranking.py b/core/src/test/python/mmlsparktest/recommendation/test_ranking.py similarity index 100% rename from src/test/python/mmlsparktest/recommendation/test_ranking.py rename to core/src/test/python/mmlsparktest/recommendation/test_ranking.py diff --git a/src/test/python/mmlsparktest/spark.py b/core/src/test/python/mmlsparktest/spark.py similarity index 100% rename from src/test/python/mmlsparktest/spark.py rename to core/src/test/python/mmlsparktest/spark.py diff --git a/src/test/python/setup.py b/core/src/test/python/setup.py similarity index 100% rename from src/test/python/setup.py rename to core/src/test/python/setup.py diff --git a/src/test/resources/audio1.txt b/core/src/test/resources/audio1.txt 
similarity index 100% rename from src/test/resources/audio1.txt rename to core/src/test/resources/audio1.txt diff --git a/src/test/resources/audio1.wav b/core/src/test/resources/audio1.wav similarity index 100% rename from src/test/resources/audio1.wav rename to core/src/test/resources/audio1.wav diff --git a/src/test/resources/audio2.txt b/core/src/test/resources/audio2.txt similarity index 100% rename from src/test/resources/audio2.txt rename to core/src/test/resources/audio2.txt diff --git a/src/test/resources/audio2.wav b/core/src/test/resources/audio2.wav similarity index 100% rename from src/test/resources/audio2.wav rename to core/src/test/resources/audio2.wav diff --git a/src/test/resources/audio3.mp3 b/core/src/test/resources/audio3.mp3 similarity index 100% rename from src/test/resources/audio3.mp3 rename to core/src/test/resources/audio3.mp3 diff --git a/src/test/resources/audio3.txt b/core/src/test/resources/audio3.txt similarity index 100% rename from src/test/resources/audio3.txt rename to core/src/test/resources/audio3.txt diff --git a/src/test/resources/audio4.txt b/core/src/test/resources/audio4.txt similarity index 100% rename from src/test/resources/audio4.txt rename to core/src/test/resources/audio4.txt diff --git a/src/test/resources/benchmarks/benchmarkBasicDataTypes.json b/core/src/test/resources/benchmarks/benchmarkBasicDataTypes.json similarity index 100% rename from src/test/resources/benchmarks/benchmarkBasicDataTypes.json rename to core/src/test/resources/benchmarks/benchmarkBasicDataTypes.json diff --git a/src/test/resources/benchmarks/benchmarkDate.json b/core/src/test/resources/benchmarks/benchmarkDate.json similarity index 100% rename from src/test/resources/benchmarks/benchmarkDate.json rename to core/src/test/resources/benchmarks/benchmarkDate.json diff --git a/src/test/resources/benchmarks/benchmarkNoOneHot.json b/core/src/test/resources/benchmarks/benchmarkNoOneHot.json similarity index 100% rename from 
src/test/resources/benchmarks/benchmarkNoOneHot.json rename to core/src/test/resources/benchmarks/benchmarkNoOneHot.json diff --git a/src/test/resources/benchmarks/benchmarkOneHot.json b/core/src/test/resources/benchmarks/benchmarkOneHot.json similarity index 100% rename from src/test/resources/benchmarks/benchmarkOneHot.json rename to core/src/test/resources/benchmarks/benchmarkOneHot.json diff --git a/src/test/resources/benchmarks/benchmarkString.json b/core/src/test/resources/benchmarks/benchmarkString.json similarity index 100% rename from src/test/resources/benchmarks/benchmarkString.json rename to core/src/test/resources/benchmarks/benchmarkString.json diff --git a/src/test/resources/benchmarks/benchmarkStringIndexOneHot.json b/core/src/test/resources/benchmarks/benchmarkStringIndexOneHot.json similarity index 100% rename from src/test/resources/benchmarks/benchmarkStringIndexOneHot.json rename to core/src/test/resources/benchmarks/benchmarkStringIndexOneHot.json diff --git a/src/test/resources/benchmarks/benchmarkStringMissing.json b/core/src/test/resources/benchmarks/benchmarkStringMissing.json similarity index 100% rename from src/test/resources/benchmarks/benchmarkStringMissing.json rename to core/src/test/resources/benchmarks/benchmarkStringMissing.json diff --git a/src/test/resources/benchmarks/benchmarkVectors.json b/core/src/test/resources/benchmarks/benchmarkVectors.json similarity index 100% rename from src/test/resources/benchmarks/benchmarkVectors.json rename to core/src/test/resources/benchmarks/benchmarkVectors.json diff --git a/src/test/resources/benchmarks/benchmarks_VerifyTrainClassifier.csv b/core/src/test/resources/benchmarks/benchmarks_VerifyTrainClassifier.csv similarity index 100% rename from src/test/resources/benchmarks/benchmarks_VerifyTrainClassifier.csv rename to core/src/test/resources/benchmarks/benchmarks_VerifyTrainClassifier.csv diff --git a/src/test/resources/benchmarks/benchmarks_VerifyTuneHyperparameters.csv 
b/core/src/test/resources/benchmarks/benchmarks_VerifyTuneHyperparameters.csv similarity index 100% rename from src/test/resources/benchmarks/benchmarks_VerifyTuneHyperparameters.csv rename to core/src/test/resources/benchmarks/benchmarks_VerifyTuneHyperparameters.csv diff --git a/src/test/resources/benchmarks/benchmarks_VerifyVowpalWabbitRegressor.csv b/core/src/test/resources/benchmarks/benchmarks_VerifyVowpalWabbitRegressor.csv similarity index 100% rename from src/test/resources/benchmarks/benchmarks_VerifyVowpalWabbitRegressor.csv rename to core/src/test/resources/benchmarks/benchmarks_VerifyVowpalWabbitRegressor.csv diff --git a/src/test/resources/demoUsage.csv.gz b/core/src/test/resources/demoUsage.csv.gz similarity index 100% rename from src/test/resources/demoUsage.csv.gz rename to core/src/test/resources/demoUsage.csv.gz diff --git a/src/test/resources/dialogue.mp3 b/core/src/test/resources/dialogue.mp3 similarity index 100% rename from src/test/resources/dialogue.mp3 rename to core/src/test/resources/dialogue.mp3 diff --git a/src/test/resources/lily.wav b/core/src/test/resources/lily.wav similarity index 100% rename from src/test/resources/lily.wav rename to core/src/test/resources/lily.wav diff --git a/src/test/resources/mark.wav b/core/src/test/resources/mark.wav similarity index 100% rename from src/test/resources/mark.wav rename to core/src/test/resources/mark.wav diff --git a/src/test/resources/sim_count1.csv.gz b/core/src/test/resources/sim_count1.csv.gz similarity index 100% rename from src/test/resources/sim_count1.csv.gz rename to core/src/test/resources/sim_count1.csv.gz diff --git a/src/test/resources/sim_count3.csv.gz b/core/src/test/resources/sim_count3.csv.gz similarity index 100% rename from src/test/resources/sim_count3.csv.gz rename to core/src/test/resources/sim_count3.csv.gz diff --git a/src/test/resources/sim_jac1.csv.gz b/core/src/test/resources/sim_jac1.csv.gz similarity index 100% rename from src/test/resources/sim_jac1.csv.gz 
rename to core/src/test/resources/sim_jac1.csv.gz diff --git a/src/test/resources/sim_jac3.csv.gz b/core/src/test/resources/sim_jac3.csv.gz similarity index 100% rename from src/test/resources/sim_jac3.csv.gz rename to core/src/test/resources/sim_jac3.csv.gz diff --git a/src/test/resources/sim_lift1.csv.gz b/core/src/test/resources/sim_lift1.csv.gz similarity index 100% rename from src/test/resources/sim_lift1.csv.gz rename to core/src/test/resources/sim_lift1.csv.gz diff --git a/src/test/resources/sim_lift3.csv.gz b/core/src/test/resources/sim_lift3.csv.gz similarity index 100% rename from src/test/resources/sim_lift3.csv.gz rename to core/src/test/resources/sim_lift3.csv.gz diff --git a/src/test/resources/user_aff.csv.gz b/core/src/test/resources/user_aff.csv.gz similarity index 100% rename from src/test/resources/user_aff.csv.gz rename to core/src/test/resources/user_aff.csv.gz diff --git a/src/test/resources/userpred_count3_userid_only.csv.gz b/core/src/test/resources/userpred_count3_userid_only.csv.gz similarity index 100% rename from src/test/resources/userpred_count3_userid_only.csv.gz rename to core/src/test/resources/userpred_count3_userid_only.csv.gz diff --git a/src/test/resources/userpred_jac3_userid_only.csv.gz b/core/src/test/resources/userpred_jac3_userid_only.csv.gz similarity index 100% rename from src/test/resources/userpred_jac3_userid_only.csv.gz rename to core/src/test/resources/userpred_jac3_userid_only.csv.gz diff --git a/src/test/resources/userpred_lift3_userid_only.csv.gz b/core/src/test/resources/userpred_lift3_userid_only.csv.gz similarity index 100% rename from src/test/resources/userpred_lift3_userid_only.csv.gz rename to core/src/test/resources/userpred_lift3_userid_only.csv.gz diff --git a/src/test/scala/com/microsoft/ml/spark/Secrets.scala b/core/src/test/scala/com/microsoft/ml/spark/Secrets.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/Secrets.scala rename to 
core/src/test/scala/com/microsoft/ml/spark/Secrets.scala diff --git a/src/test/scala/com/microsoft/ml/spark/automl/VerifyFindBestModel.scala b/core/src/test/scala/com/microsoft/ml/spark/automl/VerifyFindBestModel.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/automl/VerifyFindBestModel.scala rename to core/src/test/scala/com/microsoft/ml/spark/automl/VerifyFindBestModel.scala diff --git a/src/test/scala/com/microsoft/ml/spark/automl/VerifyTuneHyperparameters.scala b/core/src/test/scala/com/microsoft/ml/spark/automl/VerifyTuneHyperparameters.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/automl/VerifyTuneHyperparameters.scala rename to core/src/test/scala/com/microsoft/ml/spark/automl/VerifyTuneHyperparameters.scala diff --git a/core/src/test/scala/com/microsoft/ml/spark/codegen/TestGen.scala b/core/src/test/scala/com/microsoft/ml/spark/codegen/TestGen.scala new file mode 100644 index 00000000000..6da7e4f4352 --- /dev/null +++ b/core/src/test/scala/com/microsoft/ml/spark/codegen/TestGen.scala @@ -0,0 +1,47 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark.codegen + +import java.io.File + +import com.microsoft.ml.spark.codegen.Config._ +import com.microsoft.ml.spark.core.env.FileUtilities._ +import com.microsoft.ml.spark.core.test.base.TestBase +import com.microsoft.ml.spark.core.test.fuzzing.PyTestFuzzing +import com.microsoft.ml.spark.core.utils.JarLoadingUtils.instantiateServices +import org.apache.commons.io.FileUtils + + +object TestGen { + + import CodeGenUtils._ + + def generatePythonTests(): Unit = { + instantiateServices[PyTestFuzzing[_]]().foreach { ltc => + try { + ltc.makePyTestFile() + } catch { + case _: NotImplementedError => + println(s"ERROR: Could not generate test for ${ltc.testClassName} because of Complex Parameters") + } + } + } + + private def makeInitFiles(packageFolder: String = ""): Unit = { + val dir = new File(new File(PyTestDir, "mmlsparktest"), packageFolder) + writeFile(new File(dir, "__init__.py"), "") + dir.listFiles().filter(_.isDirectory).foreach(f => + makeInitFiles(packageFolder + "/" + f.getName) + ) + } + + def main(args: Array[String]): Unit = { + clean(TestDataDir) + clean(PyTestDir) + generatePythonTests() + TestBase.stopSparkSession() + FileUtils.copyDirectoryToDirectory(toDir(PyTestOverrideDir), toDir(PyTestDir)) + makeInitFiles() + } +} diff --git a/src/test/scala/com/microsoft/ml/spark/core/ml/HashingTFSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/core/ml/HashingTFSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/ml/HashingTFSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/ml/HashingTFSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/ml/IDFSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/core/ml/IDFSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/ml/IDFSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/ml/IDFSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/ml/NGramSpec.scala 
b/core/src/test/scala/com/microsoft/ml/spark/core/ml/NGramSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/ml/NGramSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/ml/NGramSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/ml/OneHotEncoderSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/core/ml/OneHotEncoderSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/ml/OneHotEncoderSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/ml/OneHotEncoderSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/ml/Word2VecSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/core/ml/Word2VecSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/ml/Word2VecSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/ml/Word2VecSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/schema/SparkBindingsTest.scala b/core/src/test/scala/com/microsoft/ml/spark/core/schema/SparkBindingsTest.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/schema/SparkBindingsTest.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/schema/SparkBindingsTest.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/schema/TestCategoricals.scala b/core/src/test/scala/com/microsoft/ml/spark/core/schema/TestCategoricals.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/schema/TestCategoricals.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/schema/TestCategoricals.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/schema/VerifyFastVectorAssembler.scala b/core/src/test/scala/com/microsoft/ml/spark/core/schema/VerifyFastVectorAssembler.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/schema/VerifyFastVectorAssembler.scala rename to 
core/src/test/scala/com/microsoft/ml/spark/core/schema/VerifyFastVectorAssembler.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/schema/VerifySparkSchema.scala b/core/src/test/scala/com/microsoft/ml/spark/core/schema/VerifySparkSchema.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/schema/VerifySparkSchema.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/schema/VerifySparkSchema.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/serialize/ValidateComplexParamSerializer.scala b/core/src/test/scala/com/microsoft/ml/spark/core/serialize/ValidateComplexParamSerializer.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/serialize/ValidateComplexParamSerializer.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/serialize/ValidateComplexParamSerializer.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala b/core/src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/test/base/TestBase.scala b/core/src/test/scala/com/microsoft/ml/spark/core/test/base/TestBase.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/test/base/TestBase.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/test/base/TestBase.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/test/benchmarks/Benchmarks.scala b/core/src/test/scala/com/microsoft/ml/spark/core/test/benchmarks/Benchmarks.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/test/benchmarks/Benchmarks.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/test/benchmarks/Benchmarks.scala diff --git 
a/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala b/core/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/FuzzingTest.scala b/core/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/FuzzingTest.scala similarity index 98% rename from src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/FuzzingTest.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/FuzzingTest.scala index ce573f761d9..67d31910dc9 100644 --- a/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/FuzzingTest.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/FuzzingTest.scala @@ -257,17 +257,17 @@ class FuzzingTest extends TestBase { // set the context loader to pick up on the jars //Thread.currentThread().setContextClassLoader(JarLoadingUtils.classLoader) - private lazy val readers: List[MLReadable[_]] = JarLoadingUtils.instantiateObjects[MLReadable[_]] + private lazy val readers: List[MLReadable[_]] = JarLoadingUtils.instantiateObjects[MLReadable[_]]() - private lazy val pipelineStages: List[PipelineStage] = JarLoadingUtils.instantiateServices[PipelineStage] + private lazy val pipelineStages: List[PipelineStage] = JarLoadingUtils.instantiateServices[PipelineStage]() private lazy val experimentFuzzers: List[ExperimentFuzzing[_ <: PipelineStage]] = - JarLoadingUtils.instantiateServices[ExperimentFuzzing[_ <: PipelineStage]] + JarLoadingUtils.instantiateServices[ExperimentFuzzing[_ <: PipelineStage]]() private lazy val serializationFuzzers: List[SerializationFuzzing[_ <: PipelineStage with MLWritable]] = - JarLoadingUtils.instantiateServices[SerializationFuzzing[_ <: PipelineStage with MLWritable]] + 
JarLoadingUtils.instantiateServices[SerializationFuzzing[_ <: PipelineStage with MLWritable]]() private lazy val pytestFuzzers: List[PyTestFuzzing[_ <: PipelineStage]] = - JarLoadingUtils.instantiateServices[PyTestFuzzing[_ <: PipelineStage]] + JarLoadingUtils.instantiateServices[PyTestFuzzing[_ <: PipelineStage]]() } diff --git a/src/test/scala/com/microsoft/ml/spark/core/utils/VerifyClusterUtil.scala b/core/src/test/scala/com/microsoft/ml/spark/core/utils/VerifyClusterUtil.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/core/utils/VerifyClusterUtil.scala rename to core/src/test/scala/com/microsoft/ml/spark/core/utils/VerifyClusterUtil.scala diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCleanMissingData.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCleanMissingData.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/featurize/VerifyCleanMissingData.scala rename to core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCleanMissingData.scala diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCountSelector.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCountSelector.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/featurize/VerifyCountSelector.scala rename to core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCountSelector.scala diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyDataConversion.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyDataConversion.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/featurize/VerifyDataConversion.scala rename to core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyDataConversion.scala diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyFeaturize.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyFeaturize.scala similarity index 100% rename from 
src/test/scala/com/microsoft/ml/spark/featurize/VerifyFeaturize.scala rename to core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyFeaturize.scala diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyValueIndexer.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyValueIndexer.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/featurize/VerifyValueIndexer.scala rename to core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyValueIndexer.scala diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/text/MultiNGramSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/text/MultiNGramSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/featurize/text/MultiNGramSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/featurize/text/MultiNGramSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/text/PageSplitterSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/text/PageSplitterSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/featurize/text/PageSplitterSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/featurize/text/PageSplitterSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizerSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizerSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizerSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizerSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/flaky/PartitionConsolidatorSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/flaky/PartitionConsolidatorSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/flaky/PartitionConsolidatorSuite.scala rename to 
core/src/test/scala/com/microsoft/ml/spark/flaky/PartitionConsolidatorSuite.scala diff --git a/core/src/test/scala/com/microsoft/ml/spark/image/ImageTestUtils.scala b/core/src/test/scala/com/microsoft/ml/spark/image/ImageTestUtils.scala new file mode 100644 index 00000000000..a9ae8cd4f7e --- /dev/null +++ b/core/src/test/scala/com/microsoft/ml/spark/image/ImageTestUtils.scala @@ -0,0 +1,106 @@ +package com.microsoft.ml.spark.image + +import java.io.File +import java.net.URL + +import com.microsoft.ml.spark.build.BuildInfo +import com.microsoft.ml.spark.core.env.FileUtilities +import org.apache.spark.ml.linalg.DenseVector +import org.apache.spark.sql.{DataFrame, SparkSession} +import com.microsoft.ml.spark.core.test.base.TestBase +import com.microsoft.ml.spark.io.IOImplicits.dfrToDfre +import org.apache.commons.io.FileUtils +import org.apache.spark.sql.functions.col + +trait ImageTestUtils extends TestBase { + + val filesRoot = BuildInfo.datasetDir.toString + val imagePath = FileUtilities.join(filesRoot, "Images", "CIFAR").toString + val modelPath = FileUtilities.join(filesRoot, "CNTKModel", "ConvNet_CIFAR10.model").toString + val inputCol = "cntk_images" + val outputCol = "out" + val labelCol = "labels" + + val featureVectorLength = 3 * 32 * 32 + lazy val saveFile = new File(tmpDir.toFile, "spark-z.model").toString + + def testModelDF(spark: SparkSession): DataFrame = { + import spark.implicits._ + spark.sparkContext.parallelize(Seq( + Array(1.32165250, -2.1215112, 0.63150704, 0.77315974, -1.28163720, + -0.20210080, -2.2839167, -2.08691480, 5.08418200, -1.33741090), + Array(3.44079640, 1.4877119, -0.74059330, -0.34381202, -2.48724990, + -2.62866950, -3.1693816, -3.14182600, 4.76314800, 0.68712880), + Array(-1.88747900, -4.7685330, 0.15169683, 6.80547570, -0.38405967, + 3.41065170, 1.3302778, -0.87714905, -2.18046050, -4.16661830), + Array(5.01010300, 3.9860306, -1.36795600, -0.89830830, -4.49545430, + -4.19537070, -4.4045380, -5.81759450, 6.93805700, 1.49001510), + 
Array(-4.70754600, -6.0414960, 1.20658250, 5.40738300, 1.07661690, + 4.71566440, 4.3834330, -1.57187440, -2.96569730, -5.43208270), + Array(-1.23873880, -3.2042341, 2.54533000, 5.51954800, 2.89042470, + 0.12380804, 3.8639085, -4.79466800, -2.41463420, -5.17418430))).toDF + } + + def testImages(spark: SparkSession): DataFrame = { + val images = spark.read.image.load(imagePath) + + val unroll = new UnrollImage().setInputCol("image").setOutputCol(inputCol) + + unroll.transform(images).select(inputCol) + } + + def makeFakeData(spark: SparkSession, rows: Int, size: Int, outputDouble: Boolean = false): DataFrame = { + import spark.implicits._ + if (outputDouble) { + List + .fill(rows)(List.fill(size)(0.0).toArray) + .zip(List.fill(rows)(0.0)) + .toDF(inputCol, labelCol) + } else { + List + .fill(rows)(List.fill(size)(0.0.toFloat).toArray) + .zip(List.fill(rows)(0.0)) + .toDF(inputCol, labelCol) + } + } + + protected def compareToTestModel(result: DataFrame) = { + //TODO improve checks + assert(result.columns.toSet == Set(inputCol, outputCol)) + assert(result.count() == testModelDF(result.sparkSession).count()) + val max = result + .select(outputCol) + .collect() + .map(row => row.getAs[DenseVector](0).toArray.max) + .max + assert(max < 10 & max > -10) + } + + lazy val images: DataFrame = spark.read.image.load(imagePath) + .withColumnRenamed("image", inputCol) + lazy val binaryImages: DataFrame = spark.read.binary.load(imagePath) + .select(col("value.bytes").alias(inputCol)) + + lazy val groceriesPath = FileUtilities.join(BuildInfo.datasetDir, "Images","Grocery") + lazy val groceryImages: DataFrame = spark.read.image + .option("dropInvalid", true) + .load(groceriesPath + "**") + .withColumnRenamed("image", inputCol) + + lazy val greyscaleImageLocation: String = { + val loc = "/tmp/greyscale.jpg" + val f = new File(loc) + if (f.exists()) {f.delete()} + FileUtils.copyURLToFile(new URL("https://mmlspark.blob.core.windows.net/datasets/LIME/greyscale.jpg"), f) + loc + } + + 
lazy val greyscaleImage: DataFrame = spark + .read.image.load(greyscaleImageLocation) + .select(col("image").alias(inputCol)) + + lazy val greyscaleBinary: DataFrame = spark + .read.binary.load(greyscaleImageLocation) + .select(col("value.bytes").alias(inputCol)) + +} diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/BinaryFileReaderSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/BinaryFileReaderSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/io/split1/BinaryFileReaderSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/BinaryFileReaderSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/HTTPTransformerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/HTTPTransformerSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/io/split1/HTTPTransformerSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/HTTPTransformerSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala similarity index 99% rename from src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala index 13592cec90b..b611ef5158e 100644 --- a/src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala @@ -5,7 +5,7 @@ package com.microsoft.ml.spark.io.split1 import java.io.{File, FileInputStream} -import com.microsoft.ml.spark.cognitive.OsUtils +import com.microsoft.ml.spark.core.utils.OsUtils import com.microsoft.ml.spark.core.env.FileUtilities import com.microsoft.ml.spark.core.schema.ImageSchemaUtils import com.microsoft.ml.spark.core.test.base.TestBase diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/ParserSuite.scala 
b/core/src/test/scala/com/microsoft/ml/spark/io/split1/ParserSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/io/split1/ParserSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/ParserSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/PowerBiSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/PowerBiSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/io/split1/PowerBiSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/PowerBiSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/SimpleHTTPTransformerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/SimpleHTTPTransformerSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/io/split1/SimpleHTTPTransformerSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/SimpleHTTPTransformerSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/io/split2/ContinuousHTTPSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split2/ContinuousHTTPSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/io/split2/ContinuousHTTPSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split2/ContinuousHTTPSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/io/split2/DistributedHTTPSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split2/DistributedHTTPSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/io/split2/DistributedHTTPSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split2/DistributedHTTPSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/io/split2/HTTPSuite.scala rename to 
core/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPv2Suite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPv2Suite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/io/split2/HTTPv2Suite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPv2Suite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala b/core/src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala rename to core/src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala diff --git a/core/src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala b/core/src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala new file mode 100644 index 00000000000..b58e597944b --- /dev/null +++ b/core/src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala @@ -0,0 +1,66 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark.lime + +import breeze.linalg.{*, DenseMatrix} +import breeze.stats.distributions.Rand +import com.microsoft.ml.spark.core.test.base.TestBase +import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject, TransformerFuzzing} +import org.apache.spark.ml.linalg.DenseVector +import org.apache.spark.ml.param.DataFrameEquality +import org.apache.spark.ml.regression.LinearRegression +import org.apache.spark.ml.util.MLReadable + +trait LimeTestBase extends TestBase { + + import spark.implicits._ + + lazy val nRows = 100 + lazy val d1 = 3 + lazy val d2 = 1 + + lazy val m: DenseMatrix[Double] = new DenseMatrix(d1, d2, Array(1.0, -1.0, 2.0)) + lazy val x: DenseMatrix[Double] = DenseMatrix.rand(nRows, d1, Rand.gaussian) + lazy val noise: DenseMatrix[Double] = DenseMatrix.rand(nRows, d2, Rand.gaussian) * 0.1 + lazy val y = x * m //+ noise + + lazy val xRows = x(*, ::).iterator.toSeq.map(dv => new DenseVector(dv.toArray)) + lazy val yRows = y(*, ::).iterator.toSeq.map(dv => dv(0)) + lazy val df = xRows.zip(yRows).toDF("features", "label") + + lazy val model = new LinearRegression().fit(df) + + lazy val lime = new TabularLIME() + .setModel(model) + .setInputCol("features") + .setPredictionCol(model.getPredictionCol) + .setOutputCol("out") + .setNSamples(1000) + + lazy val limeModel = lime.fit(df) +} + +class TabularLIMESuite extends EstimatorFuzzing[TabularLIME] with + DataFrameEquality with LimeTestBase { + + test("text lime usage test check") { + val results = limeModel.transform(df).select("out") + .collect().map(_.getAs[DenseVector](0)) + results.foreach(result => assert(result === new DenseVector(m.data))) + } + + override def testObjects(): Seq[TestObject[TabularLIME]] = Seq(new TestObject(lime, df)) + + override def reader: MLReadable[_] = TabularLIME + + override def modelReader: MLReadable[_] = TabularLIMEModel +} + +class TabularLIMEModelSuite extends TransformerFuzzing[TabularLIMEModel] with + DataFrameEquality with 
LimeTestBase { + + override def testObjects(): Seq[TestObject[TabularLIMEModel]] = Seq(new TestObject(limeModel, df)) + + override def reader: MLReadable[_] = TabularLIMEModel +} diff --git a/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala similarity index 96% rename from src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala index 5d2c26e330f..289720f9691 100644 --- a/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala @@ -7,13 +7,13 @@ import java.awt.Color import java.awt.image.BufferedImage import java.io.File -import com.microsoft.ml.spark.cntk.CNTKTestUtils +import com.microsoft.ml.spark.image.ImageTestUtils import com.microsoft.ml.spark.io.image.ImageUtils import javax.imageio.ImageIO import scala.util.Random -class SuperpixelSuite extends CNTKTestUtils { +class SuperpixelSuite extends ImageTestUtils { lazy val sp1 = new Superpixel(img, 16, 130) lazy val sp2 = new Superpixel(img2, 100, 130) diff --git a/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala similarity index 90% rename from src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala index 881aefed41a..0c4a5b78d0b 100644 --- a/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala @@ -4,12 +4,12 @@ package com.microsoft.ml.spark.lime import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} -import com.microsoft.ml.spark.image.NetworkUtils +import com.microsoft.ml.spark.image.ImageTestUtils 
import com.microsoft.ml.spark.io.split1.FileReaderUtils import org.apache.spark.ml.util.MLReadable class SuperpixelTransformerSuite extends TransformerFuzzing[SuperpixelTransformer] - with NetworkUtils with FileReaderUtils { + with ImageTestUtils with FileReaderUtils { lazy val spt: SuperpixelTransformer = new SuperpixelTransformer().setInputCol(inputCol) test("basic functionality"){ diff --git a/src/test/scala/com/microsoft/ml/spark/lime/TextLIMESuite.scala b/core/src/test/scala/com/microsoft/ml/spark/lime/TextLIMESuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/lime/TextLIMESuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/lime/TextLIMESuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala b/core/src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala similarity index 98% rename from src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala rename to core/src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala index 641afac6265..359fc06babf 100644 --- a/src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala @@ -42,9 +42,7 @@ object DatabricksUtilities extends HasHttpClient { val Folder = s"/MMLSparkBuild/build_${BuildInfo.version}" // MMLSpark info - val TruncatedScalaVersion: String = BuildInfo.scalaVersion - .split(".".toCharArray.head).dropRight(1).mkString(".") - val Version = s"com.microsoft.ml.spark:${BuildInfo.name}_$TruncatedScalaVersion:${BuildInfo.version}" + val Version = s"com.microsoft.ml.spark:mmlspark:${BuildInfo.version}" val Repository = "https://mmlspark.azureedge.net/maven" val Libraries: String = List( diff --git a/src/test/scala/com/microsoft/ml/spark/nbtest/NotebookTests.scala b/core/src/test/scala/com/microsoft/ml/spark/nbtest/NotebookTests.scala similarity index 100% rename from 
src/test/scala/com/microsoft/ml/spark/nbtest/NotebookTests.scala rename to core/src/test/scala/com/microsoft/ml/spark/nbtest/NotebookTests.scala diff --git a/src/test/scala/com/microsoft/ml/spark/nbtest/SprayUtilities.scala b/core/src/test/scala/com/microsoft/ml/spark/nbtest/SprayUtilities.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/nbtest/SprayUtilities.scala rename to core/src/test/scala/com/microsoft/ml/spark/nbtest/SprayUtilities.scala diff --git a/src/test/scala/com/microsoft/ml/spark/nn/BallTreeTest.scala b/core/src/test/scala/com/microsoft/ml/spark/nn/BallTreeTest.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/nn/BallTreeTest.scala rename to core/src/test/scala/com/microsoft/ml/spark/nn/BallTreeTest.scala diff --git a/src/test/scala/com/microsoft/ml/spark/nn/ConditionalBallTreeTest.scala b/core/src/test/scala/com/microsoft/ml/spark/nn/ConditionalBallTreeTest.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/nn/ConditionalBallTreeTest.scala rename to core/src/test/scala/com/microsoft/ml/spark/nn/ConditionalBallTreeTest.scala diff --git a/src/test/scala/com/microsoft/ml/spark/nn/KNNTest.scala b/core/src/test/scala/com/microsoft/ml/spark/nn/KNNTest.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/nn/KNNTest.scala rename to core/src/test/scala/com/microsoft/ml/spark/nn/KNNTest.scala diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/RankingAdapterSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingAdapterSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/recommendation/RankingAdapterSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingAdapterSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/RankingEvaluatorSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingEvaluatorSpec.scala 
similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/recommendation/RankingEvaluatorSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingEvaluatorSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTestBase.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTestBase.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/recommendation/RankingTestBase.scala rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTestBase.scala diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexerSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexerSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexerSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexerSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/SARSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/SARSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/recommendation/SARSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/SARSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/BatchIteratorSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/BatchIteratorSuite.scala similarity index 100% rename from 
src/test/scala/com/microsoft/ml/spark/stages/BatchIteratorSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/BatchIteratorSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/CacherSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/CacherSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/CacherSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/CacherSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/ClassBalancerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/ClassBalancerSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/ClassBalancerSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/ClassBalancerSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/EnsembleByKeySuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/EnsembleByKeySuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/EnsembleByKeySuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/EnsembleByKeySuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/ExplodeSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/ExplodeSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/ExplodeSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/ExplodeSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/LambdaSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/LambdaSuite.scala similarity 
index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/LambdaSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/LambdaSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/MiniBatchTransformerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/MiniBatchTransformerSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/MiniBatchTransformerSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/MiniBatchTransformerSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/MultiColumnAdapterSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/MultiColumnAdapterSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/MultiColumnAdapterSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/MultiColumnAdapterSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/RenameColumnSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/RenameColumnSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/RenameColumnSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/RenameColumnSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/RepartitionSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/RepartitionSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/RepartitionSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/RepartitionSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/SelectColumnsSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/SelectColumnsSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/SelectColumnsSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/SelectColumnsSuite.scala diff --git 
a/src/test/scala/com/microsoft/ml/spark/stages/StratifiedRepartitionSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/StratifiedRepartitionSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/StratifiedRepartitionSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/StratifiedRepartitionSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/SummarizeDataSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/SummarizeDataSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/SummarizeDataSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/SummarizeDataSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/TextPreprocessorSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/TextPreprocessorSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/TextPreprocessorSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/TextPreprocessorSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/TimerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/TimerSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/TimerSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/TimerSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/UDFSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/UDFSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/UDFSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/UDFSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/UDFTransformerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/UDFTransformerSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/UDFTransformerSuite.scala rename to 
core/src/test/scala/com/microsoft/ml/spark/stages/UDFTransformerSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/UnicodeNormalizeSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/UnicodeNormalizeSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/UnicodeNormalizeSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/UnicodeNormalizeSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/train/VerifyComputeModelStatistics.scala b/core/src/test/scala/com/microsoft/ml/spark/train/VerifyComputeModelStatistics.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/train/VerifyComputeModelStatistics.scala rename to core/src/test/scala/com/microsoft/ml/spark/train/VerifyComputeModelStatistics.scala diff --git a/src/test/scala/com/microsoft/ml/spark/train/VerifyComputePerInstanceStatistics.scala b/core/src/test/scala/com/microsoft/ml/spark/train/VerifyComputePerInstanceStatistics.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/train/VerifyComputePerInstanceStatistics.scala rename to core/src/test/scala/com/microsoft/ml/spark/train/VerifyComputePerInstanceStatistics.scala diff --git a/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainClassifier.scala b/core/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainClassifier.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/train/VerifyTrainClassifier.scala rename to core/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainClassifier.scala diff --git a/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainRegressor.scala b/core/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainRegressor.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/train/VerifyTrainRegressor.scala rename to core/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainRegressor.scala diff --git a/src/main/R/model_downloader.R 
b/deep-learning/src/main/R/model_downloader.R similarity index 100% rename from src/main/R/model_downloader.R rename to deep-learning/src/main/R/model_downloader.R diff --git a/src/main/python/mmlspark/cntk/CNTKModel.py b/deep-learning/src/main/python/mmlspark/cntk/CNTKModel.py similarity index 100% rename from src/main/python/mmlspark/cntk/CNTKModel.py rename to deep-learning/src/main/python/mmlspark/cntk/CNTKModel.py diff --git a/src/test/python/mmlsparktest/__init__.py b/deep-learning/src/main/python/mmlspark/cntk/__init__.py similarity index 100% rename from src/test/python/mmlsparktest/__init__.py rename to deep-learning/src/main/python/mmlspark/cntk/__init__.py diff --git a/src/main/python/mmlspark/image/ImageFeaturizer.py b/deep-learning/src/main/python/mmlspark/image/ImageFeaturizer.py similarity index 100% rename from src/main/python/mmlspark/image/ImageFeaturizer.py rename to deep-learning/src/main/python/mmlspark/image/ImageFeaturizer.py diff --git a/src/test/python/mmlsparktest/cognitive/__init__.py b/deep-learning/src/main/python/mmlspark/image/__init__.py similarity index 100% rename from src/test/python/mmlsparktest/cognitive/__init__.py rename to deep-learning/src/main/python/mmlspark/image/__init__.py diff --git a/src/main/scala/com/microsoft/CNTK/SerializableFunction.scala b/deep-learning/src/main/scala/com/microsoft/CNTK/SerializableFunction.scala similarity index 100% rename from src/main/scala/com/microsoft/CNTK/SerializableFunction.scala rename to deep-learning/src/main/scala/com/microsoft/CNTK/SerializableFunction.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cntk/CNTKFunctionParam.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/CNTKFunctionParam.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cntk/CNTKFunctionParam.scala rename to deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/CNTKFunctionParam.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cntk/CNTKModel.scala 
b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/CNTKModel.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cntk/CNTKModel.scala rename to deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/CNTKModel.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cntk/ConversionUtils.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/ConversionUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cntk/ConversionUtils.scala rename to deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/ConversionUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cntk/_CNTKModel.txt b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/_CNTKModel.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cntk/_CNTKModel.txt rename to deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/_CNTKModel.txt diff --git a/src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala similarity index 91% rename from src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala rename to deep-learning/src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala index 3b68d0ee507..54f890242b4 100644 --- a/src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala +++ b/deep-learning/src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala @@ -7,6 +7,7 @@ import java.io._ import java.net.{URI, URL} import java.util +import com.microsoft.ml.spark.core.utils.FaultToleranceUtils import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.{Configuration => HadoopConf} import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path} @@ -15,10 +16,8 @@ import org.apache.log4j.LogManager import org.apache.spark.sql.SparkSession import spray.json._ -import scala.annotation.tailrec import scala.collection.JavaConverters._ -import 
scala.concurrent.duration.{Duration, FiniteDuration} -import scala.concurrent.{Await, ExecutionContext, Future} +import scala.concurrent.duration.Duration /** Abstract representation of a repository for future expansion * @@ -34,32 +33,6 @@ private[spark] abstract class Repository[S <: Schema] { } -object FaultToleranceUtils { - def retryWithTimeout[T](times: Int, timeout: Duration)(f: => T): T ={ - try { - Await.result(Future(f)(ExecutionContext.global), timeout) - } catch { - case e: Exception if times >= 1 => - print(s"Received exception on call, retrying: $e") - retryWithTimeout(times-1, timeout)(f) - } - } - - val Backoffs: Seq[Int] = Seq(0, 100, 200, 500) - - def retryWithTimeout[T](times: Seq[Int] = Backoffs)(f: => T): T ={ - try { - f - } catch { - case e: Exception if times.nonEmpty => - println(s"Received exception on call, retrying: $e") - Thread.sleep(times.head) - retryWithTimeout(times.tail)(f) - } - } - -} - /** Exception returned if a repo cannot find the file * * @param uri : location of the file diff --git a/src/main/scala/com/microsoft/ml/spark/downloader/Schema.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/downloader/Schema.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/downloader/Schema.scala rename to deep-learning/src/main/scala/com/microsoft/ml/spark/downloader/Schema.scala diff --git a/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.scala similarity index 99% rename from src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.scala rename to deep-learning/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.scala index 2db42e83b0c..e7dc8c7b4a1 100644 --- a/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.scala +++ b/deep-learning/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.scala @@ -132,7 +132,7 @@ class ImageFeaturizer(val uid: String) extends Transformer with 
HasInputCol with /** @group getParam */ def getLayerNames: Array[String] = $(layerNames) - setDefault(cutOutputLayers -> 1, outputCol -> (uid + "_output"), dropNa->true) + setDefault(cutOutputLayers -> 1, outputCol -> (uid + "_output"), dropNa -> true) override def transform(dataset: Dataset[_]): DataFrame = { logTransform[DataFrame]({ @@ -194,4 +194,4 @@ class ImageFeaturizer(val uid: String) extends Transformer with HasInputCol with schema.add(getOutputCol, VectorType) } -} +} \ No newline at end of file diff --git a/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.txt b/deep-learning/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.txt rename to deep-learning/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.txt diff --git a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala similarity index 97% rename from src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala rename to deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala index 37b4b1ad615..f8483945360 100644 --- a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala +++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala @@ -9,11 +9,12 @@ import com.microsoft.CNTK.CNTKExtensions._ import com.microsoft.CNTK.{SerializableFunction => CNTKFunction, _} import com.microsoft.ml.spark.core.env.StreamUtilities._ import com.microsoft.ml.spark.core.test.base.LinuxOnly +import com.microsoft.ml.spark.image.ImageTestUtils import org.apache.commons.io.IOUtils import scala.collection.JavaConverters._ -class CNTKBindingSuite extends LinuxOnly with CNTKTestUtils { +class CNTKBindingSuite extends LinuxOnly with ImageTestUtils { def toSeqSeq(fvv: FloatVectorVector): Seq[Seq[Float]] = { (0 until fvv.size.toInt).map(i => diff 
--git a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala similarity index 97% rename from src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala rename to deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala index 34893a7015c..8d2285be0ad 100644 --- a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala +++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala @@ -10,6 +10,7 @@ import com.microsoft.ml.spark.build.BuildInfo import com.microsoft.ml.spark.core.env.FileUtilities import com.microsoft.ml.spark.core.test.base.LinuxOnly import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} +import com.microsoft.ml.spark.image.ImageTestUtils import org.apache.commons.io.FileUtils import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.linalg.DenseVector @@ -21,7 +22,7 @@ import org.apache.spark.sql.types._ import scala.util.Random -class CNTKModelSuite extends LinuxOnly with CNTKTestUtils with TransformerFuzzing[CNTKModel] { +class CNTKModelSuite extends LinuxOnly with ImageTestUtils with TransformerFuzzing[CNTKModel] { // TODO: Move away from getTempDirectoryPath and have TestBase provide one @@ -54,7 +55,7 @@ class CNTKModelSuite extends LinuxOnly with CNTKTestUtils with TransformerFuzzin .setOutputNodeIndex(0) } - lazy val images = testImages(spark) + override lazy val images = testImages(spark) import spark.implicits._ diff --git a/src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala similarity index 97% rename from src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala rename to deep-learning/src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala index ee6d53933a0..f67e4b82d5c 100644 --- 
a/src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala +++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala @@ -7,6 +7,7 @@ import java.io.File import java.nio.file.Files import com.microsoft.ml.spark.core.test.base.TestBase +import com.microsoft.ml.spark.core.utils.FaultToleranceUtils import org.apache.commons.io.FileUtils import scala.collection.JavaConverters._ diff --git a/src/test/scala/com/microsoft/ml/spark/image/ImageFeaturizerSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/image/ImageFeaturizerSuite.scala similarity index 81% rename from src/test/scala/com/microsoft/ml/spark/image/ImageFeaturizerSuite.scala rename to deep-learning/src/test/scala/com/microsoft/ml/spark/image/ImageFeaturizerSuite.scala index 247c7a421e1..6733d1fa674 100644 --- a/src/test/scala/com/microsoft/ml/spark/image/ImageFeaturizerSuite.scala +++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/image/ImageFeaturizerSuite.scala @@ -8,24 +8,20 @@ import java.net.{URI, URL} import com.microsoft.ml.spark.Secrets import com.microsoft.ml.spark.build.BuildInfo -import com.microsoft.ml.spark.cntk.CNTKTestUtils import com.microsoft.ml.spark.core.env.FileUtilities -import com.microsoft.ml.spark.core.test.base.TestBase import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} -import com.microsoft.ml.spark.core.utils.ModelEquality import com.microsoft.ml.spark.downloader.{ModelDownloader, ModelSchema} import com.microsoft.ml.spark.io.IOImplicits._ import com.microsoft.ml.spark.io.powerbi.PowerBIWriter import com.microsoft.ml.spark.io.split1.FileReaderUtils -import org.apache.commons.io.FileUtils import org.apache.spark.injections.UDFUtils import org.apache.spark.ml.linalg.DenseVector import org.apache.spark.ml.util.MLReadable import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.functions.{col, udf} +import org.apache.spark.sql.functions.col import 
org.apache.spark.sql.types.StringType -trait NetworkUtils extends CNTKTestUtils with FileReaderUtils { +trait TrainedCNTKModelUtils extends ImageTestUtils with FileReaderUtils { lazy val modelDir = new File(filesRoot, "CNTKModel") lazy val modelDownloader = new ModelDownloader(spark, modelDir.toURI) @@ -33,33 +29,6 @@ trait NetworkUtils extends CNTKTestUtils with FileReaderUtils { lazy val resNetUri: URI = new File(modelDir, "ResNet50_ImageNet.model").toURI lazy val resNet: ModelSchema = modelDownloader.downloadByName("ResNet50") - lazy val images: DataFrame = spark.read.image.load(imagePath) - .withColumnRenamed("image", inputCol) - lazy val binaryImages: DataFrame = spark.read.binary.load(imagePath) - .select(col("value.bytes").alias(inputCol)) - - lazy val groceriesPath = FileUtilities.join(BuildInfo.datasetDir, "Images","Grocery") - lazy val groceryImages: DataFrame = spark.read.image - .option("dropInvalid", true) - .load(groceriesPath + "**") - .withColumnRenamed("image", inputCol) - - lazy val greyscaleImageLocation: String = { - val loc = "/tmp/greyscale.jpg" - val f = new File(loc) - if (f.exists()) {f.delete()} - FileUtils.copyURLToFile(new URL("https://mmlspark.blob.core.windows.net/datasets/LIME/greyscale.jpg"), f) - loc - } - - lazy val greyscaleImage: DataFrame = spark - .read.image.load(greyscaleImageLocation) - .select(col("image").alias(inputCol)) - - lazy val greyscaleBinary: DataFrame = spark - .read.binary.load(greyscaleImageLocation) - .select(col("value.bytes").alias(inputCol)) - def resNetModel(): ImageFeaturizer = new ImageFeaturizer() .setInputCol(inputCol) .setOutputCol(outputCol) @@ -68,7 +37,7 @@ trait NetworkUtils extends CNTKTestUtils with FileReaderUtils { } class ImageFeaturizerSuite extends TransformerFuzzing[ImageFeaturizer] - with NetworkUtils { + with TrainedCNTKModelUtils { test("Image featurizer should reproduce the CIFAR10 experiment") { print(spark) diff --git a/src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala 
b/deep-learning/src/test/scala/com/microsoft/ml/spark/lime/ImageLIMESuite.scala similarity index 65% rename from src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala rename to deep-learning/src/test/scala/com/microsoft/ml/spark/lime/ImageLIMESuite.scala index e83f910e377..892ba9823d8 100644 --- a/src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala +++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/lime/ImageLIMESuite.scala @@ -7,82 +7,23 @@ import java.awt.image.BufferedImage import java.io.File import java.net.URL -import breeze.linalg.{*, DenseMatrix} -import breeze.stats.distributions.Rand -import com.microsoft.ml.spark.core.test.base.TestBase -import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject, TransformerFuzzing} -import com.microsoft.ml.spark.image.{ImageFeaturizer, NetworkUtils} +import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} +import com.microsoft.ml.spark.image.{ImageFeaturizer, TrainedCNTKModelUtils} import com.microsoft.ml.spark.io.IOImplicits._ import com.microsoft.ml.spark.io.image.ImageUtils import com.microsoft.ml.spark.io.split1.FileReaderUtils import com.microsoft.ml.spark.stages.UDFTransformer import com.microsoft.ml.spark.stages.udfs.get_value_udf import org.apache.commons.io.FileUtils -import org.apache.spark.injections.UDFUtils import org.apache.spark.ml.linalg.DenseVector import org.apache.spark.ml.param.DataFrameEquality -import org.apache.spark.ml.regression.LinearRegression import org.apache.spark.ml.util.MLReadable import org.apache.spark.ml.{NamespaceInjections, PipelineModel} import org.apache.spark.sql.functions.col -import org.apache.spark.sql.types.DoubleType import org.apache.spark.sql.{DataFrame, Row} -trait LimeTestBase extends TestBase { - - import spark.implicits._ - - lazy val nRows = 100 - lazy val d1 = 3 - lazy val d2 = 1 - - lazy val m: DenseMatrix[Double] = new DenseMatrix(d1, d2, Array(1.0, -1.0, 2.0)) - lazy val x: DenseMatrix[Double] = 
DenseMatrix.rand(nRows, d1, Rand.gaussian) - lazy val noise: DenseMatrix[Double] = DenseMatrix.rand(nRows, d2, Rand.gaussian) * 0.1 - lazy val y = x * m //+ noise - - lazy val xRows = x(*, ::).iterator.toSeq.map(dv => new DenseVector(dv.toArray)) - lazy val yRows = y(*, ::).iterator.toSeq.map(dv => dv(0)) - lazy val df = xRows.zip(yRows).toDF("features", "label") - - lazy val model = new LinearRegression().fit(df) - - lazy val lime = new TabularLIME() - .setModel(model) - .setInputCol("features") - .setPredictionCol(model.getPredictionCol) - .setOutputCol("out") - .setNSamples(1000) - - lazy val limeModel = lime.fit(df) -} - -class TabularLIMESuite extends EstimatorFuzzing[TabularLIME] with - DataFrameEquality with LimeTestBase { - - test("text lime usage test check") { - val results = limeModel.transform(df).select("out") - .collect().map(_.getAs[DenseVector](0)) - results.foreach(result => assert(result === new DenseVector(m.data))) - } - - override def testObjects(): Seq[TestObject[TabularLIME]] = Seq(new TestObject(lime, df)) - - override def reader: MLReadable[_] = TabularLIME - - override def modelReader: MLReadable[_] = TabularLIMEModel -} - -class TabularLIMEModelSuite extends TransformerFuzzing[TabularLIMEModel] with - DataFrameEquality with LimeTestBase { - - override def testObjects(): Seq[TestObject[TabularLIMEModel]] = Seq(new TestObject(limeModel, df)) - - override def reader: MLReadable[_] = TabularLIMEModel -} - class ImageLIMESuite extends TransformerFuzzing[ImageLIME] with - DataFrameEquality with NetworkUtils with FileReaderUtils { + DataFrameEquality with TrainedCNTKModelUtils with FileReaderUtils { lazy val greyhoundImageLocation: String = { val loc = "/tmp/greyhound.jpg" diff --git a/src/main/python/mmlspark/lightgbm/LightGBMClassificationModel.py b/lightgbm/src/main/python/mmlspark/lightgbm/LightGBMClassificationModel.py similarity index 100% rename from src/main/python/mmlspark/lightgbm/LightGBMClassificationModel.py rename to 
lightgbm/src/main/python/mmlspark/lightgbm/LightGBMClassificationModel.py diff --git a/src/main/python/mmlspark/lightgbm/LightGBMRankerModel.py b/lightgbm/src/main/python/mmlspark/lightgbm/LightGBMRankerModel.py similarity index 100% rename from src/main/python/mmlspark/lightgbm/LightGBMRankerModel.py rename to lightgbm/src/main/python/mmlspark/lightgbm/LightGBMRankerModel.py diff --git a/src/main/python/mmlspark/lightgbm/LightGBMRegressionModel.py b/lightgbm/src/main/python/mmlspark/lightgbm/LightGBMRegressionModel.py similarity index 100% rename from src/main/python/mmlspark/lightgbm/LightGBMRegressionModel.py rename to lightgbm/src/main/python/mmlspark/lightgbm/LightGBMRegressionModel.py diff --git a/src/test/python/mmlsparktest/cyber/__init__.py b/lightgbm/src/main/python/mmlspark/lightgbm/__init__.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/__init__.py rename to lightgbm/src/main/python/mmlspark/lightgbm/__init__.py diff --git a/src/main/python/mmlspark/lightgbm/mixin.py b/lightgbm/src/main/python/mmlspark/lightgbm/mixin.py similarity index 100% rename from src/main/python/mmlspark/lightgbm/mixin.py rename to lightgbm/src/main/python/mmlspark/lightgbm/mixin.py diff --git a/src/main/scala/com/microsoft/lightgbm/SWIG.scala b/lightgbm/src/main/scala/com/microsoft/lightgbm/SWIG.scala similarity index 100% rename from src/main/scala/com/microsoft/lightgbm/SWIG.scala rename to lightgbm/src/main/scala/com/microsoft/lightgbm/SWIG.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMBase.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMBase.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMBase.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMBase.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.scala 
similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.txt b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.txt rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.txt diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMConstants.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMConstants.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMConstants.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMConstants.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMDelegate.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMDelegate.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMDelegate.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMDelegate.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMModelMethods.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMModelMethods.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMModelMethods.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMModelMethods.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.scala diff 
--git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.txt b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.txt rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.txt diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.txt b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.txt rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.txt diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMUtils.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMUtils.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala similarity index 99% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala index 6fc82765b8e..ccecaae33ec 100644 --- a/src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala +++ b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala @@ -8,7 +8,7 @@ import java.net._ import com.microsoft.ml.lightgbm._ import 
com.microsoft.ml.spark.core.env.StreamUtilities._ -import com.microsoft.ml.spark.downloader.FaultToleranceUtils +import com.microsoft.ml.spark.core.utils.FaultToleranceUtils import com.microsoft.ml.spark.lightgbm.booster.LightGBMBooster import com.microsoft.ml.spark.lightgbm.dataset.LightGBMDataset import com.microsoft.ml.spark.lightgbm.params.{ClassifierTrainParams, TrainParams} diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/booster/LightGBMBooster.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/booster/LightGBMBooster.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/booster/LightGBMBooster.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/booster/LightGBMBooster.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/LightGBMDataset.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/LightGBMDataset.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/LightGBMDataset.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/LightGBMDataset.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjParam.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjParam.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjParam.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjParam.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjTrait.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjTrait.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjTrait.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjTrait.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMBoosterParam.scala 
b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMBoosterParam.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMBoosterParam.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMBoosterParam.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMParams.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMParams.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMParams.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMParams.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/params/TrainParams.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/TrainParams.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/params/TrainParams.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/TrainParams.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/swig/SwigUtils.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/swig/SwigUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/swig/SwigUtils.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/swig/SwigUtils.scala diff --git a/src/test/resources/benchmarks/benchmarks_VerifyLightGBMClassifier.csv b/lightgbm/src/test/resources/benchmarks/benchmarks_VerifyLightGBMClassifier.csv similarity index 100% rename from src/test/resources/benchmarks/benchmarks_VerifyLightGBMClassifier.csv rename to lightgbm/src/test/resources/benchmarks/benchmarks_VerifyLightGBMClassifier.csv diff --git a/src/test/resources/benchmarks/benchmarks_VerifyLightGBMRegressor.csv b/lightgbm/src/test/resources/benchmarks/benchmarks_VerifyLightGBMRegressor.csv similarity index 100% rename from 
src/test/resources/benchmarks/benchmarks_VerifyLightGBMRegressor.csv rename to lightgbm/src/test/resources/benchmarks/benchmarks_VerifyLightGBMRegressor.csv diff --git a/src/test/scala/com/microsoft/ml/spark/lightgbm/split1/VerifyLightGBMClassifier.scala b/lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split1/VerifyLightGBMClassifier.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/lightgbm/split1/VerifyLightGBMClassifier.scala rename to lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split1/VerifyLightGBMClassifier.scala diff --git a/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRanker.scala b/lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRanker.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRanker.scala rename to lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRanker.scala diff --git a/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRegressor.scala b/lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRegressor.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRegressor.scala rename to lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRegressor.scala diff --git a/src/main/python/mmlspark/opencv/ImageTransformer.py b/opencv/src/main/python/mmlspark/opencv/ImageTransformer.py similarity index 100% rename from src/main/python/mmlspark/opencv/ImageTransformer.py rename to opencv/src/main/python/mmlspark/opencv/ImageTransformer.py diff --git a/src/test/python/mmlsparktest/cyber/anamoly/__init__.py b/opencv/src/main/python/mmlspark/opencv/__init__.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/anamoly/__init__.py rename to opencv/src/main/python/mmlspark/opencv/__init__.py diff --git 
a/src/main/scala/com/microsoft/ml/spark/image/ImageSetAugmenter.scala b/opencv/src/main/scala/com/microsoft/ml/spark/image/ImageSetAugmenter.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/image/ImageSetAugmenter.scala rename to opencv/src/main/scala/com/microsoft/ml/spark/image/ImageSetAugmenter.scala diff --git a/src/main/scala/com/microsoft/ml/spark/opencv/ImageTransformer.scala b/opencv/src/main/scala/com/microsoft/ml/spark/opencv/ImageTransformer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/opencv/ImageTransformer.scala rename to opencv/src/main/scala/com/microsoft/ml/spark/opencv/ImageTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/opencv/OpenCVUtils.scala b/opencv/src/main/scala/com/microsoft/ml/spark/opencv/OpenCVUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/opencv/OpenCVUtils.scala rename to opencv/src/main/scala/com/microsoft/ml/spark/opencv/OpenCVUtils.scala diff --git a/src/test/scala/com/microsoft/ml/spark/image/ImageSetAugmenterSuite.scala b/opencv/src/test/scala/com/microsoft/ml/spark/image/ImageSetAugmenterSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/image/ImageSetAugmenterSuite.scala rename to opencv/src/test/scala/com/microsoft/ml/spark/image/ImageSetAugmenterSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala b/opencv/src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala similarity index 97% rename from src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala rename to opencv/src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala index 5d05a243ccf..b20b309bb05 100644 --- a/src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala +++ b/opencv/src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala @@ -8,15 +8,15 @@ import 
java.net.URL import com.microsoft.ml.spark.core.env.FileUtilities import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} -import com.microsoft.ml.spark.opencv.{ImageTestUtils, ImageTransformer} +import com.microsoft.ml.spark.io.IOImplicits._ +import com.microsoft.ml.spark.opencv.{ImageTransformer, OpenCVTestUtils} +import org.apache.commons.io.FileUtils import org.apache.spark.ml.linalg.DenseVector import org.apache.spark.ml.util.MLReadable import org.apache.spark.sql.{DataFrame, Row} -import com.microsoft.ml.spark.io.IOImplicits._ -import org.apache.commons.io.FileUtils class ResizeImageTransformerSuite extends TransformerFuzzing[ResizeImageTransformer] - with ImageTestUtils { + with OpenCVTestUtils { lazy val images: DataFrame = spark.read.image .option("dropInvalid", true).load(FileUtilities.join(fileLocation, "**").toString) diff --git a/src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala b/opencv/src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala similarity index 98% rename from src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala rename to opencv/src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala index 6c7ab6dfe53..62a43aa5e93 100644 --- a/src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala +++ b/opencv/src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala @@ -23,7 +23,7 @@ import org.opencv.imgproc.Imgproc import org.scalactic.Equality import org.scalatest.Assertion -trait ImageTestUtils { +trait OpenCVTestUtils { lazy protected val fileLocation = FileUtilities.join(BuildInfo.datasetDir, "Images", "Grocery") protected def selectTestImageBytes(images: DataFrame): Array[Byte] = { @@ -81,7 +81,7 @@ trait ImageTestUtils { } -class UnrollImageSuite extends TransformerFuzzing[UnrollImage] with ImageTestUtils with DataFrameEquality { +class UnrollImageSuite extends TransformerFuzzing[UnrollImage] with OpenCVTestUtils 
with DataFrameEquality { lazy val filesRoot = BuildInfo.datasetDir lazy val imagePath = FileUtilities.join(filesRoot,"Images", "CIFAR").toString @@ -128,7 +128,7 @@ class UnrollImageSuite extends TransformerFuzzing[UnrollImage] with ImageTestUti } class UnrollBinaryImageSuite extends TransformerFuzzing[UnrollBinaryImage] - with ImageTestUtils with DataFrameEquality { + with OpenCVTestUtils with DataFrameEquality { lazy val filesRoot = BuildInfo.datasetDir lazy val imagePath = FileUtilities.join(filesRoot, "Images", "CIFAR").toString @@ -163,7 +163,7 @@ class UnrollBinaryImageSuite extends TransformerFuzzing[UnrollBinaryImage] override def reader: UnrollBinaryImage.type = UnrollBinaryImage } -class ImageTransformerSuite extends TransformerFuzzing[ImageTransformer] with ImageTestUtils { +class ImageTransformerSuite extends TransformerFuzzing[ImageTransformer] with OpenCVTestUtils { //TODO this is needed to stop the build from freezing override def assertDFEq(df1: DataFrame, df2: DataFrame)(implicit eq: Equality[DataFrame]): Unit = { diff --git a/project/BlobMavenPlugin.scala b/project/BlobMavenPlugin.scala new file mode 100644 index 00000000000..de8114172e0 --- /dev/null +++ b/project/BlobMavenPlugin.scala @@ -0,0 +1,48 @@ +import java.io.File + +import BlobMavenPlugin.autoImport.publishBlob +import BuildUtils.{join, uploadToBlob} +import sbt._ +import Keys._ +import org.apache.ivy.core.IvyPatternHelper + +//noinspection ScalaStyle +object BlobMavenPlugin extends AutoPlugin { + override def trigger = allRequirements + + object autoImport { + val publishBlob = TaskKey[Unit]("publishBlob", "publish the library to mmlspark blob") + val blobArtifactInfo = SettingKey[String]("blobArtifactInfo") + } + + import autoImport._ + + override def requires: Plugins = sbt.Plugins.empty + + override lazy val projectSettings: Seq[Setting[_]] = Seq( + publishBlob := { + publishM2.value + //TODO make this more general - 1.0 is a hack and not sure of a way to get this with sbt keys + 
val sourceArtifactName = s"${moduleName.value}_${scalaBinaryVersion.value}_1.0" + val destArtifactName = s"${moduleName.value}" + val repositoryDir = new File(new URI(Resolver.mavenLocal.root)) + val orgDirs = organization.value.split(".".toCharArray.head) + val localPackageFolder = join(repositoryDir, orgDirs ++ Seq(sourceArtifactName, version.value):_*).toString + val blobMavenFolder = (orgDirs ++ Seq(destArtifactName, version.value)).mkString("/") + uploadToBlob(localPackageFolder, blobMavenFolder, "maven") + println(blobArtifactInfo.value) + }, + blobArtifactInfo := { + s""" + |MMLSpark Build and Release Information + |--------------- + | + |### Maven Coordinates + | `${organization.value}:${moduleName.value}:${version.value}` + | + |### Maven Resolver + | `https://mmlspark.azureedge.net/maven` + |""".stripMargin + } + ) +} \ No newline at end of file diff --git a/project/CodegenPlugin.scala b/project/CodegenPlugin.scala new file mode 100644 index 00000000000..0f0270dd150 --- /dev/null +++ b/project/CodegenPlugin.scala @@ -0,0 +1,192 @@ +import java.io.File + +import BuildUtils.{join, runCmd, singleUploadToBlob, uploadToBlob, zipFolder} +import CondaPlugin.autoImport.{activateCondaEnv, condaEnvLocation, createCondaEnvTask} +import org.apache.commons.io.FileUtils +import sbt.Keys._ +import sbt._ +import sbtbuildinfo.BuildInfoPlugin +import sbtunidoc.ScalaUnidocPlugin + +//noinspection ScalaStyle +object CodegenPlugin extends AutoPlugin { + override def trigger = allRequirements + + override def requires: Plugins = CondaPlugin && BuildInfoPlugin + + def rCmd(activateCondaEnv: Seq[String], cmd: Seq[String], wd: File, libPath: String): Unit = { + runCmd(activateCondaEnv ++ cmd, wd, Map("R_LIBS" -> libPath, "R_USER_LIBS" -> libPath)) + } + + object autoImport { + val pythonizedVersion = settingKey[String]("Pythonized version") + val rVersion = settingKey[String]("R version") + val genPackageNamespace = settingKey[String]("genPackageNamespace") + val 
genTestPackageNamespace = settingKey[String]("genTestPackageNamespace") + val genJarName = settingKey[Option[String]]("genJarName") + + val targetDir = settingKey[File]("targetDir") + val codegenDir = settingKey[File]("codegenDir") + + val codegen = TaskKey[Unit]("codegen", "Generate Code") + val testgen = TaskKey[Unit]("testgen", "Generate Tests") + + val packageR = TaskKey[Unit]("packageR", "Generate roxygen docs and zip R package") + val publishR = TaskKey[Unit]("publishR", "publish R package to blob") + val testR = TaskKey[Unit]("testR", "Run testthat on R tests") + + val packagePython = TaskKey[Unit]("packagePython", "Package python sdk") + val installPipPackage = TaskKey[Unit]("installPipPackage", "install python sdk") + val generatePythonDoc = TaskKey[Unit]("generatePythonDoc", "Generate sphinx docs for python") + val publishPython = TaskKey[Unit]("publishPython", "publish python wheel") + val testPython = TaskKey[Unit]("testPython", "test python sdk") + } + + import autoImport._ + import sbtbuildinfo.BuildInfoPlugin.autoImport._ + + override lazy val projectSettings: Seq[Setting[_]] = Seq( + buildInfoKeys ++= Seq[BuildInfoKey]( + pythonizedVersion, + rVersion, + genPackageNamespace, + genTestPackageNamespace, + targetDir, + codegenDir, + name, + version, + scalaVersion, + sbtVersion, + baseDirectory + ), + genJarName := { + Some(artifactName.value( + ScalaVersion(scalaVersion.value, scalaBinaryVersion.value), + projectID.value, + artifact.value)) + }, + codegen := (Def.taskDyn { + (Compile / compile).value + (Test / compile).value + val arg = genJarName.value.map(s => " " + s).getOrElse("") + Def.task { + (Test / runMain).toTask(s" com.microsoft.ml.spark.codegen.CodeGen$arg").value + } + }.value), + testgen := (Def.taskDyn { + (Compile / compile).value + (Test / compile).value + val arg = genJarName.value.map(s => " " + s).getOrElse("") + Def.task { + (Test / runMain).toTask(s" com.microsoft.ml.spark.codegen.TestGen$arg").value + } + }.value), + 
pythonizedVersion := { + if (version.value.contains("-")) { + version.value.split("-".head).head + ".dev1" + } else { + version.value + } + }, + rVersion := { + if (version.value.contains("-")) { + version.value.split("-".head).head + } else { + version.value + } + }, + packageR := { + createCondaEnvTask.value + codegen.value + val rSrcDir = join(codegenDir.value, "src", "R", genPackageNamespace.value) + val rPackageDir = join(codegenDir.value, "package", "R") + val libPath = join(condaEnvLocation.value, "Lib", "R", "library").toString + rCmd(activateCondaEnv.value, Seq("R", "-q", "-e", "roxygen2::roxygenise()"), rSrcDir, libPath) + rPackageDir.mkdirs() + zipFolder(rSrcDir, new File(rPackageDir, s"${name.value}-${version.value}.zip")) + }, + testR := { + packageR.value + publishLocal.value + val libPath = join(condaEnvLocation.value, "Lib", "R", "library").toString + val rSrcDir = join(codegenDir.value, "src", "R", genPackageNamespace.value) + rCmd(activateCondaEnv.value, + Seq("R", "CMD", "INSTALL", "--no-multiarch", "--with-keep.source", genPackageNamespace.value), + rSrcDir.getParentFile, libPath) + val testRunner = join("tools", "tests", "run_r_tests.R").getAbsolutePath + rCmd(activateCondaEnv.value, + Seq("Rscript", testRunner), rSrcDir, libPath) + }, + publishR := { + codegen.value + packageR.value + val rPackageDir = join(codegenDir.value, "package", "R") + val rPackage = rPackageDir.listFiles().head + singleUploadToBlob(rPackage.toString, rPackage.getName, "rrr") + }, + packagePython := { + codegen.value + createCondaEnvTask.value + val destPyDir = join(targetDir.value, "classes", genPackageNamespace.value) + val packageDir = join(codegenDir.value, "package", "python").absolutePath + val pythonSrcDir = join(codegenDir.value, "src", "python") + if (destPyDir.exists()) FileUtils.forceDelete(destPyDir) + val sourcePyDir = join(pythonSrcDir.getAbsolutePath, genPackageNamespace.value) + FileUtils.copyDirectory(sourcePyDir, destPyDir) + runCmd( + 
activateCondaEnv.value ++ + Seq(s"python", "setup.py", "bdist_wheel", "--universal", "-d", packageDir), + pythonSrcDir) + }, + installPipPackage := { + packagePython.value + publishLocal.value + runCmd( + activateCondaEnv.value ++ Seq("pip", "install", "-I", + s"${name.value.replace("-","_")}-${pythonizedVersion.value}-py2.py3-none-any.whl"), + join(codegenDir.value, "package", "python")) + }, + generatePythonDoc := { + installPipPackage.value + val dir = join(codegenDir.value, "src", "python", genPackageNamespace.value) + runCmd(activateCondaEnv.value ++ Seq("sphinx-apidoc", "-f", "-o", "doc", "."), dir) + runCmd(activateCondaEnv.value ++ Seq("sphinx-build", "-b", "html", "doc", "../../../doc/pyspark"), dir) + }, + publishPython := { + publishLocal.value + packagePython.value + val fn = s"${name.value.replace("-","_")}-${pythonizedVersion.value}-py2.py3-none-any.whl" + singleUploadToBlob( + join(codegenDir.value, "package", "python", fn).toString, + version.value + "/" + fn, "pip") + }, + testPython := { + installPipPackage.value + testgen.value + runCmd( + activateCondaEnv.value ++ Seq("python", + "-m", + "pytest", + s"--cov=${genPackageNamespace.value}", + "--junitxml=../../../../python-test-results.xml", + "--cov-report=xml", + genTestPackageNamespace.value + ), + new File(codegenDir.value, "test/python/") + ) + }, + targetDir := { + artifactPath.in(packageBin).in(Compile).value.getParentFile + }, + codegenDir := { + join(targetDir.value, "generated") + }, + genPackageNamespace := { + "mmlspark" + }, + genTestPackageNamespace := { + "mmlspark-test" + } + + ) +} \ No newline at end of file diff --git a/project/CondaPlugin.scala b/project/CondaPlugin.scala new file mode 100644 index 00000000000..66cc7c10e4f --- /dev/null +++ b/project/CondaPlugin.scala @@ -0,0 +1,56 @@ +import BuildUtils.{osPrefix, runCmd} +import sbt._ +import Keys._ + +import scala.sys.process.Process + +//noinspection ScalaStyle +object CondaPlugin extends AutoPlugin { + override def trigger = 
allRequirements + + object autoImport { + val condaEnvName = settingKey[String]("Name of conda environment") + val cleanCondaEnvTask = TaskKey[Unit]("cleanCondaEnv", "create conda env") + val condaEnvLocation = TaskKey[File]("condaEnvLocation", "get install location of conda env") + val createCondaEnvTask = TaskKey[Unit]("createCondaEnv", "create conda env") + val activateCondaEnv = settingKey[Seq[String]]("commands to activate conda environment") + } + + import autoImport._ + override lazy val globalSettings: Seq[Setting[_]] = Seq( + condaEnvName := "mmlspark", + cleanCondaEnvTask := { + runCmd(Seq("conda", "env", "remove", "--name", condaEnvName.value, "-y")) + }, + condaEnvLocation := { + createCondaEnvTask.value + new File(Process("conda env list").lineStream.toList + .map(_.split("\\s+")) + .map(l => (l.head, l.reverse.head)) + .filter(p => p._1 == condaEnvName.value) + .head._2) + }, + createCondaEnvTask := { + val hasEnv = Process("conda env list").lineStream.toList + .map(_.split("\\s+").head).contains(condaEnvName.value) + if (!hasEnv) { + runCmd(Seq("conda", "env", "create", "-f", "environment.yaml")) + } else { + println("Found conda env " + condaEnvName) + } + }, + activateCondaEnv := { + if (sys.props("os.name").toLowerCase.contains("windows")) { + osPrefix ++ Seq("activate", condaEnvName.value, "&&") + } else { + Seq() + //TODO figure out why this doesent work + //Seq("/bin/bash", "-l", "-c", "source activate " + condaEnvName, "&&") + } + } + ) + + override def requires: Plugins = sbt.Plugins.empty + + override lazy val projectSettings: Seq[Setting[_]] = Seq() +} \ No newline at end of file diff --git a/project/build.scala b/project/build.scala index f7816cd5d48..06a930e33d1 100644 --- a/project/build.scala +++ b/project/build.scala @@ -2,8 +2,12 @@ import java.io.File import java.lang.ProcessBuilder.Redirect object BuildUtils { + def join(root: File, folders: String*): File = { + folders.foldLeft(root) { case (f, s) => new File(f, s) } + } + def 
join(folders: String*): File = { - folders.tail.foldLeft(new File(folders.head)) { case (f, s) => new File(f, s) } + join(new File(folders.head), folders.tail: _*) } def isWindows: Boolean = { @@ -27,7 +31,7 @@ object BuildUtils { .redirectError(Redirect.INHERIT) .redirectOutput(Redirect.INHERIT) val env = pb.environment() - envVars.foreach(p =>env.put(p._1,p._2)) + envVars.foreach(p => env.put(p._1, p._2)) assert(pb.start().waitFor() == 0) } @@ -56,6 +60,7 @@ object BuildUtils { "--account-key", Secrets.storageKey) runCmd(osPrefix ++ command) } + def singleUploadToBlob(source: String, dest: String, container: String, @@ -76,6 +81,7 @@ object BuildUtils { val (dirs, files) = dir.listFiles.sorted.partition(_.isDirectory) (if (pred == null) files else files.filter(pred)) ++ dirs.flatMap(loop) } + loop(dir) } @@ -91,7 +97,9 @@ object BuildUtils { zip.putNextEntry(new ZipEntry(file.toString.substring(prefixLen).replace(java.io.File.separator, "/"))) val in = new BufferedInputStream(new FileInputStream(file), bufferSize) var b = 0 - while (b >= 0) { zip.write(data, 0, b); b = in.read(data, 0, bufferSize) } + while (b >= 0) { + zip.write(data, 0, b); b = in.read(data, 0, bufferSize) + } in.close() zip.closeEntry() } diff --git a/project/plugins.sbt b/project/plugins.sbt index cc082cf59b0..6f4bd427f23 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -4,4 +4,4 @@ addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.9.0") addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.8") addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.1") addSbtPlugin("com.dwijnand" % "sbt-dynver" % "4.0.0") -addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.6.0") \ No newline at end of file +addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.6.0") diff --git a/src/main/python/setup.py b/src/main/python/setup.py deleted file mode 100644 index 3ba8474be22..00000000000 --- a/src/main/python/setup.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (C) Microsoft Corporation. All rights reserved. 
-# Licensed under the MIT License. See LICENSE in project root for information. - -import os -from setuptools import setup, find_packages -import codecs -import os.path - - -def read(rel_path): - here = os.path.abspath(os.path.dirname(__file__)) - with codecs.open(os.path.join(here, rel_path), "r") as fp: - return fp.read() - - -def get_version(rel_path): - for line in read(rel_path).splitlines(): - if line.startswith("__version__"): - delim = '"' if '"' in line else "'" - return line.split(delim)[1] - return "0.0.0" - - -setup( - name="mmlspark", - version=get_version("mmlspark/__init__.py"), - description="Microsoft ML for Spark", - long_description="Microsoft ML for Apache Spark contains Microsoft's open source " - + "contributions to the Apache Spark ecosystem", - license="MIT", - packages=find_packages(), - url="https://github.com/Azure/mmlspark", - author="Microsoft", - author_email="mmlspark-support@microsoft.com", - classifiers=[ - "Development Status :: 3 - Alpha", - "Intended Audience :: Developers", - "Intended Audience :: Data Scientists", - "Topic :: Software Development :: Datascience Tools", - "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 2", - "Programming Language :: Python :: 3", - ], - zip_safe=True, - package_data={"mmlspark": ["../LICENSE.txt", "../README.txt"]}, -) diff --git a/src/test/python/mmlsparktest/nn/__init__.py b/src/test/python/mmlsparktest/nn/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/src/test/python/mmlsparktest/recommendation/__init__.py b/src/test/python/mmlsparktest/recommendation/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/src/test/python/mmlsparktest/vw/__init__.py b/src/test/python/mmlsparktest/vw/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKTestUtils.scala b/src/test/scala/com/microsoft/ml/spark/cntk/CNTKTestUtils.scala deleted file 
mode 100644 index 4981013301c..00000000000 --- a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKTestUtils.scala +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package com.microsoft.ml.spark.cntk - -import java.io.File - -import com.microsoft.ml.spark.build.BuildInfo -import com.microsoft.ml.spark.core.env.FileUtilities -import com.microsoft.ml.spark.core.test.base.TestBase -import com.microsoft.ml.spark.image.UnrollImage -import org.apache.spark.ml.linalg.DenseVector -import org.apache.spark.sql._ -import com.microsoft.ml.spark.io.IOImplicits._ - -trait CNTKTestUtils extends TestBase { - - val filesRoot = BuildInfo.datasetDir.toString - val imagePath = FileUtilities.join(filesRoot, "Images", "CIFAR").toString - val modelPath = FileUtilities.join(filesRoot, "CNTKModel", "ConvNet_CIFAR10.model").toString - val inputCol = "cntk_images" - val outputCol = "out" - val labelCol = "labels" - - val featureVectorLength = 3 * 32 * 32 - lazy val saveFile = new File(tmpDir.toFile, "spark-z.model").toString - - def testModelDF(spark: SparkSession): DataFrame = { - import spark.implicits._ - spark.sparkContext.parallelize(Seq( - Array(1.32165250, -2.1215112, 0.63150704, 0.77315974, -1.28163720, - -0.20210080, -2.2839167, -2.08691480, 5.08418200, -1.33741090), - Array(3.44079640, 1.4877119, -0.74059330, -0.34381202, -2.48724990, - -2.62866950, -3.1693816, -3.14182600, 4.76314800, 0.68712880), - Array(-1.88747900, -4.7685330, 0.15169683, 6.80547570, -0.38405967, - 3.41065170, 1.3302778, -0.87714905, -2.18046050, -4.16661830), - Array(5.01010300, 3.9860306, -1.36795600, -0.89830830, -4.49545430, - -4.19537070, -4.4045380, -5.81759450, 6.93805700, 1.49001510), - Array(-4.70754600, -6.0414960, 1.20658250, 5.40738300, 1.07661690, - 4.71566440, 4.3834330, -1.57187440, -2.96569730, -5.43208270), - Array(-1.23873880, -3.2042341, 2.54533000, 5.51954800, 
2.89042470, - 0.12380804, 3.8639085, -4.79466800, -2.41463420, -5.17418430))).toDF - } - - def testImages(spark: SparkSession): DataFrame = { - val images = spark.read.image.load(imagePath) - - val unroll = new UnrollImage().setInputCol("image").setOutputCol(inputCol) - - unroll.transform(images).select(inputCol) - } - - def makeFakeData(spark: SparkSession, rows: Int, size: Int, outputDouble: Boolean = false): DataFrame = { - import spark.implicits._ - if (outputDouble) { - List - .fill(rows)(List.fill(size)(0.0).toArray) - .zip(List.fill(rows)(0.0)) - .toDF(inputCol, labelCol) - } else { - List - .fill(rows)(List.fill(size)(0.0.toFloat).toArray) - .zip(List.fill(rows)(0.0)) - .toDF(inputCol, labelCol) - } - } - - protected def compareToTestModel(result: DataFrame) = { - //TODO improve checks - assert(result.columns.toSet == Set(inputCol, outputCol)) - assert(result.count() == testModelDF(result.sparkSession).count()) - val max = result - .select(outputCol) - .collect() - .map(row => row.getAs[DenseVector](0).toArray.max) - .max - assert(max < 10 & max > -10) - } - -} diff --git a/src/main/python/mmlspark/vw/VowpalWabbitClassificationModel.py b/vw/src/main/python/mmlspark/vw/VowpalWabbitClassificationModel.py similarity index 100% rename from src/main/python/mmlspark/vw/VowpalWabbitClassificationModel.py rename to vw/src/main/python/mmlspark/vw/VowpalWabbitClassificationModel.py diff --git a/src/main/python/mmlspark/vw/VowpalWabbitClassifier.py b/vw/src/main/python/mmlspark/vw/VowpalWabbitClassifier.py similarity index 97% rename from src/main/python/mmlspark/vw/VowpalWabbitClassifier.py rename to vw/src/main/python/mmlspark/vw/VowpalWabbitClassifier.py index ba9d72dc1ee..ac33082148c 100644 --- a/src/main/python/mmlspark/vw/VowpalWabbitClassifier.py +++ b/vw/src/main/python/mmlspark/vw/VowpalWabbitClassifier.py @@ -1,14 +1,14 @@ -# Copyright (C) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
See LICENSE in project root for information. - -from mmlspark.vw._VowpalWabbitClassifier import _VowpalWabbitClassifier -from pyspark.ml.common import inherit_doc - -@inherit_doc -class VowpalWabbitClassifier(_VowpalWabbitClassifier): - - def setInitialModel(self, model): - """ - Initialize the estimator with a previously trained model. - """ - self._java_obj.setInitialModel(model._java_obj.getModel()) +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +from mmlspark.vw._VowpalWabbitClassifier import _VowpalWabbitClassifier +from pyspark.ml.common import inherit_doc + +@inherit_doc +class VowpalWabbitClassifier(_VowpalWabbitClassifier): + + def setInitialModel(self, model): + """ + Initialize the estimator with a previously trained model. + """ + self._java_obj.setInitialModel(model._java_obj.getModel()) diff --git a/src/main/python/mmlspark/vw/VowpalWabbitContextualBandit.py b/vw/src/main/python/mmlspark/vw/VowpalWabbitContextualBandit.py similarity index 100% rename from src/main/python/mmlspark/vw/VowpalWabbitContextualBandit.py rename to vw/src/main/python/mmlspark/vw/VowpalWabbitContextualBandit.py diff --git a/src/main/python/mmlspark/vw/VowpalWabbitContextualBanditModel.py b/vw/src/main/python/mmlspark/vw/VowpalWabbitContextualBanditModel.py similarity index 100% rename from src/main/python/mmlspark/vw/VowpalWabbitContextualBanditModel.py rename to vw/src/main/python/mmlspark/vw/VowpalWabbitContextualBanditModel.py diff --git a/src/main/python/mmlspark/vw/VowpalWabbitRegressionModel.py b/vw/src/main/python/mmlspark/vw/VowpalWabbitRegressionModel.py similarity index 100% rename from src/main/python/mmlspark/vw/VowpalWabbitRegressionModel.py rename to vw/src/main/python/mmlspark/vw/VowpalWabbitRegressionModel.py diff --git a/src/main/python/mmlspark/vw/VowpalWabbitRegressor.py b/vw/src/main/python/mmlspark/vw/VowpalWabbitRegressor.py similarity index 100% rename from 
src/main/python/mmlspark/vw/VowpalWabbitRegressor.py rename to vw/src/main/python/mmlspark/vw/VowpalWabbitRegressor.py diff --git a/src/test/python/mmlsparktest/cyber/feature/__init__.py b/vw/src/main/python/mmlspark/vw/__init__.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/feature/__init__.py rename to vw/src/main/python/mmlspark/vw/__init__.py diff --git a/src/main/scala/com/microsoft/ml/spark/vw/HasNumBits.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/HasNumBits.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/HasNumBits.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/HasNumBits.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/HasSumcollisions.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/HasSumcollisions.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/HasSumcollisions.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/HasSumcollisions.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VectorUtils.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VectorUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/VectorUtils.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VectorUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VectorZipper.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VectorZipper.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/VectorZipper.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VectorZipper.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBase.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBase.scala similarity index 99% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBase.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBase.scala index 401daeadd24..59c983aac1b 100644 --- 
a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBase.scala +++ b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBase.scala @@ -9,7 +9,7 @@ import com.microsoft.ml.spark.codegen.Wrappable import com.microsoft.ml.spark.core.contracts.HasWeightCol import com.microsoft.ml.spark.core.env.StreamUtilities import com.microsoft.ml.spark.core.utils.{ClusterUtil, StopWatch} -import com.microsoft.ml.spark.downloader.FaultToleranceUtils +import com.microsoft.ml.spark.core.utils.FaultToleranceUtils import org.apache.spark.TaskContext import org.apache.spark.internal._ import org.apache.spark.ml.param._ diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBaseModel.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBaseModel.scala similarity index 94% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBaseModel.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBaseModel.scala index 46b85505e73..d0208829915 100644 --- a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBaseModel.scala +++ b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBaseModel.scala @@ -4,15 +4,13 @@ package com.microsoft.ml.spark.vw import com.microsoft.ml.spark.core.env.StreamUtilities -import com.microsoft.ml.spark.downloader.FaultToleranceUtils +import com.microsoft.ml.spark.core.utils.FaultToleranceUtils import org.apache.spark.binary.BinaryFileFormat -import org.apache.spark.ml.ComplexParamsWritable -import org.apache.spark.ml.linalg.{DenseVector, SparseVector} import org.apache.spark.ml.param.{ByteArrayParam, DataFrameParam, Param} import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.sql.functions.{col, struct, udf} import org.apache.spark.sql.types.StructType -import org.vowpalwabbit.spark.{VowpalWabbitArguments, VowpalWabbitExample, VowpalWabbitMurmur, VowpalWabbitNative} +import org.vowpalwabbit.spark.{VowpalWabbitArguments, VowpalWabbitExample, VowpalWabbitNative} import 
org.vowpalwabbit.spark.prediction.ScalarPrediction import scala.io.Source diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitClassifier.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitClassifier.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitClassifier.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitClassifier.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitContextualBandit.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitContextualBandit.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitContextualBandit.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitContextualBandit.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitFeaturizer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitFeaturizer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitInteractions.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitInteractions.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitInteractions.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitInteractions.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitMurmurWithPrefix.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitMurmurWithPrefix.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitMurmurWithPrefix.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitMurmurWithPrefix.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitRegressor.scala 
b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitRegressor.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitRegressor.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitRegressor.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitUtil.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitUtil.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitUtil.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitUtil.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/BooleanFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/BooleanFeaturizer.scala similarity index 97% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/BooleanFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/BooleanFeaturizer.scala index 75dd1d651ae..7ae43e536d0 100644 --- a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/BooleanFeaturizer.scala +++ b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/BooleanFeaturizer.scala @@ -1,52 +1,52 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package com.microsoft.ml.spark.vw.featurizer - -import org.apache.spark.sql.Row -import org.vowpalwabbit.spark.VowpalWabbitMurmur - -import scala.collection.mutable - -/** - * Featurize boolean value into native VW structure. (True = hash(feature name):1, False ignored). - * @param fieldIdx input field index. - * @param columnName used as feature name. - * @param namespaceHash pre-hashed namespace. - * @param mask bit mask applied to final hash. 
- */ -private[ml] class BooleanFeaturizer(override val fieldIdx: Int, - override val columnName: String, - namespaceHash: Int, mask: Int) - extends Featurizer(fieldIdx) with ElementFeaturizer[Boolean] { - - /** - * Pre-hashed feature index. - */ - val featureIdx: Int = mask & VowpalWabbitMurmur.hash(columnName, namespaceHash) - - /** - * Featurize a single row. - * @param row input row. - * @param indices output indices. - * @param values output values. - * @note this interface isn't very Scala idiomatic, but it avoids lots of allocation. - * Also due to SparseVector limitations we don't support 64bit indices (e.g. indices are signed 32bit ints) - */ - override def featurize(row: Row, - indices: mutable.ArrayBuilder[Int], - values: mutable.ArrayBuilder[Double]): Unit = { - - featurize(0, row.getBoolean(fieldIdx), indices, values) - } - - def featurize(idx: Int, - value: Boolean, - indices: mutable.ArrayBuilder[Int], - values: mutable.ArrayBuilder[Double]): Unit = { - if (value) { - indices += featureIdx + idx - values += 1.0 - } - } -} +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.vw.featurizer + +import org.apache.spark.sql.Row +import org.vowpalwabbit.spark.VowpalWabbitMurmur + +import scala.collection.mutable + +/** + * Featurize boolean value into native VW structure. (True = hash(feature name):1, False ignored). + * @param fieldIdx input field index. + * @param columnName used as feature name. + * @param namespaceHash pre-hashed namespace. + * @param mask bit mask applied to final hash. + */ +private[ml] class BooleanFeaturizer(override val fieldIdx: Int, + override val columnName: String, + namespaceHash: Int, mask: Int) + extends Featurizer(fieldIdx) with ElementFeaturizer[Boolean] { + + /** + * Pre-hashed feature index. 
+ */ + val featureIdx: Int = mask & VowpalWabbitMurmur.hash(columnName, namespaceHash) + + /** + * Featurize a single row. + * @param row input row. + * @param indices output indices. + * @param values output values. + * @note this interface isn't very Scala idiomatic, but it avoids lots of allocation. + * Also due to SparseVector limitations we don't support 64bit indices (e.g. indices are signed 32bit ints) + */ + override def featurize(row: Row, + indices: mutable.ArrayBuilder[Int], + values: mutable.ArrayBuilder[Double]): Unit = { + + featurize(0, row.getBoolean(fieldIdx), indices, values) + } + + def featurize(idx: Int, + value: Boolean, + indices: mutable.ArrayBuilder[Int], + values: mutable.ArrayBuilder[Double]): Unit = { + if (value) { + indices += featureIdx + idx + values += 1.0 + } + } +} diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/ElementFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/ElementFeaturizer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/ElementFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/ElementFeaturizer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/Featurizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/Featurizer.scala similarity index 97% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/Featurizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/Featurizer.scala index a8d6bf1353e..deceb8ddd7a 100644 --- a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/Featurizer.scala +++ b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/Featurizer.scala @@ -1,29 +1,29 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. 
- -package com.microsoft.ml.spark.vw.featurizer - -import com.microsoft.ml.spark.vw.VowpalWabbitMurmurWithPrefix -import org.apache.spark.sql.Row - -import scala.collection.mutable - -private[ml] abstract class Featurizer(val fieldIdx: Int) extends Serializable { - - val columnName: String - - /** - * Initialize hasher that already pre-hashes the column prefix. - */ - protected lazy val hasher: VowpalWabbitMurmurWithPrefix = new VowpalWabbitMurmurWithPrefix(columnName) - - /** - * Featurize a single row. - * @param row input row. - * @param indices output indices. - * @param values output values. - * @note this interface isn't very Scala idiomatic, but it avoids lots of allocation. - * Also due to SparseVector limitations we don't support 64bit indices (e.g. indices are signed 32bit ints) - */ - def featurize(row: Row, indices: mutable.ArrayBuilder[Int], values: mutable.ArrayBuilder[Double]): Unit -} +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.vw.featurizer + +import com.microsoft.ml.spark.vw.VowpalWabbitMurmurWithPrefix +import org.apache.spark.sql.Row + +import scala.collection.mutable + +private[ml] abstract class Featurizer(val fieldIdx: Int) extends Serializable { + + val columnName: String + + /** + * Initialize hasher that already pre-hashes the column prefix. + */ + protected lazy val hasher: VowpalWabbitMurmurWithPrefix = new VowpalWabbitMurmurWithPrefix(columnName) + + /** + * Featurize a single row. + * @param row input row. + * @param indices output indices. + * @param values output values. + * @note this interface isn't very Scala idiomatic, but it avoids lots of allocation. + * Also due to SparseVector limitations we don't support 64bit indices (e.g. 
indices are signed 32bit ints) + */ + def featurize(row: Row, indices: mutable.ArrayBuilder[Int], values: mutable.ArrayBuilder[Double]): Unit +} diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapFeaturizer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapFeaturizer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapStringFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapStringFeaturizer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapStringFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapStringFeaturizer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/NumericFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/NumericFeaturizer.scala similarity index 97% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/NumericFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/NumericFeaturizer.scala index c7ade02c07c..cc56a1081b3 100644 --- a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/NumericFeaturizer.scala +++ b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/NumericFeaturizer.scala @@ -1,61 +1,61 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package com.microsoft.ml.spark.vw.featurizer - -import org.apache.spark.sql.Row -import org.vowpalwabbit.spark.VowpalWabbitMurmur - -import scala.collection.mutable - -/** - * Featurize numeric values into native VW structure. ((hash(column name):value) - * @param fieldIdx input field index. - * @param columnName used as feature name prefix. 
- * @param namespaceHash pre-hashed namespace. - * @param mask bit mask applied to final hash. - */ -private[ml] class NumericFeaturizer[T: Numeric](override val fieldIdx: Int, - override val columnName: String, - val namespaceHash: Int, - val mask: Int, - val zero: Numeric[T]) - extends Featurizer(fieldIdx) with ElementFeaturizer[T] { - - /** - * Pre-hashed feature index. - */ - val featureIdx: Int = mask & VowpalWabbitMurmur.hash(columnName, namespaceHash) - - override def featurize(row: Row, - indices: mutable.ArrayBuilder[Int], - values: mutable.ArrayBuilder[Double]): Unit = { - featurize(0, row.getAs[T](fieldIdx), indices, values) - } - - def featurize(idx: Int, - value: T, - indices: mutable.ArrayBuilder[Int], - values: mutable.ArrayBuilder[Double]): Unit = { - // Note: 0 valued features are always filtered. - if (value != zero.zero) { - indices += featureIdx + idx - // This is weird but zero is a numeric typeclass that is used to convert the generic T to a double. - values += zero.toDouble(value) - } - () - } -} - -class NullableNumericFeaturizer[T: Numeric](override val fieldIdx: Int, - override val columnName: String, - override val namespaceHash: Int, - override val mask: Int, - override val zero: Numeric[T]) - extends NumericFeaturizer[T](fieldIdx, columnName, namespaceHash, mask, zero) { - override def featurize(row: Row, - indices: mutable.ArrayBuilder[Int], - values: mutable.ArrayBuilder[Double]): Unit = - if (!row.isNullAt(fieldIdx)) - super.featurize(row, indices, values) -} +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.vw.featurizer + +import org.apache.spark.sql.Row +import org.vowpalwabbit.spark.VowpalWabbitMurmur + +import scala.collection.mutable + +/** + * Featurize numeric values into native VW structure. ((hash(column name):value) + * @param fieldIdx input field index. 
+ * @param columnName used as feature name prefix. + * @param namespaceHash pre-hashed namespace. + * @param mask bit mask applied to final hash. + */ +private[ml] class NumericFeaturizer[T: Numeric](override val fieldIdx: Int, + override val columnName: String, + val namespaceHash: Int, + val mask: Int, + val zero: Numeric[T]) + extends Featurizer(fieldIdx) with ElementFeaturizer[T] { + + /** + * Pre-hashed feature index. + */ + val featureIdx: Int = mask & VowpalWabbitMurmur.hash(columnName, namespaceHash) + + override def featurize(row: Row, + indices: mutable.ArrayBuilder[Int], + values: mutable.ArrayBuilder[Double]): Unit = { + featurize(0, row.getAs[T](fieldIdx), indices, values) + } + + def featurize(idx: Int, + value: T, + indices: mutable.ArrayBuilder[Int], + values: mutable.ArrayBuilder[Double]): Unit = { + // Note: 0 valued features are always filtered. + if (value != zero.zero) { + indices += featureIdx + idx + // This is weird but zero is a numeric typeclass that is used to convert the generic T to a double. 
+ values += zero.toDouble(value) + } + () + } +} + +class NullableNumericFeaturizer[T: Numeric](override val fieldIdx: Int, + override val columnName: String, + override val namespaceHash: Int, + override val mask: Int, + override val zero: Numeric[T]) + extends NumericFeaturizer[T](fieldIdx, columnName, namespaceHash, mask, zero) { + override def featurize(row: Row, + indices: mutable.ArrayBuilder[Int], + values: mutable.ArrayBuilder[Double]): Unit = + if (!row.isNullAt(fieldIdx)) + super.featurize(row, indices, values) +} diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/SeqFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/SeqFeaturizer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/SeqFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/SeqFeaturizer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringFeaturizer.scala similarity index 97% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringFeaturizer.scala index 804f6b482f2..d5821415228 100644 --- a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringFeaturizer.scala +++ b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringFeaturizer.scala @@ -1,47 +1,47 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package com.microsoft.ml.spark.vw.featurizer - -import org.apache.spark.sql.Row - -import scala.collection.mutable - -/** - * Featurize string into native VW structure. (hash(column name + value):1) - * @param fieldIdx input field index. - * @param columnName used as feature name prefix. - * @param namespaceHash pre-hashed namespace. - * @param mask bit mask applied to final hash. 
- */ -private[ml] class StringFeaturizer(override val fieldIdx: Int, - override val columnName: String, - val namespaceHash: Int, - val mask: Int) - extends Featurizer(fieldIdx) with ElementFeaturizer[String] { - - /** - * Featurize a single row. - * @param row input row. - * @param indices output indices. - * @param values output values. - * @note this interface isn't very Scala-esce, but it avoids lots of allocation. - * Also due to SparseVector limitations we don't support 64bit indices (e.g. indices are signed 32bit ints) - */ - override def featurize(row: Row, indices: mutable.ArrayBuilder[Int], values: mutable.ArrayBuilder[Double]): Unit = { - featurize(0, row.getString(fieldIdx), indices, values) - - () - } - - def featurize(idx: Int, - value: String, - indices: mutable.ArrayBuilder[Int], - values: mutable.ArrayBuilder[Double]): Unit = { - - if (value != null && !value.isEmpty) { - indices += mask & hasher.hash(value, namespaceHash) - values += 1.0 - } - } -} +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.vw.featurizer + +import org.apache.spark.sql.Row + +import scala.collection.mutable + +/** + * Featurize string into native VW structure. (hash(column name + value):1) + * @param fieldIdx input field index. + * @param columnName used as feature name prefix. + * @param namespaceHash pre-hashed namespace. + * @param mask bit mask applied to final hash. + */ +private[ml] class StringFeaturizer(override val fieldIdx: Int, + override val columnName: String, + val namespaceHash: Int, + val mask: Int) + extends Featurizer(fieldIdx) with ElementFeaturizer[String] { + + /** + * Featurize a single row. + * @param row input row. + * @param indices output indices. + * @param values output values. + * @note this interface isn't very Scala-esce, but it avoids lots of allocation. 
+ * Also due to SparseVector limitations we don't support 64bit indices (e.g. indices are signed 32bit ints) + */ + override def featurize(row: Row, indices: mutable.ArrayBuilder[Int], values: mutable.ArrayBuilder[Double]): Unit = { + featurize(0, row.getString(fieldIdx), indices, values) + + () + } + + def featurize(idx: Int, + value: String, + indices: mutable.ArrayBuilder[Int], + values: mutable.ArrayBuilder[Double]): Unit = { + + if (value != null && !value.isEmpty) { + indices += mask & hasher.hash(value, namespaceHash) + values += 1.0 + } + } +} diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringSplitFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringSplitFeaturizer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringSplitFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringSplitFeaturizer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StructFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StructFeaturizer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/StructFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StructFeaturizer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/VectorFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/VectorFeaturizer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/VectorFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/VectorFeaturizer.scala diff --git a/src/test/python/mmlsparktest/cyber/utils/__init__.py b/vw/src/test/python/mmlsparktest/vw/__init__.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/utils/__init__.py rename to vw/src/test/python/mmlsparktest/vw/__init__.py diff --git a/src/test/python/mmlsparktest/vw/test_vw.py 
b/vw/src/test/python/mmlsparktest/vw/test_vw.py similarity index 100% rename from src/test/python/mmlsparktest/vw/test_vw.py rename to vw/src/test/python/mmlsparktest/vw/test_vw.py diff --git a/src/test/python/mmlsparktest/vw/test_vw_cb.py b/vw/src/test/python/mmlsparktest/vw/test_vw_cb.py similarity index 100% rename from src/test/python/mmlsparktest/vw/test_vw_cb.py rename to vw/src/test/python/mmlsparktest/vw/test_vw_cb.py diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VWContextualBandidSpec.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VWContextualBandidSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/vw/VWContextualBandidSpec.scala rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VWContextualBandidSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VerifyVectorZipper.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVectorZipper.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/vw/VerifyVectorZipper.scala rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVectorZipper.scala diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitClassifier.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitClassifier.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitClassifier.scala rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitClassifier.scala diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitFeaturizer.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitFeaturizer.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitFeaturizer.scala rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitFeaturizer.scala diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitInteractions.scala 
b/vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitInteractions.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitInteractions.scala rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitInteractions.scala diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitMurmurWithPrefix.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitMurmurWithPrefix.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitMurmurWithPrefix.scala rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitMurmurWithPrefix.scala diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitRegressor.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitRegressor.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitRegressor.scala rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitRegressor.scala