diff --git a/.gitignore b/.gitignore index 8fd3247cec1..ec1a9ac15e9 100644 --- a/.gitignore +++ b/.gitignore @@ -47,4 +47,7 @@ node_modules/ .Rproj.user # R output -*.Rout \ No newline at end of file +*.Rout + +# Misc +.bsp diff --git a/build.sbt b/build.sbt index 0d10df561f6..001a2e02916 100644 --- a/build.sbt +++ b/build.sbt @@ -2,21 +2,16 @@ import java.io.{File, PrintWriter} import java.net.URL import org.apache.commons.io.FileUtils import sbt.ExclusionRule -import sbt.internal.util.ManagedLogger - import scala.xml.{Node => XmlNode, NodeSeq => XmlNodeSeq, _} import scala.xml.transform.{RewriteRule, RuleTransformer} -import scala.sys.process.Process import BuildUtils._ +import CodegenPlugin.autoImport.publishPython +import xerial.sbt.Sonatype._ -val condaEnvName = "mmlspark" -name := "mmlspark" -organization := "com.microsoft.ml.spark" -scalaVersion := "2.12.10" +ThisBuild / organization := "com.microsoft.ml.spark" +ThisBuild / scalaVersion := "2.12.10" val sparkVersion = "3.0.1" -//val scalaMajorVersion = settingKey[String]("scalaMajorVersion") -//scalaMajorVersion := {scalaVersion.value.split(".".toCharArray).dropRight(0).mkString(".")} val scalaMajorVersion = 2.12 val excludes = Seq( @@ -24,42 +19,28 @@ val excludes = Seq( ExclusionRule("org.scalatest") ) -libraryDependencies ++= Seq( +val coreDependencies = Seq( "org.apache.spark" %% "spark-core" % sparkVersion % "compile", "org.apache.spark" %% "spark-mllib" % sparkVersion % "compile", "org.apache.spark" %% "spark-avro" % sparkVersion % "provided", "org.apache.spark" %% "spark-tags" % sparkVersion % "test", "org.scalatest" %% "scalatest" % "3.0.5" % "test") - -libraryDependencies ++= Seq( +val extraDependencies = Seq( "org.scalactic" %% "scalactic" % "3.0.5", "io.spray" %% "spray-json" % "1.3.2", - "com.microsoft.cntk" % "cntk" % "2.4", - "org.openpnp" % "opencv" % "3.2.0-1", "com.jcraft" % "jsch" % "0.1.54", - "com.microsoft.cognitiveservices.speech" % "client-sdk" % "1.14.0", "org.apache.httpcomponents" % 
"httpclient" % "4.5.6", "org.apache.httpcomponents" % "httpmime" % "4.5.6", - "com.microsoft.ml.lightgbm" % "lightgbmlib" % "3.2.110", - "com.github.vowpalwabbit" % "vw-jni" % "8.9.1", "com.linkedin.isolation-forest" %% "isolation-forest_3.0.0" % "1.0.1", ).map(d => d excludeAll (excludes: _*)) +val dependencies = coreDependencies ++ extraDependencies def txt(e: Elem, label: String): String = "\"" + e.child.filter(_.label == label).flatMap(_.text).mkString + "\"" -def activateCondaEnv: Seq[String] = { - if (sys.props("os.name").toLowerCase.contains("windows")) { - osPrefix ++ Seq("activate", condaEnvName, "&&") - } else { - Seq() - //TODO figure out why this doesent work - //Seq("/bin/bash", "-l", "-c", "source activate " + condaEnvName, "&&") - } -} - val omittedDeps = Set(s"spark-core_${scalaMajorVersion}", s"spark-mllib_${scalaMajorVersion}", "org.scala-lang") // skip dependency elements with a scope -pomPostProcess := { (node: XmlNode) => + +def pomPostFunc(node: XmlNode): scala.xml.Node = { new RuleTransformer(new RewriteRule { override def transform(node: XmlNode): XmlNodeSeq = node match { case e: Elem if e.label == "dependency" @@ -77,191 +58,17 @@ pomPostProcess := { (node: XmlNode) => }).transform(node).head } -resolvers += "Speech" at "https://mmlspark.blob.core.windows.net/maven/" - -val createCondaEnvTask = TaskKey[Unit]("createCondaEnv", "create conda env") -createCondaEnvTask := { - val s = streams.value - val hasEnv = Process("conda env list").lineStream.toList - .map(_.split("\\s+").head).contains(condaEnvName) - if (!hasEnv) { - runCmd(Seq("conda", "env", "create", "-f", "environment.yaml")) - } else { - println("Found conda env " + condaEnvName) - } -} - -val condaEnvLocation = TaskKey[String]("condaEnvLocation", "get install location of conda env") -condaEnvLocation := { - val s = streams.value - createCondaEnvTask.value - Process("conda env list").lineStream.toList - .map(_.split("\\s+")) - .map(l => (l.head, l.reverse.head)) - .filter(p => 
p._1 == condaEnvName) - .head._2 -} - - -val cleanCondaEnvTask = TaskKey[Unit]("cleanCondaEnv", "create conda env") -cleanCondaEnvTask := { - runCmd(Seq("conda", "env", "remove", "--name", condaEnvName, "-y")) -} - -val codegenTask = TaskKey[Unit]("codegen", "Generate Code") -codegenTask := { - (runMain in Test).toTask(" com.microsoft.ml.spark.codegen.CodeGen").value -} - -val testgenTask = TaskKey[Unit]("testgen", "Generate Tests") -testgenTask := { - (runMain in Test).toTask(" com.microsoft.ml.spark.codegen.TestGen").value -} - -val genDir = join("target", s"scala-${scalaMajorVersion}", "generated") -val unidocDir = join("target", s"scala-${scalaMajorVersion}", "unidoc") -val pythonSrcDir = join(genDir.toString, "src", "python") -val unifiedDocDir = join(genDir.toString, "doc") -val pythonDocDir = join(unifiedDocDir.toString, "pyspark") -val pythonPackageDir = join(genDir.toString, "package", "python") -val pythonTestDir = join(genDir.toString, "test", "python") -val rSrcDir = join(genDir.toString, "src", "R", "mmlspark") -val rPackageDir = join(genDir.toString, "package", "R") - -val pythonizedVersion = settingKey[String]("Pythonized version") -pythonizedVersion := { - if (version.value.contains("-")) { - version.value.split("-".head).head + ".dev1" - } else { - version.value - } -} - -val rVersion = settingKey[String]("R version") -rVersion := { - if (version.value.contains("-")) { - version.value.split("-".head).head - } else { - version.value - } -} - -def rCmd(cmd: Seq[String], wd: File, libPath: String): Unit = { - runCmd(activateCondaEnv ++ cmd, wd, Map("R_LIBS" -> libPath, "R_USER_LIBS" -> libPath)) -} - -val packageR = TaskKey[Unit]("packageR", "Generate roxygen docs and zip R package") -packageR := { - createCondaEnvTask.value - codegenTask.value - val libPath = join(condaEnvLocation.value, "Lib", "R", "library").toString - rCmd(Seq("R", "-q", "-e", "roxygen2::roxygenise()"), rSrcDir, libPath) - rPackageDir.mkdirs() - zipFolder(rSrcDir, new 
File(rPackageDir, s"mmlspark-${version.value}.zip")) -} - -val testR = TaskKey[Unit]("testR", "Run testthat on R tests") -testR := { - packageR.value - publishLocal.value - val libPath = join(condaEnvLocation.value, "Lib", "R", "library").toString - rCmd(Seq("R", "CMD", "INSTALL", "--no-multiarch", "--with-keep.source", "mmlspark"), rSrcDir.getParentFile, libPath) - val testRunner = join("tools", "tests", "run_r_tests.R").getAbsolutePath - rCmd(Seq("Rscript", testRunner), rSrcDir, libPath) -} - -val publishR = TaskKey[Unit]("publishR", "publish R package to blob") -publishR := { - codegenTask.value - packageR.value - val rPackage = rPackageDir.listFiles().head - singleUploadToBlob(rPackage.toString, rPackage.getName, "rrr") -} - -val packagePythonTask = TaskKey[Unit]("packagePython", "Package python sdk") -packagePythonTask := { - codegenTask.value - createCondaEnvTask.value - val destPyDir = join("target", s"scala-${scalaMajorVersion}", "classes", "mmlspark") - if (destPyDir.exists()) FileUtils.forceDelete(destPyDir) - FileUtils.copyDirectory(join(pythonSrcDir.getAbsolutePath, "mmlspark"), destPyDir) - runCmd( - activateCondaEnv ++ - Seq(s"python", "setup.py", "bdist_wheel", "--universal", "-d", s"${pythonPackageDir.absolutePath}"), - pythonSrcDir) -} - -val installPipPackageTask = TaskKey[Unit]("installPipPackage", "install python sdk") -installPipPackageTask := { - packagePythonTask.value - publishLocal.value - runCmd( - activateCondaEnv ++ Seq("pip", "install", "-I", - s"mmlspark-${pythonizedVersion.value}-py2.py3-none-any.whl"), - pythonPackageDir) -} - -val generatePythonDoc = TaskKey[Unit]("generatePythonDoc", "Generate sphinx docs for python") -generatePythonDoc := { - installPipPackageTask.value - runCmd(activateCondaEnv ++ Seq("sphinx-apidoc", "-f", "-o", "doc", "."), - join(pythonSrcDir.toString, "mmlspark")) - runCmd(activateCondaEnv ++ Seq("sphinx-build", "-b", "html", "doc", "../../../doc/pyspark"), - join(pythonSrcDir.toString, "mmlspark")) -} - -val 
publishDocs = TaskKey[Unit]("publishDocs", "publish docs for scala and python") -publishDocs := { - generatePythonDoc.value - (Compile / unidoc).value - val html = - """ - |
- |pyspark/ - |scala/ - |- """.stripMargin - val scalaDir = join(unifiedDocDir.toString, "scala") - if (scalaDir.exists()) FileUtils.forceDelete(scalaDir) - FileUtils.copyDirectory(unidocDir, scalaDir) - FileUtils.writeStringToFile(join(unifiedDocDir.toString, "index.html"), html, "utf-8") - uploadToBlob(unifiedDocDir.toString, version.value, "docs") -} - -val publishPython = TaskKey[Unit]("publishPython", "publish python wheel") -publishPython := { - publishLocal.value - packagePythonTask.value - singleUploadToBlob( - join(pythonPackageDir.toString, s"mmlspark-${pythonizedVersion.value}-py2.py3-none-any.whl").toString, - version.value + s"/mmlspark-${pythonizedVersion.value}-py2.py3-none-any.whl", - "pip") -} +pomPostProcess := pomPostFunc -val testPythonTask = TaskKey[Unit]("testPython", "test python sdk") - -testPythonTask := { - installPipPackageTask.value - testgenTask.value - runCmd( - activateCondaEnv ++ Seq("python", - "-m", - "pytest", - "--cov=mmlspark", - "--junitxml=../../../../python-test-results.xml", - "--cov-report=xml", - "mmlsparktest" - ), - new File(s"target/scala-${scalaMajorVersion}/generated/test/python/") - ) -} +val speechResolver = "Speech" at "https://mmlspark.blob.core.windows.net/maven/" val getDatasetsTask = TaskKey[Unit]("getDatasets", "download datasets used for testing") val datasetName = "datasets-2020-08-27.tgz" val datasetUrl = new URL(s"https://mmlspark.blob.core.windows.net/installers/$datasetName") val datasetDir = settingKey[File]("The directory that holds the dataset") -datasetDir := { - join(target.value.toString, s"scala-${scalaMajorVersion}", "datasets", datasetName.split(".".toCharArray.head).head) +ThisBuild / datasetDir := { + join(artifactPath.in(packageBin).in(Compile).value.getParentFile, + "datasets", datasetName.split(".".toCharArray.head).head) } getDatasetsTask := { @@ -276,49 +83,40 @@ getDatasetsTask := { val genBuildInfo = TaskKey[Unit]("genBuildInfo", "generate a build info file") genBuildInfo := { - val 
buildInfo = + val docInfo = s""" - |MMLSpark Build and Release Information - |--------------- - | - |### Maven Coordinates - | `${organization.value}:${name.value}_${scalaMajorVersion}:${version.value}` - | - |### Maven Resolver - | `https://mmlspark.azureedge.net/maven` | |### Documentation Pages: |[Scala Documentation](https://mmlspark.blob.core.windows.net/docs/${version.value}/scala/index.html) |[Python Documentation](https://mmlspark.blob.core.windows.net/docs/${version.value}/pyspark/index.html) | """.stripMargin + val buildInfo = (root / blobArtifactInfo).value + docInfo val infoFile = join("target", "Build.md") if (infoFile.exists()) FileUtils.forceDelete(infoFile) FileUtils.writeStringToFile(infoFile, buildInfo, "utf-8") } -val setupTask = TaskKey[Unit]("setup", "set up library for intellij") -setupTask := { - (Compile / compile).toTask.value - (Test / compile).toTask.value - getDatasetsTask.value +val unidocDir = join("target", s"scala-${scalaMajorVersion}", "unidoc") +val publishDocs = TaskKey[Unit]("publishDocs", "publish docs for scala and python") +publishDocs := { + (root / generatePythonDoc).value + (root / Compile / unidoc).value + val html = + """ + |
+ |pyspark/ + |scala/ + |+ """.stripMargin + val unifiedDocDir = join((root / codegenDir).value, "doc") + val scalaDir = join(unifiedDocDir.toString, "scala") + if (scalaDir.exists()) FileUtils.forceDelete(scalaDir) + FileUtils.copyDirectory(unidocDir, scalaDir) + FileUtils.writeStringToFile(join(unifiedDocDir.toString, "index.html"), html, "utf-8") + uploadToBlob(unifiedDocDir.toString, version.value, "docs") } -val publishBlob = TaskKey[Unit]("publishBlob", "publish the library to mmlspark blob") -publishBlob := { - publishM2.value - val scalaVersionSuffix = scalaVersion.value.split(".".toCharArray.head).dropRight(1).mkString(".") - val nameAndScalaVersion = s"${name.value}_$scalaVersionSuffix" - - val localPackageFolder = join( - Seq(new File(new URI(Resolver.mavenLocal.root)).getAbsolutePath) - ++ organization.value.split(".".toCharArray.head) - ++ Seq(nameAndScalaVersion, version.value): _*).toString - - val blobMavenFolder = organization.value.replace(".", "/") + - s"/$nameAndScalaVersion/${version.value}" - uploadToBlob(localPackageFolder, blobMavenFolder, "maven") -} val release = TaskKey[Unit]("release", "publish the library to mmlspark blob") release := Def.taskDyn { @@ -355,11 +153,8 @@ publishBadges := { } val settings = Seq( - (scalastyleConfig in Test) := baseDirectory.value / "scalastyle-test-config.xml", + (scalastyleConfig in Test) := (ThisBuild / baseDirectory).value / "scalastyle-test-config.xml", logBuffered in Test := false, - buildInfoKeys := Seq[BuildInfoKey]( - name, version, scalaVersion, sbtVersion, - baseDirectory, datasetDir, pythonizedVersion, rVersion), parallelExecution in Test := false, test in assembly := {}, assemblyMergeStrategy in assembly := { @@ -367,14 +162,88 @@ val settings = Seq( case x => MergeStrategy.first }, assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false), - buildInfoPackage := "com.microsoft.ml.spark.build") - -lazy val mmlspark = (project in file(".")) - 
.enablePlugins(BuildInfoPlugin) - .enablePlugins(ScalaUnidocPlugin) - .settings(settings: _*) + buildInfoPackage := "com.microsoft.ml.spark.build", + autoAPIMappings := true, + publishMavenStyle := true, + pomPostProcess := pomPostFunc +) +ThisBuild / publishMavenStyle := true + +lazy val core = (project in file("core")) + .enablePlugins(BuildInfoPlugin && SbtPlugin) + .settings((settings ++ Seq( + libraryDependencies := dependencies, + buildInfoKeys += datasetDir, + name := "mmlspark-core" + )): _*) + +lazy val deepLearning = (project in file("deep-learning")) + .enablePlugins(BuildInfoPlugin && SbtPlugin) + .dependsOn(core % "test->test;compile->compile") + .settings((settings ++ Seq( + buildInfoKeys += datasetDir, + libraryDependencies := Seq("com.microsoft.cntk" % "cntk" % "2.4"), + name := "mmlspark-deep-learning" + )): _*) + +lazy val lightgbm = (project in file("lightgbm")) + .enablePlugins(BuildInfoPlugin && SbtPlugin) + .dependsOn(core % "test->test;compile->compile") + .settings((settings ++ Seq( + buildInfoKeys += datasetDir, + libraryDependencies := Seq("com.microsoft.ml.lightgbm" % "lightgbmlib" % "3.2.110"), + name := "mmlspark-lightgbm" + )): _*) + +lazy val vw = (project in file("vw")) + .enablePlugins(BuildInfoPlugin && SbtPlugin) + .dependsOn(core % "test->test;compile->compile") + .settings((settings ++ Seq( + buildInfoKeys += datasetDir, + libraryDependencies := Seq("com.github.vowpalwabbit" % "vw-jni" % "8.9.1"), + name := "mmlspark-vw" + )): _*) + +lazy val cognitive = (project in file("cognitive")) + .enablePlugins(BuildInfoPlugin && SbtPlugin) + .dependsOn(core % "test->test;compile->compile") + .settings((settings ++ Seq( + buildInfoKeys += datasetDir, + libraryDependencies := Seq("com.microsoft.cognitiveservices.speech" % "client-sdk" % "1.14.0"), + resolvers += speechResolver, + name := "mmlspark-cognitive" + )): _*) + +lazy val opencv = (project in file("opencv")) + .enablePlugins(BuildInfoPlugin && SbtPlugin) + .dependsOn(core % 
"test->test;compile->compile") + .settings((settings ++ Seq( + buildInfoKeys += datasetDir, + libraryDependencies := Seq("org.openpnp" % "opencv" % "3.2.0-1"), + name := "mmlspark-opencv" + )): _*) + +lazy val root = (project in file(".")) + .aggregate(core, deepLearning, cognitive, vw, lightgbm, opencv) + .dependsOn(core, deepLearning, cognitive, vw, lightgbm, opencv) + .enablePlugins(BuildInfoPlugin && ScalaUnidocPlugin && SbtPlugin) + .settings(settings ++ Seq( + name := "mmlspark", + buildInfoKeys += datasetDir, + genJarName := None, + publishPython / aggregate := false, + testPython / aggregate := false, + publishR / aggregate := false, + installPipPackage / aggregate := false, + )) -import xerial.sbt.Sonatype._ +val setupTask = TaskKey[Unit]("setup", "set up library for intellij") +setupTask := { + compile.all(ScopeFilter( + inProjects(core, deepLearning, cognitive, vw, lightgbm, opencv), + inConfigurations(Compile, Test))).value + getDatasetsTask.value +} sonatypeProjectHosting := Some( GitHubHosting("Azure", "MMLSpark", "mmlspark-support@microsot.com")) @@ -389,7 +258,6 @@ developers := List( ) licenses += ("MIT", url("https://github.com/Azure/mmlspark/blob/master/LICENSE")) -publishMavenStyle := true credentials += Credentials("Sonatype Nexus Repository Manager", "oss.sonatype.org", @@ -416,6 +284,4 @@ pgpPublicRing := { dynverSonatypeSnapshots in ThisBuild := true dynverSeparator in ThisBuild := "-" -publishTo := sonatypePublishToBundle.value - -// Break Cache - 1 +publishTo := sonatypePublishToBundle.value \ No newline at end of file diff --git a/src/main/python/mmlspark/cognitive/AzureSearchWriter.py b/cognitive/src/main/python/mmlspark/cognitive/AzureSearchWriter.py similarity index 100% rename from src/main/python/mmlspark/cognitive/AzureSearchWriter.py rename to cognitive/src/main/python/mmlspark/cognitive/AzureSearchWriter.py diff --git a/src/main/python/mmlspark/cognitive/BingImageSearch.py 
b/cognitive/src/main/python/mmlspark/cognitive/BingImageSearch.py similarity index 100% rename from src/main/python/mmlspark/cognitive/BingImageSearch.py rename to cognitive/src/main/python/mmlspark/cognitive/BingImageSearch.py diff --git a/src/__init__.py b/cognitive/src/main/python/mmlspark/cognitive/__init__.py similarity index 100% rename from src/__init__.py rename to cognitive/src/main/python/mmlspark/cognitive/__init__.py diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AnamolyDetection.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AnamolyDetection.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AnamolyDetection.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AnamolyDetection.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AnomalyDetectorSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AnomalyDetectorSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AnomalyDetectorSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AnomalyDetectorSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AudioStreams.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AudioStreams.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AudioStreams.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AudioStreams.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala similarity index 99% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala index 96024a68b63..b405bb13b09 100644 --- a/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala +++ 
b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala @@ -143,7 +143,8 @@ object AzureSearchWriter extends IndexParser with SLogging { val Logger: Logger = LogManager.getRootLogger - private def checkForErrors(fatal: Boolean)(errorRow: Row, inputRow: Row): Option[Row] = { + private def checkForErrors( + fatal: Boolean)(errorRow: Row, inputRow: Row): Option[Row] = { Option(errorRow).map { r => val message = s"Service Exception:\n\t ${r.toString()} \n for input:\n\t ${inputRow.toString()}" if (fatal) { diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchAPI.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchAPI.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchAPI.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchAPI.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/BingImageSearch.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/BingImageSearch.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/BingImageSearch.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/BingImageSearch.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceBase.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceBase.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceBase.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceBase.scala diff 
--git a/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVision.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVision.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVision.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVision.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVisionSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVisionSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVisionSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVisionSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/Face.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/Face.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/Face.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/Face.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/FaceSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/FaceSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/FaceSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/FaceSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/ImageSearchSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ImageSearchSchemas.scala similarity index 100% rename from 
src/main/scala/com/microsoft/ml/spark/cognitive/ImageSearchSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ImageSearchSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/OCRSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/OCRSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/OCRSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/OCRSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/RESTHelpers.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/RESTHelpers.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/RESTHelpers.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/RESTHelpers.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechAPI.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechAPI.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/SpeechAPI.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechAPI.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/SpeechSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToText.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToText.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToText.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToText.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala 
b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala similarity index 98% rename from src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala index 51a965b0d08..45447ac5f2d 100644 --- a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala +++ b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala @@ -8,15 +8,17 @@ import java.lang.ProcessBuilder.Redirect import java.net.{URI, URL} import java.util.UUID import java.util.concurrent.{LinkedBlockingQueue, TimeUnit} + import com.microsoft.cognitiveservices.speech._ import com.microsoft.cognitiveservices.speech.audio._ -import com.microsoft.cognitiveservices.speech.transcription.{Conversation, ConversationTranscriber, - ConversationTranscriptionEventArgs, Participant} +import com.microsoft.cognitiveservices.speech.transcription.{ + Conversation, ConversationTranscriber, ConversationTranscriptionEventArgs, Participant} import com.microsoft.cognitiveservices.speech.util.EventHandler import com.microsoft.ml.spark.build.BuildInfo import com.microsoft.ml.spark.cognitive.SpeechFormat._ import com.microsoft.ml.spark.core.contracts.HasOutputCol import com.microsoft.ml.spark.core.schema.{DatasetExtensions, SparkBindings} +import com.microsoft.ml.spark.core.utils.OsUtils import com.microsoft.ml.spark.io.http.HasURL import com.microsoft.ml.spark.logging.BasicLogging import com.microsoft.ml.spark.{CompressedStream, WavStream} @@ -36,10 +38,6 @@ import spray.json._ import scala.concurrent.{ExecutionContext, Future, blocking} import scala.language.existentials -object OsUtils { - val IsWindows: Boolean = System.getProperty("os.name").toLowerCase().indexOf("win") >= 0 -} - object SpeechToTextSDK extends ComplexParamsReadable[SpeechToTextSDK] private[ml] class BlockingQueueIterator[T](lbq: LinkedBlockingQueue[Option[T]], diff --git 
a/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalytics.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalytics.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalytics.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalytics.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemasV2.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemasV2.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemasV2.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemasV2.scala diff --git a/src/main/__init__.py b/cognitive/src/test/python/mmlsparktest/cognitive/__init__.py similarity index 100% rename from src/main/__init__.py rename to cognitive/src/test/python/mmlsparktest/cognitive/__init__.py diff --git a/src/test/python/mmlsparktest/cognitive/test_simple.py b/cognitive/src/test/python/mmlsparktest/cognitive/test_simple.py similarity index 100% rename from src/test/python/mmlsparktest/cognitive/test_simple.py rename to cognitive/src/test/python/mmlsparktest/cognitive/test_simple.py diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/AnamolyDetectionSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/AnamolyDetectionSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/AnamolyDetectionSuite.scala rename to 
cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/AnamolyDetectionSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala similarity index 99% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala index 11a75834a4f..6255d9462b4 100644 --- a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala +++ b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala @@ -9,12 +9,10 @@ import com.microsoft.ml.spark.core.test.base.{Flaky, TestBase} import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} import org.apache.spark.ml.NamespaceInjections.pipelineModel import org.apache.spark.ml.util.MLReadable -import org.apache.spark.sql.functions.{corr, typedLit} +import org.apache.spark.sql.functions.typedLit import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.scalactic.Equality -import org.scalatest.Assertion import com.microsoft.ml.spark.FluentAPI._ -import com.microsoft.ml.spark.featurize.text.PageSplitter trait CognitiveKey { lazy val cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", Secrets.CognitiveApiKey) diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceAPI.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceAPI.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceAPI.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceAPI.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceSuite.scala similarity index 100% rename from 
src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ImageSearchSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ImageSearchSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/ImageSearchSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ImageSearchSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/TextAnalyticsSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/TextAnalyticsSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/TextAnalyticsSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/TextAnalyticsSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SearchWriterSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SearchWriterSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split2/SearchWriterSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SearchWriterSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SpeechToTextSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SpeechToTextSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split2/SpeechToTextSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SpeechToTextSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split3/SpeechToTextSDKSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split3/SpeechToTextSDKSuite.scala similarity index 100% rename from 
src/test/scala/com/microsoft/ml/spark/cognitive/split3/SpeechToTextSDKSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split3/SpeechToTextSDKSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/utils/ModelEqualitySuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/core/utils/utils/ModelEqualitySuite.scala similarity index 94% rename from src/test/scala/com/microsoft/ml/spark/core/utils/ModelEqualitySuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/core/utils/utils/ModelEqualitySuite.scala index 620ab98aa28..d88d70d63af 100644 --- a/src/test/scala/com/microsoft/ml/spark/core/utils/ModelEqualitySuite.scala +++ b/cognitive/src/test/scala/com/microsoft/ml/spark/core/utils/utils/ModelEqualitySuite.scala @@ -1,11 +1,12 @@ // Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. -package com.microsoft.ml.spark.core.utils +package com.microsoft.ml.spark.core.utils.utils import com.microsoft.ml.spark.cognitive.TextSentiment import com.microsoft.ml.spark.core.env.FileUtilities.join import com.microsoft.ml.spark.core.test.base.TestBase +import com.microsoft.ml.spark.core.utils.ModelEquality import com.microsoft.ml.spark.stages.DropColumns class ModelEqualitySuite extends TestBase { diff --git a/src/main/python/LICENSE.txt b/core/src/main/python/LICENSE.txt similarity index 100% rename from src/main/python/LICENSE.txt rename to core/src/main/python/LICENSE.txt diff --git a/src/main/python/MANIFEST.in b/core/src/main/python/MANIFEST.in similarity index 100% rename from src/main/python/MANIFEST.in rename to core/src/main/python/MANIFEST.in diff --git a/src/main/python/__init__.py b/core/src/main/python/__init__.py similarity index 100% rename from src/main/python/__init__.py rename to core/src/main/python/__init__.py diff --git a/src/main/python/mmlspark/README.txt b/core/src/main/python/mmlspark/README.txt similarity 
index 100% rename from src/main/python/mmlspark/README.txt rename to core/src/main/python/mmlspark/README.txt diff --git a/src/main/python/mmlspark/__init__.py b/core/src/main/python/mmlspark/__init__.py similarity index 100% rename from src/main/python/mmlspark/__init__.py rename to core/src/main/python/mmlspark/__init__.py diff --git a/src/main/python/mmlspark/automl/BestModel.py b/core/src/main/python/mmlspark/automl/BestModel.py similarity index 100% rename from src/main/python/mmlspark/automl/BestModel.py rename to core/src/main/python/mmlspark/automl/BestModel.py diff --git a/src/main/python/mmlspark/automl/HyperparamBuilder.py b/core/src/main/python/mmlspark/automl/HyperparamBuilder.py similarity index 100% rename from src/main/python/mmlspark/automl/HyperparamBuilder.py rename to core/src/main/python/mmlspark/automl/HyperparamBuilder.py diff --git a/src/main/python/mmlspark/automl/TuneHyperparametersModel.py b/core/src/main/python/mmlspark/automl/TuneHyperparametersModel.py similarity index 100% rename from src/main/python/mmlspark/automl/TuneHyperparametersModel.py rename to core/src/main/python/mmlspark/automl/TuneHyperparametersModel.py diff --git a/src/main/python/mmlspark/automl/__init__.py b/core/src/main/python/mmlspark/automl/__init__.py similarity index 100% rename from src/main/python/mmlspark/automl/__init__.py rename to core/src/main/python/mmlspark/automl/__init__.py diff --git a/src/main/python/mmlspark/cntk/__init__.py b/core/src/main/python/mmlspark/core/__init__.py similarity index 100% rename from src/main/python/mmlspark/cntk/__init__.py rename to core/src/main/python/mmlspark/core/__init__.py diff --git a/src/main/python/mmlspark/core/schema/TypeConversionUtils.py b/core/src/main/python/mmlspark/core/schema/TypeConversionUtils.py similarity index 100% rename from src/main/python/mmlspark/core/schema/TypeConversionUtils.py rename to core/src/main/python/mmlspark/core/schema/TypeConversionUtils.py diff --git 
a/src/main/python/mmlspark/core/schema/Utils.py b/core/src/main/python/mmlspark/core/schema/Utils.py similarity index 100% rename from src/main/python/mmlspark/core/schema/Utils.py rename to core/src/main/python/mmlspark/core/schema/Utils.py diff --git a/src/main/python/mmlspark/cognitive/__init__.py b/core/src/main/python/mmlspark/core/schema/__init__.py similarity index 100% rename from src/main/python/mmlspark/cognitive/__init__.py rename to core/src/main/python/mmlspark/core/schema/__init__.py diff --git a/src/main/python/mmlspark/core/__init__.py b/core/src/main/python/mmlspark/core/serialize/__init__.py similarity index 100% rename from src/main/python/mmlspark/core/__init__.py rename to core/src/main/python/mmlspark/core/serialize/__init__.py diff --git a/src/main/python/mmlspark/core/serialize/java_params_patch.py b/core/src/main/python/mmlspark/core/serialize/java_params_patch.py similarity index 100% rename from src/main/python/mmlspark/core/serialize/java_params_patch.py rename to core/src/main/python/mmlspark/core/serialize/java_params_patch.py diff --git a/src/main/python/mmlspark/core/spark/FluentAPI.py b/core/src/main/python/mmlspark/core/spark/FluentAPI.py similarity index 100% rename from src/main/python/mmlspark/core/spark/FluentAPI.py rename to core/src/main/python/mmlspark/core/spark/FluentAPI.py diff --git a/src/main/python/mmlspark/core/schema/__init__.py b/core/src/main/python/mmlspark/core/spark/__init__.py similarity index 100% rename from src/main/python/mmlspark/core/schema/__init__.py rename to core/src/main/python/mmlspark/core/spark/__init__.py diff --git a/src/main/python/mmlspark/core/serialize/__init__.py b/core/src/main/python/mmlspark/cyber/__init__.py similarity index 100% rename from src/main/python/mmlspark/core/serialize/__init__.py rename to core/src/main/python/mmlspark/cyber/__init__.py diff --git a/src/main/python/mmlspark/core/spark/__init__.py b/core/src/main/python/mmlspark/cyber/anomaly/__init__.py similarity index 
100% rename from src/main/python/mmlspark/core/spark/__init__.py rename to core/src/main/python/mmlspark/cyber/anomaly/__init__.py diff --git a/src/main/python/mmlspark/cyber/anomaly/collaborative_filtering.py b/core/src/main/python/mmlspark/cyber/anomaly/collaborative_filtering.py similarity index 100% rename from src/main/python/mmlspark/cyber/anomaly/collaborative_filtering.py rename to core/src/main/python/mmlspark/cyber/anomaly/collaborative_filtering.py diff --git a/src/main/python/mmlspark/cyber/anomaly/complement_access.py b/core/src/main/python/mmlspark/cyber/anomaly/complement_access.py similarity index 100% rename from src/main/python/mmlspark/cyber/anomaly/complement_access.py rename to core/src/main/python/mmlspark/cyber/anomaly/complement_access.py diff --git a/src/main/python/mmlspark/cyber/dataset.py b/core/src/main/python/mmlspark/cyber/dataset.py similarity index 100% rename from src/main/python/mmlspark/cyber/dataset.py rename to core/src/main/python/mmlspark/cyber/dataset.py diff --git a/src/main/python/mmlspark/cyber/__init__.py b/core/src/main/python/mmlspark/cyber/feature/__init__.py similarity index 100% rename from src/main/python/mmlspark/cyber/__init__.py rename to core/src/main/python/mmlspark/cyber/feature/__init__.py diff --git a/src/main/python/mmlspark/cyber/feature/indexers.py b/core/src/main/python/mmlspark/cyber/feature/indexers.py similarity index 100% rename from src/main/python/mmlspark/cyber/feature/indexers.py rename to core/src/main/python/mmlspark/cyber/feature/indexers.py diff --git a/src/main/python/mmlspark/cyber/feature/scalers.py b/core/src/main/python/mmlspark/cyber/feature/scalers.py similarity index 100% rename from src/main/python/mmlspark/cyber/feature/scalers.py rename to core/src/main/python/mmlspark/cyber/feature/scalers.py diff --git a/src/main/python/mmlspark/cyber/anomaly/__init__.py b/core/src/main/python/mmlspark/cyber/utils/__init__.py similarity index 100% rename from 
src/main/python/mmlspark/cyber/anomaly/__init__.py rename to core/src/main/python/mmlspark/cyber/utils/__init__.py diff --git a/src/main/python/mmlspark/cyber/utils/spark_utils.py b/core/src/main/python/mmlspark/cyber/utils/spark_utils.py similarity index 100% rename from src/main/python/mmlspark/cyber/utils/spark_utils.py rename to core/src/main/python/mmlspark/cyber/utils/spark_utils.py diff --git a/src/main/python/mmlspark/doc/conf.py b/core/src/main/python/mmlspark/doc/conf.py similarity index 100% rename from src/main/python/mmlspark/doc/conf.py rename to core/src/main/python/mmlspark/doc/conf.py diff --git a/src/main/python/mmlspark/doc/index.rst b/core/src/main/python/mmlspark/doc/index.rst similarity index 100% rename from src/main/python/mmlspark/doc/index.rst rename to core/src/main/python/mmlspark/doc/index.rst diff --git a/src/main/python/mmlspark/doc/scala.rst b/core/src/main/python/mmlspark/doc/scala.rst similarity index 100% rename from src/main/python/mmlspark/doc/scala.rst rename to core/src/main/python/mmlspark/doc/scala.rst diff --git a/src/main/python/mmlspark/downloader/ModelDownloader.py b/core/src/main/python/mmlspark/downloader/ModelDownloader.py similarity index 100% rename from src/main/python/mmlspark/downloader/ModelDownloader.py rename to core/src/main/python/mmlspark/downloader/ModelDownloader.py diff --git a/src/main/python/mmlspark/cyber/feature/__init__.py b/core/src/main/python/mmlspark/downloader/__init__.py similarity index 100% rename from src/main/python/mmlspark/cyber/feature/__init__.py rename to core/src/main/python/mmlspark/downloader/__init__.py diff --git a/src/main/python/mmlspark/io/IOImplicits.py b/core/src/main/python/mmlspark/io/IOImplicits.py similarity index 100% rename from src/main/python/mmlspark/io/IOImplicits.py rename to core/src/main/python/mmlspark/io/IOImplicits.py diff --git a/src/main/python/mmlspark/cyber/utils/__init__.py b/core/src/main/python/mmlspark/io/__init__.py similarity index 100% rename from 
src/main/python/mmlspark/cyber/utils/__init__.py rename to core/src/main/python/mmlspark/io/__init__.py diff --git a/src/main/python/mmlspark/io/binary/BinaryFileReader.py b/core/src/main/python/mmlspark/io/binary/BinaryFileReader.py similarity index 100% rename from src/main/python/mmlspark/io/binary/BinaryFileReader.py rename to core/src/main/python/mmlspark/io/binary/BinaryFileReader.py diff --git a/src/main/python/mmlspark/downloader/__init__.py b/core/src/main/python/mmlspark/io/binary/__init__.py similarity index 100% rename from src/main/python/mmlspark/downloader/__init__.py rename to core/src/main/python/mmlspark/io/binary/__init__.py diff --git a/src/main/python/mmlspark/io/http/HTTPFunctions.py b/core/src/main/python/mmlspark/io/http/HTTPFunctions.py similarity index 100% rename from src/main/python/mmlspark/io/http/HTTPFunctions.py rename to core/src/main/python/mmlspark/io/http/HTTPFunctions.py diff --git a/src/main/python/mmlspark/io/http/JSONOutputParser.py b/core/src/main/python/mmlspark/io/http/JSONOutputParser.py similarity index 100% rename from src/main/python/mmlspark/io/http/JSONOutputParser.py rename to core/src/main/python/mmlspark/io/http/JSONOutputParser.py diff --git a/src/main/python/mmlspark/io/http/ServingFunctions.py b/core/src/main/python/mmlspark/io/http/ServingFunctions.py similarity index 100% rename from src/main/python/mmlspark/io/http/ServingFunctions.py rename to core/src/main/python/mmlspark/io/http/ServingFunctions.py diff --git a/src/main/python/mmlspark/io/http/SimpleHTTPTransformer.py b/core/src/main/python/mmlspark/io/http/SimpleHTTPTransformer.py similarity index 100% rename from src/main/python/mmlspark/io/http/SimpleHTTPTransformer.py rename to core/src/main/python/mmlspark/io/http/SimpleHTTPTransformer.py diff --git a/src/main/python/mmlspark/image/__init__.py b/core/src/main/python/mmlspark/io/http/__init__.py similarity index 100% rename from src/main/python/mmlspark/image/__init__.py rename to 
core/src/main/python/mmlspark/io/http/__init__.py diff --git a/src/main/python/mmlspark/io/image/ImageUtils.py b/core/src/main/python/mmlspark/io/image/ImageUtils.py similarity index 100% rename from src/main/python/mmlspark/io/image/ImageUtils.py rename to core/src/main/python/mmlspark/io/image/ImageUtils.py diff --git a/src/main/python/mmlspark/io/__init__.py b/core/src/main/python/mmlspark/io/image/__init__.py similarity index 100% rename from src/main/python/mmlspark/io/__init__.py rename to core/src/main/python/mmlspark/io/image/__init__.py diff --git a/src/main/python/mmlspark/io/powerbi/PowerBIWriter.py b/core/src/main/python/mmlspark/io/powerbi/PowerBIWriter.py similarity index 100% rename from src/main/python/mmlspark/io/powerbi/PowerBIWriter.py rename to core/src/main/python/mmlspark/io/powerbi/PowerBIWriter.py diff --git a/src/main/python/mmlspark/io/binary/__init__.py b/core/src/main/python/mmlspark/io/powerbi/__init__.py similarity index 100% rename from src/main/python/mmlspark/io/binary/__init__.py rename to core/src/main/python/mmlspark/io/powerbi/__init__.py diff --git a/src/main/python/mmlspark/nn/ConditionalBallTree.py b/core/src/main/python/mmlspark/nn/ConditionalBallTree.py similarity index 100% rename from src/main/python/mmlspark/nn/ConditionalBallTree.py rename to core/src/main/python/mmlspark/nn/ConditionalBallTree.py diff --git a/src/main/python/mmlspark/io/http/__init__.py b/core/src/main/python/mmlspark/nn/__init__.py similarity index 100% rename from src/main/python/mmlspark/io/http/__init__.py rename to core/src/main/python/mmlspark/nn/__init__.py diff --git a/src/main/python/mmlspark/io/image/__init__.py b/core/src/main/python/mmlspark/plot/__init__.py similarity index 100% rename from src/main/python/mmlspark/io/image/__init__.py rename to core/src/main/python/mmlspark/plot/__init__.py diff --git a/src/main/python/mmlspark/plot/plot.py b/core/src/main/python/mmlspark/plot/plot.py similarity index 100% rename from 
src/main/python/mmlspark/plot/plot.py rename to core/src/main/python/mmlspark/plot/plot.py diff --git a/src/main/python/mmlspark/recommendation/RankingTrainValidationSplit.py b/core/src/main/python/mmlspark/recommendation/RankingTrainValidationSplit.py similarity index 100% rename from src/main/python/mmlspark/recommendation/RankingTrainValidationSplit.py rename to core/src/main/python/mmlspark/recommendation/RankingTrainValidationSplit.py diff --git a/src/main/python/mmlspark/recommendation/RankingTrainValidationSplitModel.py b/core/src/main/python/mmlspark/recommendation/RankingTrainValidationSplitModel.py similarity index 100% rename from src/main/python/mmlspark/recommendation/RankingTrainValidationSplitModel.py rename to core/src/main/python/mmlspark/recommendation/RankingTrainValidationSplitModel.py diff --git a/src/main/python/mmlspark/recommendation/SARModel.py b/core/src/main/python/mmlspark/recommendation/SARModel.py similarity index 100% rename from src/main/python/mmlspark/recommendation/SARModel.py rename to core/src/main/python/mmlspark/recommendation/SARModel.py diff --git a/src/main/python/mmlspark/recommendation/__init__.py b/core/src/main/python/mmlspark/recommendation/__init__.py similarity index 100% rename from src/main/python/mmlspark/recommendation/__init__.py rename to core/src/main/python/mmlspark/recommendation/__init__.py diff --git a/src/main/python/mmlspark/stages/UDFTransformer.py b/core/src/main/python/mmlspark/stages/UDFTransformer.py similarity index 100% rename from src/main/python/mmlspark/stages/UDFTransformer.py rename to core/src/main/python/mmlspark/stages/UDFTransformer.py diff --git a/src/main/python/mmlspark/io/powerbi/__init__.py b/core/src/main/python/mmlspark/stages/__init__.py similarity index 100% rename from src/main/python/mmlspark/io/powerbi/__init__.py rename to core/src/main/python/mmlspark/stages/__init__.py diff --git a/src/main/scala/com/microsoft/ml/spark/automl/DefaultHyperparams.scala 
b/core/src/main/scala/com/microsoft/ml/spark/automl/DefaultHyperparams.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/DefaultHyperparams.scala rename to core/src/main/scala/com/microsoft/ml/spark/automl/DefaultHyperparams.scala diff --git a/src/main/scala/com/microsoft/ml/spark/automl/EvaluationUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/automl/EvaluationUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/EvaluationUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/automl/EvaluationUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.scala b/core/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.scala rename to core/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.scala diff --git a/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.txt b/core/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.txt rename to core/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.txt diff --git a/src/main/scala/com/microsoft/ml/spark/automl/HyperparamBuilder.scala b/core/src/main/scala/com/microsoft/ml/spark/automl/HyperparamBuilder.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/HyperparamBuilder.scala rename to core/src/main/scala/com/microsoft/ml/spark/automl/HyperparamBuilder.scala diff --git a/src/main/scala/com/microsoft/ml/spark/automl/ParamSpace.scala b/core/src/main/scala/com/microsoft/ml/spark/automl/ParamSpace.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/ParamSpace.scala rename to core/src/main/scala/com/microsoft/ml/spark/automl/ParamSpace.scala diff --git a/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.scala 
b/core/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.scala rename to core/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.scala diff --git a/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.txt b/core/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.txt rename to core/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.txt diff --git a/src/test/scala/com/microsoft/ml/spark/codegen/CodeGen.scala b/core/src/main/scala/com/microsoft/ml/spark/codegen/CodeGen.scala similarity index 53% rename from src/test/scala/com/microsoft/ml/spark/codegen/CodeGen.scala rename to core/src/main/scala/com/microsoft/ml/spark/codegen/CodeGen.scala index 67d667e339e..f573880d07f 100644 --- a/src/test/scala/com/microsoft/ml/spark/codegen/CodeGen.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/codegen/CodeGen.scala @@ -4,14 +4,13 @@ package com.microsoft.ml.spark.codegen import java.io.File + import com.microsoft.ml.spark.build.BuildInfo import com.microsoft.ml.spark.codegen.Config._ import com.microsoft.ml.spark.core.env.FileUtilities._ -import com.microsoft.ml.spark.core.test.base.TestBase -import com.microsoft.ml.spark.core.test.fuzzing.PyTestFuzzing -import com.microsoft.ml.spark.core.utils.JarLoadingUtils.instantiateServices import org.apache.commons.io.FileUtils import org.apache.commons.io.FilenameUtils._ +import com.microsoft.ml.spark.core.utils.JarLoadingUtils.instantiateServices object CodeGenUtils { def clean(dir: File): Unit = if (dir.exists()) FileUtils.forceDelete(dir) @@ -23,14 +22,18 @@ object CodeGen { import CodeGenUtils._ - def generatePythonClasses(): Unit = { - instantiateServices[PythonWrappable].foreach { w => + def generatePythonClasses(jarName: Option[String]): Unit = { + val 
instantiatedClasses = instantiateServices[PythonWrappable](jarName) + instantiatedClasses.foreach { w => + println(w.getClass.getName) w.makePyFile() } } - def generateRClasses(): Unit = { - instantiateServices[RWrappable].foreach { w => + def generateRClasses(jarName: Option[String]): Unit = { + val instantiatedClasses = instantiateServices[RWrappable](jarName) + instantiatedClasses.foreach { w => + println(w.getClass.getName) w.makeRFile() } } @@ -57,7 +60,7 @@ object CodeGen { RSrcDir.mkdirs() writeFile(new File(RSrcDir.getParentFile, "DESCRIPTION"), - s"""|Package: mmlspark + s"""|Package: ${BuildInfo.name} |Title: Access to MMLSpark via R |Description: Provides an interface to MMLSpark. |Version: ${BuildInfo.rVersion} @@ -82,7 +85,7 @@ object CodeGen { | spark_dependency( | jars = c(), | packages = c( - | sprintf("com.microsoft.ml.spark:mmlspark_%s:${BuildInfo.version}", scala_version) + | sprintf("com.microsoft.ml.spark:${BuildInfo.name}_%s:${BuildInfo.version}", scala_version) | ), | repositories = c("https://mmlspark.azureedge.net/maven") | ) @@ -118,59 +121,90 @@ object CodeGen { } - def rGen(): Unit = { + //noinspection ScalaStyle + def generatePyPackageData(): Unit = { + if (!PySrcDir.exists()){ + PySrcDir.mkdir() + } + writeFile(join(PySrcDir, "setup.py"), + s""" + |# Copyright (C) Microsoft Corporation. All rights reserved. + |# Licensed under the MIT License. See LICENSE in project root for information. 
+ | + |import os + |from setuptools import setup, find_packages + |import codecs + |import os.path + | + | + |def read(rel_path): + | here = os.path.abspath(os.path.dirname(__file__)) + | with codecs.open(os.path.join(here, rel_path), "r") as fp: + | return fp.read() + | + | + |def get_version(rel_path): + | for line in read(rel_path).splitlines(): + | if line.startswith("__version__"): + | delim = '"' if '"' in line else "'" + | return line.split(delim)[1] + | return "0.0.0" + | + | + |setup( + | name="${BuildInfo.name}", + | version=get_version("mmlspark/__init__.py"), + | description="Microsoft ML for Spark", + | long_description="Microsoft ML for Apache Spark contains Microsoft's open source " + | + "contributions to the Apache Spark ecosystem", + | license="MIT", + | packages=find_packages(), + | url="https://github.com/Azure/mmlspark", + | author="Microsoft", + | author_email="mmlspark-support@microsoft.com", + | classifiers=[ + | "Development Status :: 3 - Alpha", + | "Intended Audience :: Developers", + | "Intended Audience :: Data Scientists", + | "Topic :: Software Development :: Datascience Tools", + | "License :: OSI Approved :: MIT License", + | "Programming Language :: Python :: 2", + | "Programming Language :: Python :: 3", + | ], + | zip_safe=True, + | package_data={"mmlspark": ["../LICENSE.txt", "../README.txt"]}, + |) + | + |""".stripMargin) + } + + + def rGen(jarName: Option[String]): Unit = { + println(s"Generating R for ${jarName}") clean(RSrcRoot) generateRPackageData() - generateRClasses() - FileUtils.copyDirectoryToDirectory(toDir(RSrcOverrideDir), toDir(RSrcDir)) - FileUtils.copyDirectoryToDirectory(toDir(RTestOverrideDir), toDir(RTestDir)) + generateRClasses(jarName) + if (RSrcOverrideDir.exists()) + FileUtils.copyDirectoryToDirectory(toDir(RSrcOverrideDir), toDir(RSrcDir)) + if (RTestOverrideDir.exists()) + FileUtils.copyDirectoryToDirectory(toDir(RTestOverrideDir), toDir(RTestDir)) } - def pyGen(): Unit = { + def pyGen(jarName: 
Option[String]): Unit = { + println(s"Generating python for ${jarName}") clean(PySrcDir) - generatePythonClasses() - TestBase.stopSparkSession() - FileUtils.copyDirectoryToDirectory(toDir(PySrcOverrideDir), toDir(PySrcDir)) + generatePyPackageData() + generatePythonClasses(jarName) + if (PySrcOverrideDir.exists()) + FileUtils.copyDirectoryToDirectory(toDir(PySrcOverrideDir), toDir(PySrcDir)) makeInitFiles() } def main(args: Array[String]): Unit = { clean(PackageDir) - rGen() - pyGen() + rGen(args.headOption) + pyGen(args.headOption) } } -object TestGen { - - import CodeGenUtils._ - - def generatePythonTests(): Unit = { - instantiateServices[PyTestFuzzing[_]].foreach { ltc => - try { - ltc.makePyTestFile() - } catch { - case _: NotImplementedError => - println(s"ERROR: Could not generate test for ${ltc.testClassName} because of Complex Parameters") - } - } - } - - private def makeInitFiles(packageFolder: String = ""): Unit = { - val dir = new File(new File(PyTestDir, "mmlsparktest"), packageFolder) - writeFile(new File(dir, "__init__.py"), "") - dir.listFiles().filter(_.isDirectory).foreach(f => - makeInitFiles(packageFolder + "/" + f.getName) - ) - } - - def main(args: Array[String]): Unit = { - clean(TestDataDir) - clean(PyTestDir) - generatePythonTests() - TestBase.stopSparkSession() - FileUtils.copyDirectoryToDirectory(toDir(PyTestOverrideDir), toDir(PyTestDir)) - makeInitFiles() - } -} diff --git a/src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala b/core/src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala similarity index 56% rename from src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala rename to core/src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala index 03785cbd8c9..c473aec0d0c 100644 --- a/src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala @@ -8,15 +8,14 @@ import java.io.File import 
com.microsoft.ml.spark.build.BuildInfo object Config { - val DebugMode = sys.env.getOrElse("DEBUGMODE", "").trim.toLowerCase == "true" + val DebugMode: Boolean = sys.env.getOrElse("DEBUGMODE", "").trim.toLowerCase == "true" - val TopDir = BuildInfo.baseDirectory - val Version = BuildInfo.version - val PackageName = BuildInfo.name - val TargetDir = new File(TopDir, s"target/scala-${BuildInfo.scalaVersion.slice(0,4)}") + val TopDir: File = BuildInfo.baseDirectory + val Version: String = BuildInfo.version + val PackageName: String = BuildInfo.name val ScalaSrcDir = "src/main/scala" - val GeneratedDir = new File(TargetDir, "generated") + val GeneratedDir = new File(BuildInfo.targetDir, "generated") val PackageDir = new File(GeneratedDir, "package") val SrcDir = new File(GeneratedDir, "src") val TestDir = new File(GeneratedDir, "test") @@ -42,9 +41,9 @@ object Config { //val rPackageFile = new File(rPackageDir, s"mmlspark-$mmlVer.zip") val InternalPrefix = "_" - val ScopeDepth = " " * 4 + val ScopeDepth: String = " " * 4 - val CopyrightLines = + val CopyrightLines: String = s"""|# Copyright (C) Microsoft Corporation. All rights reserved. |# Licensed under the MIT License. See LICENSE in project root for information. |""".stripMargin @@ -54,13 +53,19 @@ object Config { s"""|$CopyrightLines | |"\"" - |MicrosoftML is a library of Python classes to interface with the - |Microsoft scala APIs to utilize Apache Spark to create distibuted - |machine learning models. + |MMLSpark is an ecosystem of tools aimed towards expanding the distributed computing framework + |Apache Spark in several new directions. MMLSpark adds many deep learning and data science tools to the Spark + |ecosystem, including seamless integration of Spark Machine Learning pipelines with + |Microsoft Cognitive Toolkit (CNTK), LightGBM and OpenCV. These tools enable powerful and + |highly-scalable predictive and analytical models for a variety of datasources. 
| - |MicrosoftML simplifies training and scoring classifiers and - |regressors, as well as facilitating the creation of models using the - |CNTK library, images, and text. + |MMLSpark also brings new networking capabilities to the Spark Ecosystem. With the HTTP on Spark project, + |users can embed any web service into their SparkML models. In this vein, MMLSpark provides easy to use SparkML + |transformers for a wide variety of Microsoft Cognitive Services. For production grade deployment, + |the Spark Serving project enables high throughput, sub-millisecond latency web services, + |backed by your Spark cluster. + | + |MMLSpark requires Scala 2.11, Spark 2.4+, and Python 3.5+. |"\"" | |__version__ = "${BuildInfo.pythonizedVersion}" diff --git a/src/main/scala/com/microsoft/ml/spark/codegen/GenerationUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/codegen/GenerationUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/codegen/GenerationUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/codegen/GenerationUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/codegen/Wrappable.scala b/core/src/main/scala/com/microsoft/ml/spark/codegen/Wrappable.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/codegen/Wrappable.scala rename to core/src/main/scala/com/microsoft/ml/spark/codegen/Wrappable.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/contracts/Metrics.scala b/core/src/main/scala/com/microsoft/ml/spark/core/contracts/Metrics.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/contracts/Metrics.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/contracts/Metrics.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/contracts/Params.scala b/core/src/main/scala/com/microsoft/ml/spark/core/contracts/Params.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/contracts/Params.scala rename to 
core/src/main/scala/com/microsoft/ml/spark/core/contracts/Params.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/env/FileUtilities.scala b/core/src/main/scala/com/microsoft/ml/spark/core/env/FileUtilities.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/env/FileUtilities.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/env/FileUtilities.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/env/NativeLoader.java b/core/src/main/scala/com/microsoft/ml/spark/core/env/NativeLoader.java similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/env/NativeLoader.java rename to core/src/main/scala/com/microsoft/ml/spark/core/env/NativeLoader.java diff --git a/src/main/scala/com/microsoft/ml/spark/core/env/StreamUtilities.scala b/core/src/main/scala/com/microsoft/ml/spark/core/env/StreamUtilities.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/env/StreamUtilities.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/env/StreamUtilities.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/metrics/MetricConstants.scala b/core/src/main/scala/com/microsoft/ml/spark/core/metrics/MetricConstants.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/metrics/MetricConstants.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/metrics/MetricConstants.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/metrics/MetricUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/metrics/MetricUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/metrics/MetricUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/metrics/MetricUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/schema/BinaryFileSchema.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/BinaryFileSchema.scala similarity index 100% rename from 
src/main/scala/com/microsoft/ml/spark/core/schema/BinaryFileSchema.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/BinaryFileSchema.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/schema/Categoricals.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/Categoricals.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/Categoricals.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/Categoricals.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/schema/DatasetExtensions.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/DatasetExtensions.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/DatasetExtensions.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/DatasetExtensions.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/schema/ImageSchemaUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/ImageSchemaUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/ImageSchemaUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/ImageSchemaUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/schema/SchemaConstants.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/SchemaConstants.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/SchemaConstants.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/SchemaConstants.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/schema/SparkBindings.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/SparkBindings.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/SparkBindings.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/SparkBindings.scala diff --git 
a/src/main/scala/com/microsoft/ml/spark/core/schema/SparkSchema.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/SparkSchema.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/SparkSchema.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/SparkSchema.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/serialize/ComplexParam.scala b/core/src/main/scala/com/microsoft/ml/spark/core/serialize/ComplexParam.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/serialize/ComplexParam.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/serialize/ComplexParam.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/spark/FluentAPI.scala b/core/src/main/scala/com/microsoft/ml/spark/core/spark/FluentAPI.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/spark/FluentAPI.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/spark/FluentAPI.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/AsyncUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/AsyncUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/utils/AsyncUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/AsyncUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/CastUtilities.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/CastUtilities.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/utils/CastUtilities.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/CastUtilities.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala similarity index 99% rename from src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala rename to 
core/src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala index 297dba1de68..db8e39cd033 100644 --- a/src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala @@ -6,7 +6,7 @@ package com.microsoft.ml.spark.core.utils import java.net.InetAddress import org.apache.http.conn.util.InetAddressUtils -import org.apache.spark.lightgbm.BlockManagerUtils +import org.apache.spark.injections.BlockManagerUtils import org.apache.spark.sql.{Dataset, SparkSession} import org.slf4j.Logger diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/ContextObjectInputStream.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/ContextObjectInputStream.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/utils/ContextObjectInputStream.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/ContextObjectInputStream.scala diff --git a/core/src/main/scala/com/microsoft/ml/spark/core/utils/FaultToleranceUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/FaultToleranceUtils.scala new file mode 100644 index 00000000000..0afa793dcbd --- /dev/null +++ b/core/src/main/scala/com/microsoft/ml/spark/core/utils/FaultToleranceUtils.scala @@ -0,0 +1,33 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark.core.utils + +import scala.concurrent.duration.Duration +import scala.concurrent.{Await, ExecutionContext, Future} + +object FaultToleranceUtils { + def retryWithTimeout[T](times: Int, timeout: Duration)(f: => T): T ={ + try { + Await.result(Future(f)(ExecutionContext.global), timeout) + } catch { + case e: Exception if times >= 1 => + print(s"Received exception on call, retrying: $e") + retryWithTimeout(times-1, timeout)(f) + } + } + + val Backoffs: Seq[Int] = Seq(0, 100, 200, 500) + + def retryWithTimeout[T](times: Seq[Int] = Backoffs)(f: => T): T ={ + try { + f + } catch { + case e: Exception if times.nonEmpty => + println(s"Received exception on call, retrying: $e") + Thread.sleep(times.head) + retryWithTimeout(times.tail)(f) + } + } + +} \ No newline at end of file diff --git a/src/test/scala/com/microsoft/ml/spark/core/utils/JarLoadingUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/JarLoadingUtils.scala similarity index 70% rename from src/test/scala/com/microsoft/ml/spark/core/utils/JarLoadingUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/JarLoadingUtils.scala index dba98c4a595..3e906d66d7d 100644 --- a/src/test/scala/com/microsoft/ml/spark/core/utils/JarLoadingUtils.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/core/utils/JarLoadingUtils.scala @@ -1,15 +1,11 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - package com.microsoft.ml.spark.core.utils import java.lang.reflect.Modifier import com.microsoft.ml.spark.codegen.Wrappable import org.sparkproject.guava.reflect.ClassPath - import scala.collection.JavaConverters._ -import scala.reflect.{ClassTag, _} +import scala.reflect.{ClassTag, classTag} /** Contains logic for loading classes. 
*/ object JarLoadingUtils { @@ -41,22 +37,25 @@ object JarLoadingUtils { AllClasses.filter(classOf[Wrappable].isAssignableFrom(_)) } - def instantiateServices[T: ClassTag](instantiate: Class[_] => Any): List[T] = { + def instantiateServices[T: ClassTag](instantiate: Class[_] => Any, jarName: Option[String]): List[T] = { AllClasses .filter(classTag[T].runtimeClass.isAssignableFrom(_)) + .filter(c => jarName.forall(c.getResource(c.getSimpleName + ".class").toString.contains(_))) .filter(clazz => !Modifier.isAbstract(clazz.getModifiers)) .map(instantiate(_)).asInstanceOf[List[T]] } - def instantiateServices[T: ClassTag]: List[T] = instantiateServices[T] { + def instantiateServices[T: ClassTag](jarName: Option[String] = None): List[T] = instantiateServices[T]({ clazz: Class[_] => clazz.getConstructor().newInstance() - } + }, jarName) - def instantiateObjects[T: ClassTag]: List[T] = instantiateServices[T] { clazz: Class[_] => { - val cons = clazz.getDeclaredConstructors()(0) - cons.setAccessible(true) - cons.newInstance() - }} + def instantiateObjects[T: ClassTag](jarName: Option[String] = None): List[T] = instantiateServices[T]( + { clazz: Class[_] => { + val cons = clazz.getDeclaredConstructors()(0) + cons.setAccessible(true) + cons.newInstance() + } + }, + jarName) } - diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/ModelEquality.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/ModelEquality.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/utils/ModelEquality.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/ModelEquality.scala diff --git a/core/src/main/scala/com/microsoft/ml/spark/core/utils/OsUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/OsUtils.scala new file mode 100644 index 00000000000..4217eaa58a8 --- /dev/null +++ b/core/src/main/scala/com/microsoft/ml/spark/core/utils/OsUtils.scala @@ -0,0 +1,8 @@ +// Copyright (C) Microsoft Corporation. 
All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.core.utils + +object OsUtils { + val IsWindows: Boolean = System.getProperty("os.name").toLowerCase().indexOf("win") >= 0 +} \ No newline at end of file diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/RowUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/RowUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/utils/RowUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/RowUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/StopWatch.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/StopWatch.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/utils/StopWatch.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/StopWatch.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.txt diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/CountSelector.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/CountSelector.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/CountSelector.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/CountSelector.scala diff --git 
a/src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.txt diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/Featurize.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/Featurize.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/Featurize.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/Featurize.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/Featurize.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/Featurize.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/Featurize.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/Featurize.txt diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.txt diff --git 
a/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.txt diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexerModel.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexerModel.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexerModel.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexerModel.txt diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/text/MultiNGram.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/text/MultiNGram.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/text/MultiNGram.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/text/MultiNGram.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/text/PageSplitter.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/text/PageSplitter.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/text/PageSplitter.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/text/PageSplitter.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.scala rename to 
core/src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.txt diff --git a/src/main/scala/com/microsoft/ml/spark/image/ResizeImageTransformer.scala b/core/src/main/scala/com/microsoft/ml/spark/image/ResizeImageTransformer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/image/ResizeImageTransformer.scala rename to core/src/main/scala/com/microsoft/ml/spark/image/ResizeImageTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/image/UnrollImage.scala b/core/src/main/scala/com/microsoft/ml/spark/image/UnrollImage.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/image/UnrollImage.scala rename to core/src/main/scala/com/microsoft/ml/spark/image/UnrollImage.scala diff --git a/src/main/scala/com/microsoft/ml/spark/image/UnrollImage.txt b/core/src/main/scala/com/microsoft/ml/spark/image/UnrollImage.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/image/UnrollImage.txt rename to core/src/main/scala/com/microsoft/ml/spark/image/UnrollImage.txt diff --git a/src/main/scala/com/microsoft/ml/spark/io/IOImplicits.scala b/core/src/main/scala/com/microsoft/ml/spark/io/IOImplicits.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/IOImplicits.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/IOImplicits.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/binary/Binary.scala b/core/src/main/scala/com/microsoft/ml/spark/io/binary/Binary.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/binary/Binary.scala rename to 
core/src/main/scala/com/microsoft/ml/spark/io/binary/Binary.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileFormat.scala b/core/src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileFormat.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileFormat.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileFormat.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileReader.scala b/core/src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileReader.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileReader.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileReader.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/binary/KeyValueReaderIterator.scala b/core/src/main/scala/com/microsoft/ml/spark/io/binary/KeyValueReaderIterator.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/binary/KeyValueReaderIterator.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/binary/KeyValueReaderIterator.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/Clients.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/Clients.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/Clients.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/Clients.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/HTTPClients.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/HTTPClients.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/HTTPClients.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/HTTPClients.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/HTTPSchema.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/HTTPSchema.scala similarity index 100% rename from 
src/main/scala/com/microsoft/ml/spark/io/http/HTTPSchema.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/HTTPSchema.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/HTTPTransformer.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/HTTPTransformer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/HTTPTransformer.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/HTTPTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/Parsers.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/Parsers.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/Parsers.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/Parsers.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/PortForwarding.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/PortForwarding.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/PortForwarding.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/PortForwarding.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/SharedVariable.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/SharedVariable.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/SharedVariable.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/SharedVariable.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/SimpleHTTPTransformer.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/SimpleHTTPTransformer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/SimpleHTTPTransformer.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/SimpleHTTPTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/image/ImageUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/io/image/ImageUtils.scala similarity index 100% 
rename from src/main/scala/com/microsoft/ml/spark/io/image/ImageUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/image/ImageUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/powerbi/PowerBIWriter.scala b/core/src/main/scala/com/microsoft/ml/spark/io/powerbi/PowerBIWriter.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/powerbi/PowerBIWriter.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/powerbi/PowerBIWriter.scala diff --git a/src/main/scala/com/microsoft/ml/spark/isolationforest/IsolationForest.scala b/core/src/main/scala/com/microsoft/ml/spark/isolationforest/IsolationForest.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/isolationforest/IsolationForest.scala rename to core/src/main/scala/com/microsoft/ml/spark/isolationforest/IsolationForest.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lime/BreezeUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/lime/BreezeUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lime/BreezeUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/lime/BreezeUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lime/LIME.scala b/core/src/main/scala/com/microsoft/ml/spark/lime/LIME.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lime/LIME.scala rename to core/src/main/scala/com/microsoft/ml/spark/lime/LIME.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lime/Superpixel.scala b/core/src/main/scala/com/microsoft/ml/spark/lime/Superpixel.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lime/Superpixel.scala rename to core/src/main/scala/com/microsoft/ml/spark/lime/Superpixel.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lime/SuperpixelTransformer.scala b/core/src/main/scala/com/microsoft/ml/spark/lime/SuperpixelTransformer.scala similarity index 100% rename from 
src/main/scala/com/microsoft/ml/spark/lime/SuperpixelTransformer.scala rename to core/src/main/scala/com/microsoft/ml/spark/lime/SuperpixelTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lime/TextLIME.scala b/core/src/main/scala/com/microsoft/ml/spark/lime/TextLIME.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lime/TextLIME.scala rename to core/src/main/scala/com/microsoft/ml/spark/lime/TextLIME.scala diff --git a/src/main/scala/com/microsoft/ml/spark/logging/BasicLogging.scala b/core/src/main/scala/com/microsoft/ml/spark/logging/BasicLogging.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/logging/BasicLogging.scala rename to core/src/main/scala/com/microsoft/ml/spark/logging/BasicLogging.scala diff --git a/src/main/scala/com/microsoft/ml/spark/nn/BallTree.scala b/core/src/main/scala/com/microsoft/ml/spark/nn/BallTree.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/nn/BallTree.scala rename to core/src/main/scala/com/microsoft/ml/spark/nn/BallTree.scala diff --git a/src/main/scala/com/microsoft/ml/spark/nn/BoundedPriorityQueue.scala b/core/src/main/scala/com/microsoft/ml/spark/nn/BoundedPriorityQueue.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/nn/BoundedPriorityQueue.scala rename to core/src/main/scala/com/microsoft/ml/spark/nn/BoundedPriorityQueue.scala diff --git a/src/main/scala/com/microsoft/ml/spark/nn/ConditionalKNN.scala b/core/src/main/scala/com/microsoft/ml/spark/nn/ConditionalKNN.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/nn/ConditionalKNN.scala rename to core/src/main/scala/com/microsoft/ml/spark/nn/ConditionalKNN.scala diff --git a/src/main/scala/com/microsoft/ml/spark/nn/KNN.scala b/core/src/main/scala/com/microsoft/ml/spark/nn/KNN.scala similarity index 98% rename from src/main/scala/com/microsoft/ml/spark/nn/KNN.scala rename to 
core/src/main/scala/com/microsoft/ml/spark/nn/KNN.scala index a4c3973a794..2acde7942bf 100644 --- a/src/main/scala/com/microsoft/ml/spark/nn/KNN.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/nn/KNN.scala @@ -84,7 +84,7 @@ class KNNModel(val uid: String) extends Model[KNNModel] private var broadcastedModelOption: Option[Broadcast[BallTree[_]]] = None val ballTree = new BallTreeParam(this, "ballTree", - "the ballTree model used for perfoming queries", { _ => true }) + "the ballTree model used for performing queries", { _ => true }) def getBallTree: BallTree[_] = $(ballTree) diff --git a/src/main/scala/com/microsoft/ml/spark/nn/Schemas.scala b/core/src/main/scala/com/microsoft/ml/spark/nn/Schemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/nn/Schemas.scala rename to core/src/main/scala/com/microsoft/ml/spark/nn/Schemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/RankingAdapter.scala b/core/src/main/scala/com/microsoft/ml/spark/recommendation/RankingAdapter.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/RankingAdapter.scala rename to core/src/main/scala/com/microsoft/ml/spark/recommendation/RankingAdapter.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/RankingEvaluator.scala b/core/src/main/scala/com/microsoft/ml/spark/recommendation/RankingEvaluator.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/RankingEvaluator.scala rename to core/src/main/scala/com/microsoft/ml/spark/recommendation/RankingEvaluator.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSplit.scala b/core/src/main/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSplit.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSplit.scala rename to 
core/src/main/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSplit.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexer.scala b/core/src/main/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexer.scala rename to core/src/main/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/SAR.scala b/core/src/main/scala/com/microsoft/ml/spark/recommendation/SAR.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/SAR.scala rename to core/src/main/scala/com/microsoft/ml/spark/recommendation/SAR.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/SARModel.scala b/core/src/main/scala/com/microsoft/ml/spark/recommendation/SARModel.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/SARModel.scala rename to core/src/main/scala/com/microsoft/ml/spark/recommendation/SARModel.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/recommendation.txt b/core/src/main/scala/com/microsoft/ml/spark/recommendation/recommendation.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/recommendation.txt rename to core/src/main/scala/com/microsoft/ml/spark/recommendation/recommendation.txt diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Batchers.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/Batchers.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/Batchers.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/Batchers.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Cacher.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/Cacher.scala similarity index 100% rename from 
src/main/scala/com/microsoft/ml/spark/stages/Cacher.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/Cacher.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/ClassBalancer.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/ClassBalancer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/ClassBalancer.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/ClassBalancer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/DropColumns.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/DropColumns.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/DropColumns.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/DropColumns.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.txt b/core/src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.txt rename to core/src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.txt diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Explode.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/Explode.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/Explode.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/Explode.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Lambda.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/Lambda.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/Lambda.scala rename to 
core/src/main/scala/com/microsoft/ml/spark/stages/Lambda.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/MiniBatchTransformer.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/MiniBatchTransformer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/MiniBatchTransformer.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/MiniBatchTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.txt b/core/src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.txt rename to core/src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.txt diff --git a/src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala similarity index 96% rename from src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala index 0e05283c7ba..2e5d435bf14 100644 --- a/src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala @@ -1,6 +1,3 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. 
- package com.microsoft.ml.spark.stages import java.util.concurrent.LinkedBlockingQueue @@ -19,6 +16,35 @@ import scala.concurrent.blocking object PartitionConsolidator extends DefaultParamsReadable[PartitionConsolidator] +class PartitionConsolidator(val uid: String) + extends Transformer with HTTPParams with HasInputCol + with HasOutputCol + with ComplexParamsWritable with BasicLogging { + logClass() + + def this() = this(Identifiable.randomUID("PartitionConsolidator")) + + val consolidatorHolder = SharedSingleton { + new Consolidator[Row]() + } + + override def transform(dataset: Dataset[_]): DataFrame = { + logTransform[DataFrame]({ + dataset.toDF().mapPartitions { it => + if (it.hasNext) { + consolidatorHolder.get.registerAndReceive(it).flatten + } else { + Iterator() + } + }(RowEncoder(dataset.schema)) + }) + } + + override def copy(extra: ParamMap): Transformer = defaultCopy(extra) + + override def transformSchema(schema: StructType): StructType = schema +} + class Consolidator[T] { val buffer = new LinkedBlockingQueue[T]() @@ -108,36 +134,8 @@ class Consolidator[T] { } -class PartitionConsolidator(val uid: String) - extends Transformer with HTTPParams with HasInputCol - with HasOutputCol - with ComplexParamsWritable with BasicLogging { - logClass() - - def this() = this(Identifiable.randomUID("PartitionConsolidator")) - - val consolidatorHolder = SharedSingleton { - new Consolidator[Row]() - } - - override def transform(dataset: Dataset[_]): DataFrame = { - logTransform[DataFrame]({ - dataset.toDF().mapPartitions { it => - if (it.hasNext) { - consolidatorHolder.get.registerAndReceive(it).flatten - } else { - Iterator() - } - }(RowEncoder(dataset.schema)) - }) - } - - override def copy(extra: ParamMap): Transformer = defaultCopy(extra) - - override def transformSchema(schema: StructType): StructType = schema -} - trait LocalAggregator[T] { def prep(iter: Iterator[Row]): T + def merge(ts: Seq[T]): T } diff --git 
a/src/main/scala/com/microsoft/ml/spark/stages/RenameColumn.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/RenameColumn.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/stages/RenameColumn.scala
rename to core/src/main/scala/com/microsoft/ml/spark/stages/RenameColumn.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Repartition.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/Repartition.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/stages/Repartition.scala
rename to core/src/main/scala/com/microsoft/ml/spark/stages/Repartition.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Repartition.txt b/core/src/main/scala/com/microsoft/ml/spark/stages/Repartition.txt
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/stages/Repartition.txt
rename to core/src/main/scala/com/microsoft/ml/spark/stages/Repartition.txt
diff --git a/src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.scala
rename to core/src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.txt b/core/src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.txt
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.txt
rename to core/src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.txt
diff --git a/src/main/scala/com/microsoft/ml/spark/stages/StratifiedRepartition.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/StratifiedRepartition.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/stages/StratifiedRepartition.scala
rename to core/src/main/scala/com/microsoft/ml/spark/stages/StratifiedRepartition.scala
diff --git
a/src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.scala
rename to core/src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.txt b/core/src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.txt
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.txt
rename to core/src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.txt
diff --git a/src/main/scala/com/microsoft/ml/spark/stages/TextPreprocessor.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/TextPreprocessor.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/stages/TextPreprocessor.scala
rename to core/src/main/scala/com/microsoft/ml/spark/stages/TextPreprocessor.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Timer.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/Timer.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/stages/Timer.scala
rename to core/src/main/scala/com/microsoft/ml/spark/stages/Timer.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/stages/UDFTransformer.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/UDFTransformer.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/stages/UDFTransformer.scala
rename to core/src/main/scala/com/microsoft/ml/spark/stages/UDFTransformer.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/stages/UnicodeNormalize.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/UnicodeNormalize.scala
similarity index 97%
rename from src/main/scala/com/microsoft/ml/spark/stages/UnicodeNormalize.scala
rename to core/src/main/scala/com/microsoft/ml/spark/stages/UnicodeNormalize.scala
index be12d2dcee8..889d1d85225 100644
---
a/src/main/scala/com/microsoft/ml/spark/stages/UnicodeNormalize.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/stages/UnicodeNormalize.scala @@ -1,79 +1,79 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package com.microsoft.ml.spark.stages - -import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} -import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap} -import org.apache.spark.ml.util.Identifiable -import org.apache.spark.sql.{DataFrame, Dataset} -import org.apache.spark.sql.functions.udf - -import java.text.Normalizer -import com.microsoft.ml.spark.codegen.Wrappable -import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasOutputCol} -import com.microsoft.ml.spark.logging.BasicLogging -import org.apache.spark.sql.types.{StringType, StructField, StructType} - -object UnicodeNormalize extends ComplexParamsReadable[UnicodeNormalize] - -/**
UnicodeNormalize takes a dataframe and normalizes the unicode representation.
- */
-class UnicodeNormalize(val uid: String) extends Transformer
- with HasInputCol with HasOutputCol with Wrappable with ComplexParamsWritable with BasicLogging {
- logClass()
-
- def this() = this(Identifiable.randomUID("UnicodeNormalize"))
-
- val form = new Param[String](this, "form", "Unicode normalization form: NFC, NFD, NFKC, NFKD")
-
- /** @group getParam */
- def getForm: String = get(form).getOrElse("NFKD")
-
- /** @group setParam */
- def setForm(value: String): this.type = {
- // check input value
- Normalizer.Form.valueOf(getForm)
-
- set("form", value)
- }
-
- val lower = new BooleanParam(this, "lower", "Lowercase text")
-
- /** @group getParam */
- def getLower: Boolean = get(lower).getOrElse(true)
-
- /** @group setParam */
- def setLower(value: Boolean): this.type = set("lower", value)
-
- /** @param dataset - The input dataset, to be transformed
- * @return The DataFrame that results from column selection
- */
- override def transform(dataset: Dataset[_]): DataFrame = {
- logTransform[DataFrame]({
- val inputIndex = dataset.columns.indexOf(getInputCol)
-
- require(inputIndex != -1, s"Input column $getInputCol does not exist")
-
- val normalizeFunc = (value: String) =>
- if (value == null) null
- else Normalizer.normalize(value, Normalizer.Form.valueOf(getForm))
-
- val f = if (getLower)
- (value: String) => Option(value).map(s => normalizeFunc(s.toLowerCase)).orNull
- else
- normalizeFunc
-
- val textMapper = udf(f)
-
- dataset.withColumn(getOutputCol, textMapper(dataset(getInputCol)).as(getOutputCol))
- })
- }
-
- def transformSchema(schema: StructType): StructType = {
- schema.add(StructField(getOutputCol, StringType))
- }
-
- def copy(extra: ParamMap): UnicodeNormalize = defaultCopy(extra)
-
-}
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
+
+package com.microsoft.ml.spark.stages
+
+import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer}
+import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap}
+import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.sql.{DataFrame, Dataset}
+import org.apache.spark.sql.functions.udf
+
+import java.text.Normalizer
+import com.microsoft.ml.spark.codegen.Wrappable
+import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasOutputCol}
+import com.microsoft.ml.spark.logging.BasicLogging
+import org.apache.spark.sql.types.{StringType, StructField, StructType}
+
+object UnicodeNormalize extends ComplexParamsReadable[UnicodeNormalize]
+
+/** UnicodeNormalize takes a dataframe and normalizes the unicode representation.
+ */
+class UnicodeNormalize(val uid: String) extends Transformer
+ with HasInputCol with HasOutputCol with Wrappable with ComplexParamsWritable with BasicLogging {
+ logClass()
+
+ def this() = this(Identifiable.randomUID("UnicodeNormalize"))
+
+ val form = new Param[String](this, "form", "Unicode normalization form: NFC, NFD, NFKC, NFKD")
+
+ /** @group getParam */
+ def getForm: String = get(form).getOrElse("NFKD")
+
+ /** @group setParam */
+ def setForm(value: String): this.type = {
+ // check input value
+ Normalizer.Form.valueOf(getForm)
+
+ set("form", value)
+ }
+
+ val lower = new BooleanParam(this, "lower", "Lowercase text")
+
+ /** @group getParam */
+ def getLower: Boolean = get(lower).getOrElse(true)
+
+ /** @group setParam */
+ def setLower(value: Boolean): this.type = set("lower", value)
+
+ /** @param dataset - The input dataset, to be transformed
+ * @return The DataFrame that results from column selection
+ */
+ override def transform(dataset: Dataset[_]): DataFrame = {
+ logTransform[DataFrame]({
+ val inputIndex = dataset.columns.indexOf(getInputCol)
+
+ require(inputIndex != -1, s"Input column $getInputCol does not exist")
+
+ val normalizeFunc = (value: String) =>
+ if (value == null) null
+ else Normalizer.normalize(value, Normalizer.Form.valueOf(getForm))
+
+ val f = if (getLower)
+ (value: String) => Option(value).map(s => normalizeFunc(s.toLowerCase)).orNull
+ else
+ normalizeFunc
+
+ val textMapper = udf(f)
+
+ dataset.withColumn(getOutputCol, textMapper(dataset(getInputCol)).as(getOutputCol))
+ })
+ }
+
+ def transformSchema(schema: StructType): StructType = {
+ schema.add(StructField(getOutputCol, StringType))
+ }
+
+ def copy(extra: ParamMap): UnicodeNormalize = defaultCopy(extra)
+
+}
diff --git a/src/main/scala/com/microsoft/ml/spark/stages/udfs.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/udfs.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/stages/udfs.scala
rename to core/src/main/scala/com/microsoft/ml/spark/stages/udfs.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/train/AutoTrainedModel.scala b/core/src/main/scala/com/microsoft/ml/spark/train/AutoTrainedModel.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/train/AutoTrainedModel.scala
rename to core/src/main/scala/com/microsoft/ml/spark/train/AutoTrainedModel.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/train/AutoTrainer.scala b/core/src/main/scala/com/microsoft/ml/spark/train/AutoTrainer.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/train/AutoTrainer.scala
rename to core/src/main/scala/com/microsoft/ml/spark/train/AutoTrainer.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.scala b/core/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.scala
rename to core/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.txt b/core/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.txt
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.txt
rename to core/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.txt
diff --git a/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.scala b/core/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.scala
rename to core/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.txt b/core/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.txt
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.txt
rename to core/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.txt
diff --git a/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.scala b/core/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.scala
rename to core/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.txt b/core/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.txt
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.txt
rename to core/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.txt
diff --git a/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.scala b/core/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.scala
rename to core/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.txt b/core/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.txt
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.txt
rename to core/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.txt
diff --git a/src/main/scala/org/apache/spark/lightgbm/BlockManagerUtils.scala b/core/src/main/scala/org/apache/spark/injections/BlockManagerUtils.scala
similarity index 69%
rename from src/main/scala/org/apache/spark/lightgbm/BlockManagerUtils.scala
rename to core/src/main/scala/org/apache/spark/injections/BlockManagerUtils.scala
index ee0ba74dd41..d2d8e46bfcf 100644
--- a/src/main/scala/org/apache/spark/lightgbm/BlockManagerUtils.scala
+++ b/core/src/main/scala/org/apache/spark/injections/BlockManagerUtils.scala
@@ -1,13 +1,11 @@
-// Copyright (C) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License. See LICENSE in project root for information.
-
-package org.apache.spark.lightgbm
+package org.apache.spark.injections
import org.apache.spark.sql.Dataset
import org.apache.spark.storage.BlockManager
object BlockManagerUtils {
/** Returns the block manager from the dataframe's spark context.
+ *
* @param data The dataframe to get the block manager from.
* @return The block manager.
*/
diff --git a/src/main/scala/org/apache/spark/injections/RegressionUtils.scala b/core/src/main/scala/org/apache/spark/injections/RegressionUtils.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/injections/RegressionUtils.scala
rename to core/src/main/scala/org/apache/spark/injections/RegressionUtils.scala
diff --git a/src/main/scala/org/apache/spark/injections/SConf.scala b/core/src/main/scala/org/apache/spark/injections/SConf.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/injections/SConf.scala
rename to core/src/main/scala/org/apache/spark/injections/SConf.scala
diff --git a/src/main/scala/org/apache/spark/injections/UDFUtils.scala b/core/src/main/scala/org/apache/spark/injections/UDFUtils.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/injections/UDFUtils.scala
rename to core/src/main/scala/org/apache/spark/injections/UDFUtils.scala
diff --git a/src/main/scala/org/apache/spark/ml/ComplexParamsSerializer.scala b/core/src/main/scala/org/apache/spark/ml/ComplexParamsSerializer.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/ComplexParamsSerializer.scala
rename to core/src/main/scala/org/apache/spark/ml/ComplexParamsSerializer.scala
diff --git a/src/main/scala/org/apache/spark/ml/LimeNamespaceInjections.scala b/core/src/main/scala/org/apache/spark/ml/LimeNamespaceInjections.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/LimeNamespaceInjections.scala
rename to core/src/main/scala/org/apache/spark/ml/LimeNamespaceInjections.scala
diff --git a/src/main/scala/org/apache/spark/ml/NamespaceInjections.scala b/core/src/main/scala/org/apache/spark/ml/NamespaceInjections.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/NamespaceInjections.scala
rename to core/src/main/scala/org/apache/spark/ml/NamespaceInjections.scala
diff --git a/src/main/scala/org/apache/spark/ml/Ranker.scala b/core/src/main/scala/org/apache/spark/ml/Ranker.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/Ranker.scala
rename to core/src/main/scala/org/apache/spark/ml/Ranker.scala
diff --git a/src/main/scala/org/apache/spark/ml/RegressorUtils.scala b/core/src/main/scala/org/apache/spark/ml/RegressorUtils.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/RegressorUtils.scala
rename to core/src/main/scala/org/apache/spark/ml/RegressorUtils.scala
diff --git a/src/main/scala/org/apache/spark/ml/Serializer.scala b/core/src/main/scala/org/apache/spark/ml/Serializer.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/Serializer.scala
rename to core/src/main/scala/org/apache/spark/ml/Serializer.scala
diff --git a/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.scala b/core/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.scala
rename to core/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.scala
diff --git a/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.txt b/core/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.txt
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.txt
rename to core/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.txt
diff --git a/src/main/scala/org/apache/spark/ml/param/ArrayMapParam.scala b/core/src/main/scala/org/apache/spark/ml/param/ArrayMapParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/ArrayMapParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/ArrayMapParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/ArrayParamMapParam.scala b/core/src/main/scala/org/apache/spark/ml/param/ArrayParamMapParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/ArrayParamMapParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/ArrayParamMapParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/BallTreeParam.scala b/core/src/main/scala/org/apache/spark/ml/param/BallTreeParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/BallTreeParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/BallTreeParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/ByteArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/ByteArrayParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/ByteArrayParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/ByteArrayParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/DataFrameParam.scala b/core/src/main/scala/org/apache/spark/ml/param/DataFrameParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/DataFrameParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/DataFrameParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/DataTypeParam.scala b/core/src/main/scala/org/apache/spark/ml/param/DataTypeParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/DataTypeParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/DataTypeParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/EstimatorArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/EstimatorArrayParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/EstimatorArrayParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/EstimatorArrayParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/EstimatorParam.scala b/core/src/main/scala/org/apache/spark/ml/param/EstimatorParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/EstimatorParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/EstimatorParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/EvaluatorParam.scala b/core/src/main/scala/org/apache/spark/ml/param/EvaluatorParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/EvaluatorParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/EvaluatorParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/JsonEncodableParam.scala b/core/src/main/scala/org/apache/spark/ml/param/JsonEncodableParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/JsonEncodableParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/JsonEncodableParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/MapArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/MapArrayParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/MapArrayParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/MapArrayParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/MapParam.scala b/core/src/main/scala/org/apache/spark/ml/param/MapParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/MapParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/MapParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/ParamSpaceParam.scala b/core/src/main/scala/org/apache/spark/ml/param/ParamSpaceParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/ParamSpaceParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/ParamSpaceParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/PipelineStageParam.scala b/core/src/main/scala/org/apache/spark/ml/param/PipelineStageParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/PipelineStageParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/PipelineStageParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/PythonWrappableParam.scala b/core/src/main/scala/org/apache/spark/ml/param/PythonWrappableParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/PythonWrappableParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/PythonWrappableParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/RWrappableParam.scala b/core/src/main/scala/org/apache/spark/ml/param/RWrappableParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/RWrappableParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/RWrappableParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/TransformerArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/TransformerArrayParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/TransformerArrayParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/TransformerArrayParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/TransformerParam.scala b/core/src/main/scala/org/apache/spark/ml/param/TransformerParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/TransformerParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/TransformerParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/TypedArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/TypedArrayParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/TypedArrayParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/TypedArrayParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/UDFParam.scala b/core/src/main/scala/org/apache/spark/ml/param/UDFParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/UDFParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/UDFParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/UDPyFParam.scala b/core/src/main/scala/org/apache/spark/ml/param/UDPyFParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/UDPyFParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/UDPyFParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/param/UntypedArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/UntypedArrayParam.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/param/UntypedArrayParam.scala
rename to core/src/main/scala/org/apache/spark/ml/param/UntypedArrayParam.scala
diff --git a/src/main/scala/org/apache/spark/ml/recommendation/RecommendationHelper.scala b/core/src/main/scala/org/apache/spark/ml/recommendation/RecommendationHelper.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/recommendation/RecommendationHelper.scala
rename to core/src/main/scala/org/apache/spark/ml/recommendation/RecommendationHelper.scala
diff --git a/src/main/scala/org/apache/spark/ml/source/image/PatchedImageFileFormat.scala b/core/src/main/scala/org/apache/spark/ml/source/image/PatchedImageFileFormat.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/ml/source/image/PatchedImageFileFormat.scala
rename to core/src/main/scala/org/apache/spark/ml/source/image/PatchedImageFileFormat.scala
diff --git a/src/main/scala/org/apache/spark/sql/execution/streaming/DistributedHTTPSource.scala b/core/src/main/scala/org/apache/spark/sql/execution/streaming/DistributedHTTPSource.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/sql/execution/streaming/DistributedHTTPSource.scala
rename to core/src/main/scala/org/apache/spark/sql/execution/streaming/DistributedHTTPSource.scala
diff --git a/src/main/scala/org/apache/spark/sql/execution/streaming/HTTPSource.scala b/core/src/main/scala/org/apache/spark/sql/execution/streaming/HTTPSource.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/sql/execution/streaming/HTTPSource.scala
rename to core/src/main/scala/org/apache/spark/sql/execution/streaming/HTTPSource.scala
diff --git a/src/main/scala/org/apache/spark/sql/execution/streaming/ServingUDFs.scala b/core/src/main/scala/org/apache/spark/sql/execution/streaming/ServingUDFs.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/sql/execution/streaming/ServingUDFs.scala
rename to core/src/main/scala/org/apache/spark/sql/execution/streaming/ServingUDFs.scala
diff --git a/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSinkV2.scala b/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSinkV2.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSinkV2.scala
rename to core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSinkV2.scala
diff --git a/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSourceV2.scala b/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSourceV2.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSourceV2.scala
rename to core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSourceV2.scala
diff --git a/src/main/scala/org/apache/spark/sql/types/injections/MetadataUtilities.scala b/core/src/main/scala/org/apache/spark/sql/types/injections/MetadataUtilities.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/sql/types/injections/MetadataUtilities.scala
rename to core/src/main/scala/org/apache/spark/sql/types/injections/MetadataUtilities.scala
diff --git a/src/main/scala/org/apache/spark/sql/types/injections/OptimizedCKNNFitting.scala b/core/src/main/scala/org/apache/spark/sql/types/injections/OptimizedCKNNFitting.scala
similarity index 100%
rename from src/main/scala/org/apache/spark/sql/types/injections/OptimizedCKNNFitting.scala
rename to core/src/main/scala/org/apache/spark/sql/types/injections/OptimizedCKNNFitting.scala
diff --git a/src/test/R/testthat.R b/core/src/test/R/testthat.R
similarity index 100%
rename from src/test/R/testthat.R
rename to core/src/test/R/testthat.R
diff --git a/src/test/R/testthat/setup-spark.R b/core/src/test/R/testthat/setup-spark.R
similarity index 100%
rename from src/test/R/testthat/setup-spark.R
rename to core/src/test/R/testthat/setup-spark.R
diff --git a/src/test/R/testthat/test-basic.R b/core/src/test/R/testthat/test-basic.R
similarity index 100%
rename from src/test/R/testthat/test-basic.R
rename to core/src/test/R/testthat/test-basic.R
diff --git a/src/test/python/LICENSE.txt b/core/src/test/python/LICENSE.txt
similarity index 100%
rename from src/test/python/LICENSE.txt
rename to core/src/test/python/LICENSE.txt
diff --git a/src/test/python/MANIFEST.in b/core/src/test/python/MANIFEST.in
similarity index 100%
rename from src/test/python/MANIFEST.in
rename to core/src/test/python/MANIFEST.in
diff --git a/src/main/python/mmlspark/lightgbm/__init__.py b/core/src/test/python/__init__.py
similarity index 100%
rename from src/main/python/mmlspark/lightgbm/__init__.py
rename to core/src/test/python/__init__.py
diff --git a/src/main/python/mmlspark/nn/__init__.py b/core/src/test/python/mmlsparktest/__init__.py
similarity index 100%
rename from src/main/python/mmlspark/nn/__init__.py
rename to core/src/test/python/mmlsparktest/__init__.py
diff --git a/src/main/python/mmlspark/opencv/__init__.py b/core/src/test/python/mmlsparktest/cyber/__init__.py
similarity index 100%
rename from src/main/python/mmlspark/opencv/__init__.py
rename to core/src/test/python/mmlsparktest/cyber/__init__.py
diff --git a/src/main/python/mmlspark/plot/__init__.py b/core/src/test/python/mmlsparktest/cyber/anamoly/__init__.py
similarity index 100%
rename from src/main/python/mmlspark/plot/__init__.py
rename to core/src/test/python/mmlsparktest/cyber/anamoly/__init__.py
diff --git a/src/test/python/mmlsparktest/cyber/anamoly/test_collaborative_filtering.py b/core/src/test/python/mmlsparktest/cyber/anamoly/test_collaborative_filtering.py
similarity index 100%
rename from src/test/python/mmlsparktest/cyber/anamoly/test_collaborative_filtering.py
rename to core/src/test/python/mmlsparktest/cyber/anamoly/test_collaborative_filtering.py
diff --git a/src/test/python/mmlsparktest/cyber/anamoly/test_complement_access.py b/core/src/test/python/mmlsparktest/cyber/anamoly/test_complement_access.py
similarity index 100%
rename from src/test/python/mmlsparktest/cyber/anamoly/test_complement_access.py
rename to core/src/test/python/mmlsparktest/cyber/anamoly/test_complement_access.py
diff --git a/src/test/python/mmlsparktest/cyber/explain_tester.py b/core/src/test/python/mmlsparktest/cyber/explain_tester.py
similarity index 100%
rename from src/test/python/mmlsparktest/cyber/explain_tester.py
rename to core/src/test/python/mmlsparktest/cyber/explain_tester.py
diff --git a/src/main/python/mmlspark/stages/__init__.py b/core/src/test/python/mmlsparktest/cyber/feature/__init__.py
similarity index 100%
rename from src/main/python/mmlspark/stages/__init__.py
rename to core/src/test/python/mmlsparktest/cyber/feature/__init__.py
diff --git a/src/test/python/mmlsparktest/cyber/feature/test_indexers.py b/core/src/test/python/mmlsparktest/cyber/feature/test_indexers.py
similarity index 100%
rename from src/test/python/mmlsparktest/cyber/feature/test_indexers.py
rename to core/src/test/python/mmlsparktest/cyber/feature/test_indexers.py
diff --git a/src/test/python/mmlsparktest/cyber/feature/test_scalers.py b/core/src/test/python/mmlsparktest/cyber/feature/test_scalers.py
similarity index 100%
rename from src/test/python/mmlsparktest/cyber/feature/test_scalers.py
rename to core/src/test/python/mmlsparktest/cyber/feature/test_scalers.py
diff --git a/src/main/python/mmlspark/vw/__init__.py b/core/src/test/python/mmlsparktest/cyber/utils/__init__.py
similarity index 100%
rename from src/main/python/mmlspark/vw/__init__.py
rename to core/src/test/python/mmlsparktest/cyber/utils/__init__.py
diff --git a/src/test/python/mmlsparktest/cyber/utils/test_spark_utils.py b/core/src/test/python/mmlsparktest/cyber/utils/test_spark_utils.py
similarity index 100%
rename from src/test/python/mmlsparktest/cyber/utils/test_spark_utils.py
rename to core/src/test/python/mmlsparktest/cyber/utils/test_spark_utils.py
diff --git a/src/test/__init__.py b/core/src/test/python/mmlsparktest/nn/__init__.py
similarity index 100%
rename from src/test/__init__.py
rename to core/src/test/python/mmlsparktest/nn/__init__.py
diff --git a/src/test/python/mmlsparktest/nn/test_ball_tree.py b/core/src/test/python/mmlsparktest/nn/test_ball_tree.py
similarity index 100%
rename from src/test/python/mmlsparktest/nn/test_ball_tree.py
rename to core/src/test/python/mmlsparktest/nn/test_ball_tree.py
diff --git a/src/test/python/__init__.py b/core/src/test/python/mmlsparktest/recommendation/__init__.py
similarity index 100%
rename from src/test/python/__init__.py
rename to core/src/test/python/mmlsparktest/recommendation/__init__.py
diff --git a/src/test/python/mmlsparktest/recommendation/test_ranking.py b/core/src/test/python/mmlsparktest/recommendation/test_ranking.py
similarity index 100%
rename from src/test/python/mmlsparktest/recommendation/test_ranking.py
rename to core/src/test/python/mmlsparktest/recommendation/test_ranking.py
diff --git a/src/test/python/mmlsparktest/spark.py b/core/src/test/python/mmlsparktest/spark.py
similarity index 100%
rename from src/test/python/mmlsparktest/spark.py
rename to core/src/test/python/mmlsparktest/spark.py
diff --git a/src/test/python/setup.py b/core/src/test/python/setup.py
similarity index 100%
rename from src/test/python/setup.py
rename to core/src/test/python/setup.py
diff --git a/src/test/resources/audio1.txt b/core/src/test/resources/audio1.txt
similarity index 100%
rename from src/test/resources/audio1.txt
rename to core/src/test/resources/audio1.txt
diff --git a/src/test/resources/audio1.wav b/core/src/test/resources/audio1.wav
similarity index 100%
rename from src/test/resources/audio1.wav
rename to core/src/test/resources/audio1.wav
diff --git a/src/test/resources/audio2.txt b/core/src/test/resources/audio2.txt
similarity index 100%
rename from src/test/resources/audio2.txt
rename to core/src/test/resources/audio2.txt
diff --git a/src/test/resources/audio2.wav b/core/src/test/resources/audio2.wav
similarity index 100%
rename from src/test/resources/audio2.wav
rename to core/src/test/resources/audio2.wav
diff --git a/src/test/resources/audio3.mp3 b/core/src/test/resources/audio3.mp3
similarity index 100%
rename from src/test/resources/audio3.mp3
rename to core/src/test/resources/audio3.mp3
diff --git a/src/test/resources/audio3.txt b/core/src/test/resources/audio3.txt
similarity index 100%
rename from src/test/resources/audio3.txt
rename to core/src/test/resources/audio3.txt
diff --git a/src/test/resources/audio4.txt b/core/src/test/resources/audio4.txt
similarity index 100%
rename from src/test/resources/audio4.txt
rename to core/src/test/resources/audio4.txt
diff --git a/src/test/resources/benchmarks/benchmarkBasicDataTypes.json b/core/src/test/resources/benchmarks/benchmarkBasicDataTypes.json
similarity index 100%
rename from src/test/resources/benchmarks/benchmarkBasicDataTypes.json
rename to core/src/test/resources/benchmarks/benchmarkBasicDataTypes.json
diff --git a/src/test/resources/benchmarks/benchmarkDate.json b/core/src/test/resources/benchmarks/benchmarkDate.json
similarity index 100%
rename from src/test/resources/benchmarks/benchmarkDate.json
rename to core/src/test/resources/benchmarks/benchmarkDate.json
diff --git a/src/test/resources/benchmarks/benchmarkNoOneHot.json b/core/src/test/resources/benchmarks/benchmarkNoOneHot.json
similarity index 100%
rename from src/test/resources/benchmarks/benchmarkNoOneHot.json
rename to core/src/test/resources/benchmarks/benchmarkNoOneHot.json
diff --git a/src/test/resources/benchmarks/benchmarkOneHot.json b/core/src/test/resources/benchmarks/benchmarkOneHot.json
similarity index 100%
rename from src/test/resources/benchmarks/benchmarkOneHot.json
rename to core/src/test/resources/benchmarks/benchmarkOneHot.json
diff --git a/src/test/resources/benchmarks/benchmarkString.json b/core/src/test/resources/benchmarks/benchmarkString.json
similarity index 100%
rename from src/test/resources/benchmarks/benchmarkString.json
rename to core/src/test/resources/benchmarks/benchmarkString.json
diff --git a/src/test/resources/benchmarks/benchmarkStringIndexOneHot.json b/core/src/test/resources/benchmarks/benchmarkStringIndexOneHot.json
similarity index 100%
rename from src/test/resources/benchmarks/benchmarkStringIndexOneHot.json
rename to core/src/test/resources/benchmarks/benchmarkStringIndexOneHot.json
diff --git a/src/test/resources/benchmarks/benchmarkStringMissing.json b/core/src/test/resources/benchmarks/benchmarkStringMissing.json
similarity index 100%
rename from src/test/resources/benchmarks/benchmarkStringMissing.json
rename to core/src/test/resources/benchmarks/benchmarkStringMissing.json
diff --git a/src/test/resources/benchmarks/benchmarkVectors.json b/core/src/test/resources/benchmarks/benchmarkVectors.json
similarity index 100%
rename from src/test/resources/benchmarks/benchmarkVectors.json
rename to core/src/test/resources/benchmarks/benchmarkVectors.json
diff --git a/src/test/resources/benchmarks/benchmarks_VerifyTrainClassifier.csv b/core/src/test/resources/benchmarks/benchmarks_VerifyTrainClassifier.csv
similarity index 100%
rename from src/test/resources/benchmarks/benchmarks_VerifyTrainClassifier.csv
rename to core/src/test/resources/benchmarks/benchmarks_VerifyTrainClassifier.csv
diff --git a/src/test/resources/benchmarks/benchmarks_VerifyTuneHyperparameters.csv b/core/src/test/resources/benchmarks/benchmarks_VerifyTuneHyperparameters.csv
similarity index 100%
rename from src/test/resources/benchmarks/benchmarks_VerifyTuneHyperparameters.csv
rename to core/src/test/resources/benchmarks/benchmarks_VerifyTuneHyperparameters.csv
diff --git a/src/test/resources/benchmarks/benchmarks_VerifyVowpalWabbitRegressor.csv b/core/src/test/resources/benchmarks/benchmarks_VerifyVowpalWabbitRegressor.csv
similarity index 100%
rename from src/test/resources/benchmarks/benchmarks_VerifyVowpalWabbitRegressor.csv
rename to core/src/test/resources/benchmarks/benchmarks_VerifyVowpalWabbitRegressor.csv
diff --git a/src/test/resources/demoUsage.csv.gz b/core/src/test/resources/demoUsage.csv.gz
similarity index 100%
rename from src/test/resources/demoUsage.csv.gz
rename to core/src/test/resources/demoUsage.csv.gz
diff --git a/src/test/resources/dialogue.mp3 b/core/src/test/resources/dialogue.mp3
similarity index 100%
rename from src/test/resources/dialogue.mp3
rename to core/src/test/resources/dialogue.mp3
diff --git a/src/test/resources/lily.wav b/core/src/test/resources/lily.wav
similarity index 100%
rename from src/test/resources/lily.wav
rename to core/src/test/resources/lily.wav
diff --git a/src/test/resources/mark.wav b/core/src/test/resources/mark.wav
similarity index 100%
rename from src/test/resources/mark.wav
rename to core/src/test/resources/mark.wav
diff --git a/src/test/resources/sim_count1.csv.gz b/core/src/test/resources/sim_count1.csv.gz
similarity index 100%
rename from src/test/resources/sim_count1.csv.gz
rename to core/src/test/resources/sim_count1.csv.gz
diff --git a/src/test/resources/sim_count3.csv.gz b/core/src/test/resources/sim_count3.csv.gz
similarity index 100%
rename from src/test/resources/sim_count3.csv.gz
rename to core/src/test/resources/sim_count3.csv.gz
diff --git a/src/test/resources/sim_jac1.csv.gz b/core/src/test/resources/sim_jac1.csv.gz
similarity index 100%
rename from src/test/resources/sim_jac1.csv.gz
rename to core/src/test/resources/sim_jac1.csv.gz
diff --git a/src/test/resources/sim_jac3.csv.gz b/core/src/test/resources/sim_jac3.csv.gz
similarity index 100%
rename from src/test/resources/sim_jac3.csv.gz
rename to core/src/test/resources/sim_jac3.csv.gz
diff --git a/src/test/resources/sim_lift1.csv.gz b/core/src/test/resources/sim_lift1.csv.gz
similarity index 100%
rename from src/test/resources/sim_lift1.csv.gz
rename to core/src/test/resources/sim_lift1.csv.gz
diff --git a/src/test/resources/sim_lift3.csv.gz b/core/src/test/resources/sim_lift3.csv.gz
similarity index 100%
rename from src/test/resources/sim_lift3.csv.gz
rename to core/src/test/resources/sim_lift3.csv.gz
diff --git a/src/test/resources/user_aff.csv.gz b/core/src/test/resources/user_aff.csv.gz
similarity index 100%
rename from src/test/resources/user_aff.csv.gz
rename to core/src/test/resources/user_aff.csv.gz
diff --git a/src/test/resources/userpred_count3_userid_only.csv.gz b/core/src/test/resources/userpred_count3_userid_only.csv.gz
similarity index 100%
rename from src/test/resources/userpred_count3_userid_only.csv.gz
rename to core/src/test/resources/userpred_count3_userid_only.csv.gz
diff --git a/src/test/resources/userpred_jac3_userid_only.csv.gz b/core/src/test/resources/userpred_jac3_userid_only.csv.gz
similarity index 100%
rename from src/test/resources/userpred_jac3_userid_only.csv.gz
rename to core/src/test/resources/userpred_jac3_userid_only.csv.gz
diff --git a/src/test/resources/userpred_lift3_userid_only.csv.gz b/core/src/test/resources/userpred_lift3_userid_only.csv.gz
similarity index 100%
rename from src/test/resources/userpred_lift3_userid_only.csv.gz
rename to core/src/test/resources/userpred_lift3_userid_only.csv.gz
diff --git a/src/test/scala/com/microsoft/ml/spark/Secrets.scala b/core/src/test/scala/com/microsoft/ml/spark/Secrets.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/Secrets.scala
rename to core/src/test/scala/com/microsoft/ml/spark/Secrets.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/automl/VerifyFindBestModel.scala b/core/src/test/scala/com/microsoft/ml/spark/automl/VerifyFindBestModel.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/automl/VerifyFindBestModel.scala
rename to core/src/test/scala/com/microsoft/ml/spark/automl/VerifyFindBestModel.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/automl/VerifyTuneHyperparameters.scala b/core/src/test/scala/com/microsoft/ml/spark/automl/VerifyTuneHyperparameters.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/automl/VerifyTuneHyperparameters.scala
rename to core/src/test/scala/com/microsoft/ml/spark/automl/VerifyTuneHyperparameters.scala
diff --git a/core/src/test/scala/com/microsoft/ml/spark/codegen/TestGen.scala b/core/src/test/scala/com/microsoft/ml/spark/codegen/TestGen.scala
new file mode 100644
index 00000000000..6da7e4f4352
--- /dev/null
+++ b/core/src/test/scala/com/microsoft/ml/spark/codegen/TestGen.scala
@@ -0,0 +1,47 @@
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
+
+package com.microsoft.ml.spark.codegen
+
+import java.io.File
+
+import com.microsoft.ml.spark.codegen.Config._
+import com.microsoft.ml.spark.core.env.FileUtilities._
+import com.microsoft.ml.spark.core.test.base.TestBase
+import com.microsoft.ml.spark.core.test.fuzzing.PyTestFuzzing
+import com.microsoft.ml.spark.core.utils.JarLoadingUtils.instantiateServices
+import org.apache.commons.io.FileUtils
+
+
+object TestGen {
+
+ import CodeGenUtils._
+
+ def generatePythonTests(): Unit = {
+ instantiateServices[PyTestFuzzing[_]]().foreach { ltc =>
+ try {
+ ltc.makePyTestFile()
+ } catch {
+ case _: NotImplementedError =>
+ println(s"ERROR: Could not generate test for ${ltc.testClassName} because of Complex Parameters")
+ }
+ }
+ }
+
+ private def makeInitFiles(packageFolder: String = ""): Unit = {
+ val dir = new File(new File(PyTestDir, "mmlsparktest"), packageFolder)
+ writeFile(new File(dir, "__init__.py"), "")
+ dir.listFiles().filter(_.isDirectory).foreach(f =>
+ makeInitFiles(packageFolder + "/" + f.getName)
+ )
+ }
+
+ def main(args: Array[String]): Unit = {
+ clean(TestDataDir)
+ clean(PyTestDir)
+ generatePythonTests()
+ TestBase.stopSparkSession()
+ FileUtils.copyDirectoryToDirectory(toDir(PyTestOverrideDir), toDir(PyTestDir))
+ makeInitFiles()
+ }
+}
diff --git a/src/test/scala/com/microsoft/ml/spark/core/ml/HashingTFSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/core/ml/HashingTFSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/ml/HashingTFSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/ml/HashingTFSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/ml/IDFSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/core/ml/IDFSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/ml/IDFSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/ml/IDFSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/ml/NGramSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/core/ml/NGramSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/ml/NGramSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/ml/NGramSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/ml/OneHotEncoderSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/core/ml/OneHotEncoderSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/ml/OneHotEncoderSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/ml/OneHotEncoderSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/ml/Word2VecSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/core/ml/Word2VecSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/ml/Word2VecSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/ml/Word2VecSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/schema/SparkBindingsTest.scala b/core/src/test/scala/com/microsoft/ml/spark/core/schema/SparkBindingsTest.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/schema/SparkBindingsTest.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/schema/SparkBindingsTest.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/schema/TestCategoricals.scala b/core/src/test/scala/com/microsoft/ml/spark/core/schema/TestCategoricals.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/schema/TestCategoricals.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/schema/TestCategoricals.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/schema/VerifyFastVectorAssembler.scala b/core/src/test/scala/com/microsoft/ml/spark/core/schema/VerifyFastVectorAssembler.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/schema/VerifyFastVectorAssembler.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/schema/VerifyFastVectorAssembler.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/schema/VerifySparkSchema.scala b/core/src/test/scala/com/microsoft/ml/spark/core/schema/VerifySparkSchema.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/schema/VerifySparkSchema.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/schema/VerifySparkSchema.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/serialize/ValidateComplexParamSerializer.scala b/core/src/test/scala/com/microsoft/ml/spark/core/serialize/ValidateComplexParamSerializer.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/serialize/ValidateComplexParamSerializer.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/serialize/ValidateComplexParamSerializer.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala b/core/src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/test/base/TestBase.scala b/core/src/test/scala/com/microsoft/ml/spark/core/test/base/TestBase.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/test/base/TestBase.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/test/base/TestBase.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/test/benchmarks/Benchmarks.scala b/core/src/test/scala/com/microsoft/ml/spark/core/test/benchmarks/Benchmarks.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/test/benchmarks/Benchmarks.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/test/benchmarks/Benchmarks.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala b/core/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/FuzzingTest.scala b/core/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/FuzzingTest.scala
similarity index 98%
rename from src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/FuzzingTest.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/FuzzingTest.scala
index ce573f761d9..67d31910dc9 100644
--- a/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/FuzzingTest.scala
+++ b/core/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/FuzzingTest.scala
@@ -257,17 +257,17 @@ class FuzzingTest extends TestBase {
// set the context loader to pick up on the jars
//Thread.currentThread().setContextClassLoader(JarLoadingUtils.classLoader)
- private lazy val readers: List[MLReadable[_]] = JarLoadingUtils.instantiateObjects[MLReadable[_]]
+ private lazy val readers: List[MLReadable[_]] = JarLoadingUtils.instantiateObjects[MLReadable[_]]()
- private lazy val pipelineStages: List[PipelineStage] = JarLoadingUtils.instantiateServices[PipelineStage]
+ private lazy val pipelineStages: List[PipelineStage] = JarLoadingUtils.instantiateServices[PipelineStage]()
private lazy val experimentFuzzers: List[ExperimentFuzzing[_ <: PipelineStage]] =
- JarLoadingUtils.instantiateServices[ExperimentFuzzing[_ <: PipelineStage]]
+ JarLoadingUtils.instantiateServices[ExperimentFuzzing[_ <: PipelineStage]]()
private lazy val serializationFuzzers: List[SerializationFuzzing[_ <: PipelineStage with MLWritable]] =
- JarLoadingUtils.instantiateServices[SerializationFuzzing[_ <: PipelineStage with MLWritable]]
+ JarLoadingUtils.instantiateServices[SerializationFuzzing[_ <: PipelineStage with MLWritable]]()
private lazy val pytestFuzzers: List[PyTestFuzzing[_ <: PipelineStage]] =
- JarLoadingUtils.instantiateServices[PyTestFuzzing[_ <: PipelineStage]]
+ JarLoadingUtils.instantiateServices[PyTestFuzzing[_ <: PipelineStage]]()
}
diff --git a/src/test/scala/com/microsoft/ml/spark/core/utils/VerifyClusterUtil.scala b/core/src/test/scala/com/microsoft/ml/spark/core/utils/VerifyClusterUtil.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/utils/VerifyClusterUtil.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/utils/VerifyClusterUtil.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCleanMissingData.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCleanMissingData.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/featurize/VerifyCleanMissingData.scala
rename to core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCleanMissingData.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCountSelector.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCountSelector.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/featurize/VerifyCountSelector.scala
rename to core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCountSelector.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyDataConversion.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyDataConversion.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/featurize/VerifyDataConversion.scala
rename to core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyDataConversion.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyFeaturize.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyFeaturize.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/featurize/VerifyFeaturize.scala
rename to core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyFeaturize.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyValueIndexer.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyValueIndexer.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/featurize/VerifyValueIndexer.scala
rename to core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyValueIndexer.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/text/MultiNGramSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/text/MultiNGramSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/featurize/text/MultiNGramSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/featurize/text/MultiNGramSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/text/PageSplitterSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/text/PageSplitterSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/featurize/text/PageSplitterSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/featurize/text/PageSplitterSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizerSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizerSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizerSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizerSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/flaky/PartitionConsolidatorSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/flaky/PartitionConsolidatorSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/flaky/PartitionConsolidatorSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/flaky/PartitionConsolidatorSuite.scala
diff --git a/core/src/test/scala/com/microsoft/ml/spark/image/ImageTestUtils.scala b/core/src/test/scala/com/microsoft/ml/spark/image/ImageTestUtils.scala
new file mode 100644
index 00000000000..a9ae8cd4f7e
--- /dev/null
+++ b/core/src/test/scala/com/microsoft/ml/spark/image/ImageTestUtils.scala
@@ -0,0 +1,106 @@
+package com.microsoft.ml.spark.image
+
+import java.io.File
+import java.net.URL
+
+import com.microsoft.ml.spark.build.BuildInfo
+import com.microsoft.ml.spark.core.env.FileUtilities
+import org.apache.spark.ml.linalg.DenseVector
+import org.apache.spark.sql.{DataFrame, SparkSession}
+import com.microsoft.ml.spark.core.test.base.TestBase
+import com.microsoft.ml.spark.io.IOImplicits.dfrToDfre
+import org.apache.commons.io.FileUtils
+import org.apache.spark.sql.functions.col
+
+trait ImageTestUtils extends TestBase {
+
+ val filesRoot = BuildInfo.datasetDir.toString
+ val imagePath = FileUtilities.join(filesRoot, "Images", "CIFAR").toString
+ val modelPath = FileUtilities.join(filesRoot, "CNTKModel", "ConvNet_CIFAR10.model").toString
+ val inputCol = "cntk_images"
+ val outputCol = "out"
+ val labelCol = "labels"
+
+ val featureVectorLength = 3 * 32 * 32
+ lazy val saveFile = new File(tmpDir.toFile, "spark-z.model").toString
+
+ def testModelDF(spark: SparkSession): DataFrame = {
+ import spark.implicits._
+ spark.sparkContext.parallelize(Seq(
+ Array(1.32165250, -2.1215112, 0.63150704, 0.77315974, -1.28163720,
+ -0.20210080, -2.2839167, -2.08691480, 5.08418200, -1.33741090),
+ Array(3.44079640, 1.4877119, -0.74059330, -0.34381202, -2.48724990,
+ -2.62866950, -3.1693816, -3.14182600, 4.76314800, 0.68712880),
+ Array(-1.88747900, -4.7685330, 0.15169683, 6.80547570, -0.38405967,
+ 3.41065170, 1.3302778, -0.87714905, -2.18046050, -4.16661830),
+ Array(5.01010300, 3.9860306, -1.36795600, -0.89830830, -4.49545430,
+ -4.19537070, -4.4045380, -5.81759450, 6.93805700, 1.49001510),
+ Array(-4.70754600, -6.0414960, 1.20658250, 5.40738300, 1.07661690,
+ 4.71566440, 4.3834330, -1.57187440, -2.96569730, -5.43208270),
+ Array(-1.23873880, -3.2042341, 2.54533000, 5.51954800, 2.89042470,
+ 0.12380804, 3.8639085, -4.79466800, -2.41463420, -5.17418430))).toDF
+ }
+
+ def testImages(spark: SparkSession): DataFrame = {
+ val images = spark.read.image.load(imagePath)
+
+ val unroll = new UnrollImage().setInputCol("image").setOutputCol(inputCol)
+
+ unroll.transform(images).select(inputCol)
+ }
+
+ def makeFakeData(spark: SparkSession, rows: Int, size: Int, outputDouble: Boolean = false): DataFrame = {
+ import spark.implicits._
+ if (outputDouble) {
+ List
+ .fill(rows)(List.fill(size)(0.0).toArray)
+ .zip(List.fill(rows)(0.0))
+ .toDF(inputCol, labelCol)
+ } else {
+ List
+ .fill(rows)(List.fill(size)(0.0.toFloat).toArray)
+ .zip(List.fill(rows)(0.0))
+ .toDF(inputCol, labelCol)
+ }
+ }
+
+ protected def compareToTestModel(result: DataFrame) = {
+ //TODO improve checks
+ assert(result.columns.toSet == Set(inputCol, outputCol))
+ assert(result.count() == testModelDF(result.sparkSession).count())
+ val max = result
+ .select(outputCol)
+ .collect()
+ .map(row => row.getAs[DenseVector](0).toArray.max)
+ .max
+ assert(max < 10 & max > -10)
+ }
+
+ lazy val images: DataFrame = spark.read.image.load(imagePath)
+ .withColumnRenamed("image", inputCol)
+ lazy val binaryImages: DataFrame = spark.read.binary.load(imagePath)
+ .select(col("value.bytes").alias(inputCol))
+
+ lazy val groceriesPath = FileUtilities.join(BuildInfo.datasetDir, "Images","Grocery")
+ lazy val groceryImages: DataFrame = spark.read.image
+ .option("dropInvalid", true)
+ .load(groceriesPath + "**")
+ .withColumnRenamed("image", inputCol)
+
+ lazy val greyscaleImageLocation: String = {
+ val loc = "/tmp/greyscale.jpg"
+ val f = new File(loc)
+ if (f.exists()) {f.delete()}
+ FileUtils.copyURLToFile(new URL("https://mmlspark.blob.core.windows.net/datasets/LIME/greyscale.jpg"), f)
+ loc
+ }
+
+ lazy val greyscaleImage: DataFrame = spark
+ .read.image.load(greyscaleImageLocation)
+ .select(col("image").alias(inputCol))
+
+ lazy val greyscaleBinary: DataFrame = spark
+ .read.binary.load(greyscaleImageLocation)
+ .select(col("value.bytes").alias(inputCol))
+
+}
diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/BinaryFileReaderSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/BinaryFileReaderSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/io/split1/BinaryFileReaderSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/BinaryFileReaderSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/HTTPTransformerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/HTTPTransformerSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/io/split1/HTTPTransformerSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/HTTPTransformerSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala
similarity index 99%
rename from src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala
index 13592cec90b..b611ef5158e 100644
--- a/src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala
+++ b/core/src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala
@@ -5,7 +5,7 @@ package com.microsoft.ml.spark.io.split1
import java.io.{File, FileInputStream}
-import com.microsoft.ml.spark.cognitive.OsUtils
+import com.microsoft.ml.spark.core.utils.OsUtils
import com.microsoft.ml.spark.core.env.FileUtilities
import com.microsoft.ml.spark.core.schema.ImageSchemaUtils
import com.microsoft.ml.spark.core.test.base.TestBase
diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/ParserSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/ParserSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/io/split1/ParserSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/ParserSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/PowerBiSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/PowerBiSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/io/split1/PowerBiSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/PowerBiSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/SimpleHTTPTransformerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/SimpleHTTPTransformerSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/io/split1/SimpleHTTPTransformerSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/SimpleHTTPTransformerSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/io/split2/ContinuousHTTPSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split2/ContinuousHTTPSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/io/split2/ContinuousHTTPSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/io/split2/ContinuousHTTPSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/io/split2/DistributedHTTPSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split2/DistributedHTTPSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/io/split2/DistributedHTTPSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/io/split2/DistributedHTTPSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/io/split2/HTTPSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPv2Suite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPv2Suite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/io/split2/HTTPv2Suite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPv2Suite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala b/core/src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala
rename to core/src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala
diff --git a/core/src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala b/core/src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala
new file mode 100644
index 00000000000..b58e597944b
--- /dev/null
+++ b/core/src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala
@@ -0,0 +1,66 @@
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
+
+package com.microsoft.ml.spark.lime
+
+import breeze.linalg.{*, DenseMatrix}
+import breeze.stats.distributions.Rand
+import com.microsoft.ml.spark.core.test.base.TestBase
+import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject, TransformerFuzzing}
+import org.apache.spark.ml.linalg.DenseVector
+import org.apache.spark.ml.param.DataFrameEquality
+import org.apache.spark.ml.regression.LinearRegression
+import org.apache.spark.ml.util.MLReadable
+
+trait LimeTestBase extends TestBase {
+
+ import spark.implicits._
+
+ lazy val nRows = 100
+ lazy val d1 = 3
+ lazy val d2 = 1
+
+ lazy val m: DenseMatrix[Double] = new DenseMatrix(d1, d2, Array(1.0, -1.0, 2.0))
+ lazy val x: DenseMatrix[Double] = DenseMatrix.rand(nRows, d1, Rand.gaussian)
+ lazy val noise: DenseMatrix[Double] = DenseMatrix.rand(nRows, d2, Rand.gaussian) * 0.1
+ lazy val y = x * m //+ noise
+
+ lazy val xRows = x(*, ::).iterator.toSeq.map(dv => new DenseVector(dv.toArray))
+ lazy val yRows = y(*, ::).iterator.toSeq.map(dv => dv(0))
+ lazy val df = xRows.zip(yRows).toDF("features", "label")
+
+ lazy val model = new LinearRegression().fit(df)
+
+ lazy val lime = new TabularLIME()
+ .setModel(model)
+ .setInputCol("features")
+ .setPredictionCol(model.getPredictionCol)
+ .setOutputCol("out")
+ .setNSamples(1000)
+
+ lazy val limeModel = lime.fit(df)
+}
+
+class TabularLIMESuite extends EstimatorFuzzing[TabularLIME] with
+ DataFrameEquality with LimeTestBase {
+
+ test("text lime usage test check") {
+ val results = limeModel.transform(df).select("out")
+ .collect().map(_.getAs[DenseVector](0))
+ results.foreach(result => assert(result === new DenseVector(m.data)))
+ }
+
+ override def testObjects(): Seq[TestObject[TabularLIME]] = Seq(new TestObject(lime, df))
+
+ override def reader: MLReadable[_] = TabularLIME
+
+ override def modelReader: MLReadable[_] = TabularLIMEModel
+}
+
+class TabularLIMEModelSuite extends TransformerFuzzing[TabularLIMEModel] with
+ DataFrameEquality with LimeTestBase {
+
+ override def testObjects(): Seq[TestObject[TabularLIMEModel]] = Seq(new TestObject(limeModel, df))
+
+ override def reader: MLReadable[_] = TabularLIMEModel
+}
diff --git a/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala
similarity index 96%
rename from src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala
index 5d2c26e330f..289720f9691 100644
--- a/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala
+++ b/core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala
@@ -7,13 +7,13 @@ import java.awt.Color
import java.awt.image.BufferedImage
import java.io.File
-import com.microsoft.ml.spark.cntk.CNTKTestUtils
+import com.microsoft.ml.spark.image.ImageTestUtils
import com.microsoft.ml.spark.io.image.ImageUtils
import javax.imageio.ImageIO
import scala.util.Random
-class SuperpixelSuite extends CNTKTestUtils {
+class SuperpixelSuite extends ImageTestUtils {
lazy val sp1 = new Superpixel(img, 16, 130)
lazy val sp2 = new Superpixel(img2, 100, 130)
diff --git a/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala
similarity index 90%
rename from src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala
index 881aefed41a..0c4a5b78d0b 100644
--- a/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala
+++ b/core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala
@@ -4,12 +4,12 @@
package com.microsoft.ml.spark.lime
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
-import com.microsoft.ml.spark.image.NetworkUtils
+import com.microsoft.ml.spark.image.ImageTestUtils
import com.microsoft.ml.spark.io.split1.FileReaderUtils
import org.apache.spark.ml.util.MLReadable
class SuperpixelTransformerSuite extends TransformerFuzzing[SuperpixelTransformer]
- with NetworkUtils with FileReaderUtils {
+ with ImageTestUtils with FileReaderUtils {
lazy val spt: SuperpixelTransformer = new SuperpixelTransformer().setInputCol(inputCol)
test("basic functionality"){
diff --git a/src/test/scala/com/microsoft/ml/spark/lime/TextLIMESuite.scala b/core/src/test/scala/com/microsoft/ml/spark/lime/TextLIMESuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/lime/TextLIMESuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/lime/TextLIMESuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala b/core/src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala
similarity index 98%
rename from src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala
rename to core/src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala
index 641afac6265..359fc06babf 100644
--- a/src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala
+++ b/core/src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala
@@ -42,9 +42,7 @@ object DatabricksUtilities extends HasHttpClient {
val Folder = s"/MMLSparkBuild/build_${BuildInfo.version}"
// MMLSpark info
- val TruncatedScalaVersion: String = BuildInfo.scalaVersion
- .split(".".toCharArray.head).dropRight(1).mkString(".")
- val Version = s"com.microsoft.ml.spark:${BuildInfo.name}_$TruncatedScalaVersion:${BuildInfo.version}"
+ val Version = s"com.microsoft.ml.spark:mmlspark:${BuildInfo.version}"
val Repository = "https://mmlspark.azureedge.net/maven"
val Libraries: String = List(
diff --git a/src/test/scala/com/microsoft/ml/spark/nbtest/NotebookTests.scala b/core/src/test/scala/com/microsoft/ml/spark/nbtest/NotebookTests.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/nbtest/NotebookTests.scala
rename to core/src/test/scala/com/microsoft/ml/spark/nbtest/NotebookTests.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/nbtest/SprayUtilities.scala b/core/src/test/scala/com/microsoft/ml/spark/nbtest/SprayUtilities.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/nbtest/SprayUtilities.scala
rename to core/src/test/scala/com/microsoft/ml/spark/nbtest/SprayUtilities.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/nn/BallTreeTest.scala b/core/src/test/scala/com/microsoft/ml/spark/nn/BallTreeTest.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/nn/BallTreeTest.scala
rename to core/src/test/scala/com/microsoft/ml/spark/nn/BallTreeTest.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/nn/ConditionalBallTreeTest.scala b/core/src/test/scala/com/microsoft/ml/spark/nn/ConditionalBallTreeTest.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/nn/ConditionalBallTreeTest.scala
rename to core/src/test/scala/com/microsoft/ml/spark/nn/ConditionalBallTreeTest.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/nn/KNNTest.scala b/core/src/test/scala/com/microsoft/ml/spark/nn/KNNTest.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/nn/KNNTest.scala
rename to core/src/test/scala/com/microsoft/ml/spark/nn/KNNTest.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/RankingAdapterSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingAdapterSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/recommendation/RankingAdapterSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingAdapterSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/RankingEvaluatorSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingEvaluatorSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/recommendation/RankingEvaluatorSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingEvaluatorSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTestBase.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTestBase.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/recommendation/RankingTestBase.scala
rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTestBase.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexerSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexerSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexerSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexerSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/SARSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/SARSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/recommendation/SARSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/SARSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/BatchIteratorSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/BatchIteratorSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/BatchIteratorSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/BatchIteratorSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/CacherSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/CacherSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/CacherSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/CacherSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/ClassBalancerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/ClassBalancerSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/ClassBalancerSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/ClassBalancerSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/EnsembleByKeySuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/EnsembleByKeySuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/EnsembleByKeySuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/EnsembleByKeySuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/ExplodeSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/ExplodeSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/ExplodeSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/ExplodeSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/LambdaSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/LambdaSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/LambdaSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/LambdaSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/MiniBatchTransformerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/MiniBatchTransformerSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/MiniBatchTransformerSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/MiniBatchTransformerSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/MultiColumnAdapterSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/MultiColumnAdapterSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/MultiColumnAdapterSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/MultiColumnAdapterSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/RenameColumnSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/RenameColumnSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/RenameColumnSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/RenameColumnSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/RepartitionSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/RepartitionSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/RepartitionSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/RepartitionSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/SelectColumnsSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/SelectColumnsSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/SelectColumnsSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/SelectColumnsSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/StratifiedRepartitionSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/StratifiedRepartitionSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/StratifiedRepartitionSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/StratifiedRepartitionSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/SummarizeDataSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/SummarizeDataSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/SummarizeDataSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/SummarizeDataSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/TextPreprocessorSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/TextPreprocessorSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/TextPreprocessorSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/TextPreprocessorSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/TimerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/TimerSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/TimerSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/TimerSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/UDFSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/UDFSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/UDFSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/UDFSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/UDFTransformerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/UDFTransformerSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/UDFTransformerSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/UDFTransformerSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/stages/UnicodeNormalizeSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/UnicodeNormalizeSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/stages/UnicodeNormalizeSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/stages/UnicodeNormalizeSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/train/VerifyComputeModelStatistics.scala b/core/src/test/scala/com/microsoft/ml/spark/train/VerifyComputeModelStatistics.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/train/VerifyComputeModelStatistics.scala
rename to core/src/test/scala/com/microsoft/ml/spark/train/VerifyComputeModelStatistics.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/train/VerifyComputePerInstanceStatistics.scala b/core/src/test/scala/com/microsoft/ml/spark/train/VerifyComputePerInstanceStatistics.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/train/VerifyComputePerInstanceStatistics.scala
rename to core/src/test/scala/com/microsoft/ml/spark/train/VerifyComputePerInstanceStatistics.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainClassifier.scala b/core/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainClassifier.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/train/VerifyTrainClassifier.scala
rename to core/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainClassifier.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainRegressor.scala b/core/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainRegressor.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/train/VerifyTrainRegressor.scala
rename to core/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainRegressor.scala
diff --git a/src/main/R/model_downloader.R b/deep-learning/src/main/R/model_downloader.R
similarity index 100%
rename from src/main/R/model_downloader.R
rename to deep-learning/src/main/R/model_downloader.R
diff --git a/src/main/python/mmlspark/cntk/CNTKModel.py b/deep-learning/src/main/python/mmlspark/cntk/CNTKModel.py
similarity index 100%
rename from src/main/python/mmlspark/cntk/CNTKModel.py
rename to deep-learning/src/main/python/mmlspark/cntk/CNTKModel.py
diff --git a/src/test/python/mmlsparktest/__init__.py b/deep-learning/src/main/python/mmlspark/cntk/__init__.py
similarity index 100%
rename from src/test/python/mmlsparktest/__init__.py
rename to deep-learning/src/main/python/mmlspark/cntk/__init__.py
diff --git a/src/main/python/mmlspark/image/ImageFeaturizer.py b/deep-learning/src/main/python/mmlspark/image/ImageFeaturizer.py
similarity index 100%
rename from src/main/python/mmlspark/image/ImageFeaturizer.py
rename to deep-learning/src/main/python/mmlspark/image/ImageFeaturizer.py
diff --git a/src/test/python/mmlsparktest/cognitive/__init__.py b/deep-learning/src/main/python/mmlspark/image/__init__.py
similarity index 100%
rename from src/test/python/mmlsparktest/cognitive/__init__.py
rename to deep-learning/src/main/python/mmlspark/image/__init__.py
diff --git a/src/main/scala/com/microsoft/CNTK/SerializableFunction.scala b/deep-learning/src/main/scala/com/microsoft/CNTK/SerializableFunction.scala
similarity index 100%
rename from src/main/scala/com/microsoft/CNTK/SerializableFunction.scala
rename to deep-learning/src/main/scala/com/microsoft/CNTK/SerializableFunction.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/cntk/CNTKFunctionParam.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/CNTKFunctionParam.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/cntk/CNTKFunctionParam.scala
rename to deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/CNTKFunctionParam.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/cntk/CNTKModel.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/CNTKModel.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/cntk/CNTKModel.scala
rename to deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/CNTKModel.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/cntk/ConversionUtils.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/ConversionUtils.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/cntk/ConversionUtils.scala
rename to deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/ConversionUtils.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/cntk/_CNTKModel.txt b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/_CNTKModel.txt
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/cntk/_CNTKModel.txt
rename to deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/_CNTKModel.txt
diff --git a/src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala
similarity index 91%
rename from src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala
rename to deep-learning/src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala
index 3b68d0ee507..54f890242b4 100644
--- a/src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala
+++ b/deep-learning/src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala
@@ -7,6 +7,7 @@ import java.io._
import java.net.{URI, URL}
import java.util
+import com.microsoft.ml.spark.core.utils.FaultToleranceUtils
import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.{Configuration => HadoopConf}
import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path}
@@ -15,10 +16,8 @@ import org.apache.log4j.LogManager
import org.apache.spark.sql.SparkSession
import spray.json._
-import scala.annotation.tailrec
import scala.collection.JavaConverters._
-import scala.concurrent.duration.{Duration, FiniteDuration}
-import scala.concurrent.{Await, ExecutionContext, Future}
+import scala.concurrent.duration.Duration
/** Abstract representation of a repository for future expansion
*
@@ -34,32 +33,6 @@ private[spark] abstract class Repository[S <: Schema] {
}
-object FaultToleranceUtils {
- def retryWithTimeout[T](times: Int, timeout: Duration)(f: => T): T ={
- try {
- Await.result(Future(f)(ExecutionContext.global), timeout)
- } catch {
- case e: Exception if times >= 1 =>
- print(s"Received exception on call, retrying: $e")
- retryWithTimeout(times-1, timeout)(f)
- }
- }
-
- val Backoffs: Seq[Int] = Seq(0, 100, 200, 500)
-
- def retryWithTimeout[T](times: Seq[Int] = Backoffs)(f: => T): T ={
- try {
- f
- } catch {
- case e: Exception if times.nonEmpty =>
- println(s"Received exception on call, retrying: $e")
- Thread.sleep(times.head)
- retryWithTimeout(times.tail)(f)
- }
- }
-
-}
-
/** Exception returned if a repo cannot find the file
*
* @param uri : location of the file
diff --git a/src/main/scala/com/microsoft/ml/spark/downloader/Schema.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/downloader/Schema.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/downloader/Schema.scala
rename to deep-learning/src/main/scala/com/microsoft/ml/spark/downloader/Schema.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.scala
similarity index 99%
rename from src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.scala
rename to deep-learning/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.scala
index 2db42e83b0c..e7dc8c7b4a1 100644
--- a/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.scala
+++ b/deep-learning/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.scala
@@ -132,7 +132,7 @@ class ImageFeaturizer(val uid: String) extends Transformer with HasInputCol with
/** @group getParam */
def getLayerNames: Array[String] = $(layerNames)
- setDefault(cutOutputLayers -> 1, outputCol -> (uid + "_output"), dropNa->true)
+ setDefault(cutOutputLayers -> 1, outputCol -> (uid + "_output"), dropNa -> true)
override def transform(dataset: Dataset[_]): DataFrame = {
logTransform[DataFrame]({
@@ -194,4 +194,4 @@ class ImageFeaturizer(val uid: String) extends Transformer with HasInputCol with
schema.add(getOutputCol, VectorType)
}
-}
+}
\ No newline at end of file
diff --git a/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.txt b/deep-learning/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.txt
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.txt
rename to deep-learning/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.txt
diff --git a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala
similarity index 97%
rename from src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala
rename to deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala
index 37b4b1ad615..f8483945360 100644
--- a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala
+++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala
@@ -9,11 +9,12 @@ import com.microsoft.CNTK.CNTKExtensions._
import com.microsoft.CNTK.{SerializableFunction => CNTKFunction, _}
import com.microsoft.ml.spark.core.env.StreamUtilities._
import com.microsoft.ml.spark.core.test.base.LinuxOnly
+import com.microsoft.ml.spark.image.ImageTestUtils
import org.apache.commons.io.IOUtils
import scala.collection.JavaConverters._
-class CNTKBindingSuite extends LinuxOnly with CNTKTestUtils {
+class CNTKBindingSuite extends LinuxOnly with ImageTestUtils {
def toSeqSeq(fvv: FloatVectorVector): Seq[Seq[Float]] = {
(0 until fvv.size.toInt).map(i =>
diff --git a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala
similarity index 97%
rename from src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala
rename to deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala
index 34893a7015c..8d2285be0ad 100644
--- a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala
+++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala
@@ -10,6 +10,7 @@ import com.microsoft.ml.spark.build.BuildInfo
import com.microsoft.ml.spark.core.env.FileUtilities
import com.microsoft.ml.spark.core.test.base.LinuxOnly
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
+import com.microsoft.ml.spark.image.ImageTestUtils
import org.apache.commons.io.FileUtils
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.linalg.DenseVector
@@ -21,7 +22,7 @@ import org.apache.spark.sql.types._
import scala.util.Random
-class CNTKModelSuite extends LinuxOnly with CNTKTestUtils with TransformerFuzzing[CNTKModel] {
+class CNTKModelSuite extends LinuxOnly with ImageTestUtils with TransformerFuzzing[CNTKModel] {
// TODO: Move away from getTempDirectoryPath and have TestBase provide one
@@ -54,7 +55,7 @@ class CNTKModelSuite extends LinuxOnly with CNTKTestUtils with TransformerFuzzin
.setOutputNodeIndex(0)
}
- lazy val images = testImages(spark)
+ override lazy val images = testImages(spark)
import spark.implicits._
diff --git a/src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala
similarity index 97%
rename from src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala
rename to deep-learning/src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala
index ee6d53933a0..f67e4b82d5c 100644
--- a/src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala
+++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala
@@ -7,6 +7,7 @@ import java.io.File
import java.nio.file.Files
import com.microsoft.ml.spark.core.test.base.TestBase
+import com.microsoft.ml.spark.core.utils.FaultToleranceUtils
import org.apache.commons.io.FileUtils
import scala.collection.JavaConverters._
diff --git a/src/test/scala/com/microsoft/ml/spark/image/ImageFeaturizerSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/image/ImageFeaturizerSuite.scala
similarity index 81%
rename from src/test/scala/com/microsoft/ml/spark/image/ImageFeaturizerSuite.scala
rename to deep-learning/src/test/scala/com/microsoft/ml/spark/image/ImageFeaturizerSuite.scala
index 247c7a421e1..6733d1fa674 100644
--- a/src/test/scala/com/microsoft/ml/spark/image/ImageFeaturizerSuite.scala
+++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/image/ImageFeaturizerSuite.scala
@@ -8,24 +8,20 @@ import java.net.{URI, URL}
import com.microsoft.ml.spark.Secrets
import com.microsoft.ml.spark.build.BuildInfo
-import com.microsoft.ml.spark.cntk.CNTKTestUtils
import com.microsoft.ml.spark.core.env.FileUtilities
-import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
-import com.microsoft.ml.spark.core.utils.ModelEquality
import com.microsoft.ml.spark.downloader.{ModelDownloader, ModelSchema}
import com.microsoft.ml.spark.io.IOImplicits._
import com.microsoft.ml.spark.io.powerbi.PowerBIWriter
import com.microsoft.ml.spark.io.split1.FileReaderUtils
-import org.apache.commons.io.FileUtils
import org.apache.spark.injections.UDFUtils
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.DataFrame
-import org.apache.spark.sql.functions.{col, udf}
+import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.StringType
-trait NetworkUtils extends CNTKTestUtils with FileReaderUtils {
+trait TrainedCNTKModelUtils extends ImageTestUtils with FileReaderUtils {
lazy val modelDir = new File(filesRoot, "CNTKModel")
lazy val modelDownloader = new ModelDownloader(spark, modelDir.toURI)
@@ -33,33 +29,6 @@ trait NetworkUtils extends CNTKTestUtils with FileReaderUtils {
lazy val resNetUri: URI = new File(modelDir, "ResNet50_ImageNet.model").toURI
lazy val resNet: ModelSchema = modelDownloader.downloadByName("ResNet50")
- lazy val images: DataFrame = spark.read.image.load(imagePath)
- .withColumnRenamed("image", inputCol)
- lazy val binaryImages: DataFrame = spark.read.binary.load(imagePath)
- .select(col("value.bytes").alias(inputCol))
-
- lazy val groceriesPath = FileUtilities.join(BuildInfo.datasetDir, "Images","Grocery")
- lazy val groceryImages: DataFrame = spark.read.image
- .option("dropInvalid", true)
- .load(groceriesPath + "**")
- .withColumnRenamed("image", inputCol)
-
- lazy val greyscaleImageLocation: String = {
- val loc = "/tmp/greyscale.jpg"
- val f = new File(loc)
- if (f.exists()) {f.delete()}
- FileUtils.copyURLToFile(new URL("https://mmlspark.blob.core.windows.net/datasets/LIME/greyscale.jpg"), f)
- loc
- }
-
- lazy val greyscaleImage: DataFrame = spark
- .read.image.load(greyscaleImageLocation)
- .select(col("image").alias(inputCol))
-
- lazy val greyscaleBinary: DataFrame = spark
- .read.binary.load(greyscaleImageLocation)
- .select(col("value.bytes").alias(inputCol))
-
def resNetModel(): ImageFeaturizer = new ImageFeaturizer()
.setInputCol(inputCol)
.setOutputCol(outputCol)
@@ -68,7 +37,7 @@ trait NetworkUtils extends CNTKTestUtils with FileReaderUtils {
}
class ImageFeaturizerSuite extends TransformerFuzzing[ImageFeaturizer]
- with NetworkUtils {
+ with TrainedCNTKModelUtils {
test("Image featurizer should reproduce the CIFAR10 experiment") {
print(spark)
diff --git a/src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/lime/ImageLIMESuite.scala
similarity index 65%
rename from src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala
rename to deep-learning/src/test/scala/com/microsoft/ml/spark/lime/ImageLIMESuite.scala
index e83f910e377..892ba9823d8 100644
--- a/src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala
+++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/lime/ImageLIMESuite.scala
@@ -7,82 +7,23 @@ import java.awt.image.BufferedImage
import java.io.File
import java.net.URL
-import breeze.linalg.{*, DenseMatrix}
-import breeze.stats.distributions.Rand
-import com.microsoft.ml.spark.core.test.base.TestBase
-import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject, TransformerFuzzing}
-import com.microsoft.ml.spark.image.{ImageFeaturizer, NetworkUtils}
+import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
+import com.microsoft.ml.spark.image.{ImageFeaturizer, TrainedCNTKModelUtils}
import com.microsoft.ml.spark.io.IOImplicits._
import com.microsoft.ml.spark.io.image.ImageUtils
import com.microsoft.ml.spark.io.split1.FileReaderUtils
import com.microsoft.ml.spark.stages.UDFTransformer
import com.microsoft.ml.spark.stages.udfs.get_value_udf
import org.apache.commons.io.FileUtils
-import org.apache.spark.injections.UDFUtils
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.param.DataFrameEquality
-import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.ml.{NamespaceInjections, PipelineModel}
import org.apache.spark.sql.functions.col
-import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.{DataFrame, Row}
-trait LimeTestBase extends TestBase {
-
- import spark.implicits._
-
- lazy val nRows = 100
- lazy val d1 = 3
- lazy val d2 = 1
-
- lazy val m: DenseMatrix[Double] = new DenseMatrix(d1, d2, Array(1.0, -1.0, 2.0))
- lazy val x: DenseMatrix[Double] = DenseMatrix.rand(nRows, d1, Rand.gaussian)
- lazy val noise: DenseMatrix[Double] = DenseMatrix.rand(nRows, d2, Rand.gaussian) * 0.1
- lazy val y = x * m //+ noise
-
- lazy val xRows = x(*, ::).iterator.toSeq.map(dv => new DenseVector(dv.toArray))
- lazy val yRows = y(*, ::).iterator.toSeq.map(dv => dv(0))
- lazy val df = xRows.zip(yRows).toDF("features", "label")
-
- lazy val model = new LinearRegression().fit(df)
-
- lazy val lime = new TabularLIME()
- .setModel(model)
- .setInputCol("features")
- .setPredictionCol(model.getPredictionCol)
- .setOutputCol("out")
- .setNSamples(1000)
-
- lazy val limeModel = lime.fit(df)
-}
-
-class TabularLIMESuite extends EstimatorFuzzing[TabularLIME] with
- DataFrameEquality with LimeTestBase {
-
- test("text lime usage test check") {
- val results = limeModel.transform(df).select("out")
- .collect().map(_.getAs[DenseVector](0))
- results.foreach(result => assert(result === new DenseVector(m.data)))
- }
-
- override def testObjects(): Seq[TestObject[TabularLIME]] = Seq(new TestObject(lime, df))
-
- override def reader: MLReadable[_] = TabularLIME
-
- override def modelReader: MLReadable[_] = TabularLIMEModel
-}
-
-class TabularLIMEModelSuite extends TransformerFuzzing[TabularLIMEModel] with
- DataFrameEquality with LimeTestBase {
-
- override def testObjects(): Seq[TestObject[TabularLIMEModel]] = Seq(new TestObject(limeModel, df))
-
- override def reader: MLReadable[_] = TabularLIMEModel
-}
-
class ImageLIMESuite extends TransformerFuzzing[ImageLIME] with
- DataFrameEquality with NetworkUtils with FileReaderUtils {
+ DataFrameEquality with TrainedCNTKModelUtils with FileReaderUtils {
lazy val greyhoundImageLocation: String = {
val loc = "/tmp/greyhound.jpg"
diff --git a/src/main/python/mmlspark/lightgbm/LightGBMClassificationModel.py b/lightgbm/src/main/python/mmlspark/lightgbm/LightGBMClassificationModel.py
similarity index 100%
rename from src/main/python/mmlspark/lightgbm/LightGBMClassificationModel.py
rename to lightgbm/src/main/python/mmlspark/lightgbm/LightGBMClassificationModel.py
diff --git a/src/main/python/mmlspark/lightgbm/LightGBMRankerModel.py b/lightgbm/src/main/python/mmlspark/lightgbm/LightGBMRankerModel.py
similarity index 100%
rename from src/main/python/mmlspark/lightgbm/LightGBMRankerModel.py
rename to lightgbm/src/main/python/mmlspark/lightgbm/LightGBMRankerModel.py
diff --git a/src/main/python/mmlspark/lightgbm/LightGBMRegressionModel.py b/lightgbm/src/main/python/mmlspark/lightgbm/LightGBMRegressionModel.py
similarity index 100%
rename from src/main/python/mmlspark/lightgbm/LightGBMRegressionModel.py
rename to lightgbm/src/main/python/mmlspark/lightgbm/LightGBMRegressionModel.py
diff --git a/src/test/python/mmlsparktest/cyber/__init__.py b/lightgbm/src/main/python/mmlspark/lightgbm/__init__.py
similarity index 100%
rename from src/test/python/mmlsparktest/cyber/__init__.py
rename to lightgbm/src/main/python/mmlspark/lightgbm/__init__.py
diff --git a/src/main/python/mmlspark/lightgbm/mixin.py b/lightgbm/src/main/python/mmlspark/lightgbm/mixin.py
similarity index 100%
rename from src/main/python/mmlspark/lightgbm/mixin.py
rename to lightgbm/src/main/python/mmlspark/lightgbm/mixin.py
diff --git a/src/main/scala/com/microsoft/lightgbm/SWIG.scala b/lightgbm/src/main/scala/com/microsoft/lightgbm/SWIG.scala
similarity index 100%
rename from src/main/scala/com/microsoft/lightgbm/SWIG.scala
rename to lightgbm/src/main/scala/com/microsoft/lightgbm/SWIG.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMBase.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMBase.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMBase.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMBase.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.txt b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.txt
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.txt
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.txt
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMConstants.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMConstants.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMConstants.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMConstants.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMDelegate.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMDelegate.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMDelegate.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMDelegate.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMModelMethods.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMModelMethods.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMModelMethods.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMModelMethods.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.txt b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.txt
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.txt
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.txt
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.txt b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.txt
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.txt
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.txt
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMUtils.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMUtils.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMUtils.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMUtils.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala
similarity index 99%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala
index 6fc82765b8e..ccecaae33ec 100644
--- a/src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala
+++ b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala
@@ -8,7 +8,7 @@ import java.net._
import com.microsoft.ml.lightgbm._
import com.microsoft.ml.spark.core.env.StreamUtilities._
-import com.microsoft.ml.spark.downloader.FaultToleranceUtils
+import com.microsoft.ml.spark.core.utils.FaultToleranceUtils
import com.microsoft.ml.spark.lightgbm.booster.LightGBMBooster
import com.microsoft.ml.spark.lightgbm.dataset.LightGBMDataset
import com.microsoft.ml.spark.lightgbm.params.{ClassifierTrainParams, TrainParams}
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/booster/LightGBMBooster.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/booster/LightGBMBooster.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/booster/LightGBMBooster.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/booster/LightGBMBooster.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/LightGBMDataset.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/LightGBMDataset.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/LightGBMDataset.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/LightGBMDataset.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjParam.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjParam.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjParam.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjParam.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjTrait.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjTrait.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjTrait.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjTrait.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMBoosterParam.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMBoosterParam.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMBoosterParam.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMBoosterParam.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMParams.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMParams.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMParams.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMParams.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/params/TrainParams.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/TrainParams.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/params/TrainParams.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/TrainParams.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/swig/SwigUtils.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/swig/SwigUtils.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/lightgbm/swig/SwigUtils.scala
rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/swig/SwigUtils.scala
diff --git a/src/test/resources/benchmarks/benchmarks_VerifyLightGBMClassifier.csv b/lightgbm/src/test/resources/benchmarks/benchmarks_VerifyLightGBMClassifier.csv
similarity index 100%
rename from src/test/resources/benchmarks/benchmarks_VerifyLightGBMClassifier.csv
rename to lightgbm/src/test/resources/benchmarks/benchmarks_VerifyLightGBMClassifier.csv
diff --git a/src/test/resources/benchmarks/benchmarks_VerifyLightGBMRegressor.csv b/lightgbm/src/test/resources/benchmarks/benchmarks_VerifyLightGBMRegressor.csv
similarity index 100%
rename from src/test/resources/benchmarks/benchmarks_VerifyLightGBMRegressor.csv
rename to lightgbm/src/test/resources/benchmarks/benchmarks_VerifyLightGBMRegressor.csv
diff --git a/src/test/scala/com/microsoft/ml/spark/lightgbm/split1/VerifyLightGBMClassifier.scala b/lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split1/VerifyLightGBMClassifier.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/lightgbm/split1/VerifyLightGBMClassifier.scala
rename to lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split1/VerifyLightGBMClassifier.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRanker.scala b/lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRanker.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRanker.scala
rename to lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRanker.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRegressor.scala b/lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRegressor.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRegressor.scala
rename to lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRegressor.scala
diff --git a/src/main/python/mmlspark/opencv/ImageTransformer.py b/opencv/src/main/python/mmlspark/opencv/ImageTransformer.py
similarity index 100%
rename from src/main/python/mmlspark/opencv/ImageTransformer.py
rename to opencv/src/main/python/mmlspark/opencv/ImageTransformer.py
diff --git a/src/test/python/mmlsparktest/cyber/anamoly/__init__.py b/opencv/src/main/python/mmlspark/opencv/__init__.py
similarity index 100%
rename from src/test/python/mmlsparktest/cyber/anamoly/__init__.py
rename to opencv/src/main/python/mmlspark/opencv/__init__.py
diff --git a/src/main/scala/com/microsoft/ml/spark/image/ImageSetAugmenter.scala b/opencv/src/main/scala/com/microsoft/ml/spark/image/ImageSetAugmenter.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/image/ImageSetAugmenter.scala
rename to opencv/src/main/scala/com/microsoft/ml/spark/image/ImageSetAugmenter.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/opencv/ImageTransformer.scala b/opencv/src/main/scala/com/microsoft/ml/spark/opencv/ImageTransformer.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/opencv/ImageTransformer.scala
rename to opencv/src/main/scala/com/microsoft/ml/spark/opencv/ImageTransformer.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/opencv/OpenCVUtils.scala b/opencv/src/main/scala/com/microsoft/ml/spark/opencv/OpenCVUtils.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/opencv/OpenCVUtils.scala
rename to opencv/src/main/scala/com/microsoft/ml/spark/opencv/OpenCVUtils.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/image/ImageSetAugmenterSuite.scala b/opencv/src/test/scala/com/microsoft/ml/spark/image/ImageSetAugmenterSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/image/ImageSetAugmenterSuite.scala
rename to opencv/src/test/scala/com/microsoft/ml/spark/image/ImageSetAugmenterSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala b/opencv/src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala
similarity index 97%
rename from src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala
rename to opencv/src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala
index 5d05a243ccf..b20b309bb05 100644
--- a/src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala
+++ b/opencv/src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala
@@ -8,15 +8,15 @@ import java.net.URL
import com.microsoft.ml.spark.core.env.FileUtilities
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
-import com.microsoft.ml.spark.opencv.{ImageTestUtils, ImageTransformer}
+import com.microsoft.ml.spark.io.IOImplicits._
+import com.microsoft.ml.spark.opencv.{ImageTransformer, OpenCVTestUtils}
+import org.apache.commons.io.FileUtils
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.{DataFrame, Row}
-import com.microsoft.ml.spark.io.IOImplicits._
-import org.apache.commons.io.FileUtils
class ResizeImageTransformerSuite extends TransformerFuzzing[ResizeImageTransformer]
- with ImageTestUtils {
+ with OpenCVTestUtils {
lazy val images: DataFrame = spark.read.image
.option("dropInvalid", true).load(FileUtilities.join(fileLocation, "**").toString)
diff --git a/src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala b/opencv/src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala
similarity index 98%
rename from src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala
rename to opencv/src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala
index 6c7ab6dfe53..62a43aa5e93 100644
--- a/src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala
+++ b/opencv/src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala
@@ -23,7 +23,7 @@ import org.opencv.imgproc.Imgproc
import org.scalactic.Equality
import org.scalatest.Assertion
-trait ImageTestUtils {
+trait OpenCVTestUtils {
lazy protected val fileLocation = FileUtilities.join(BuildInfo.datasetDir, "Images", "Grocery")
protected def selectTestImageBytes(images: DataFrame): Array[Byte] = {
@@ -81,7 +81,7 @@ trait ImageTestUtils {
}
-class UnrollImageSuite extends TransformerFuzzing[UnrollImage] with ImageTestUtils with DataFrameEquality {
+class UnrollImageSuite extends TransformerFuzzing[UnrollImage] with OpenCVTestUtils with DataFrameEquality {
lazy val filesRoot = BuildInfo.datasetDir
lazy val imagePath = FileUtilities.join(filesRoot,"Images", "CIFAR").toString
@@ -128,7 +128,7 @@ class UnrollImageSuite extends TransformerFuzzing[UnrollImage] with ImageTestUti
}
class UnrollBinaryImageSuite extends TransformerFuzzing[UnrollBinaryImage]
- with ImageTestUtils with DataFrameEquality {
+ with OpenCVTestUtils with DataFrameEquality {
lazy val filesRoot = BuildInfo.datasetDir
lazy val imagePath = FileUtilities.join(filesRoot, "Images", "CIFAR").toString
@@ -163,7 +163,7 @@ class UnrollBinaryImageSuite extends TransformerFuzzing[UnrollBinaryImage]
override def reader: UnrollBinaryImage.type = UnrollBinaryImage
}
-class ImageTransformerSuite extends TransformerFuzzing[ImageTransformer] with ImageTestUtils {
+class ImageTransformerSuite extends TransformerFuzzing[ImageTransformer] with OpenCVTestUtils {
//TODO this is needed to stop the build from freezing
override def assertDFEq(df1: DataFrame, df2: DataFrame)(implicit eq: Equality[DataFrame]): Unit = {
diff --git a/project/BlobMavenPlugin.scala b/project/BlobMavenPlugin.scala
new file mode 100644
index 00000000000..de8114172e0
--- /dev/null
+++ b/project/BlobMavenPlugin.scala
@@ -0,0 +1,48 @@
+import java.io.File
+
+import BlobMavenPlugin.autoImport.publishBlob
+import BuildUtils.{join, uploadToBlob}
+import sbt._
+import Keys._
+import org.apache.ivy.core.IvyPatternHelper
+
+//noinspection ScalaStyle
+object BlobMavenPlugin extends AutoPlugin {
+ override def trigger = allRequirements
+
+ object autoImport {
+ val publishBlob = TaskKey[Unit]("publishBlob", "publish the library to mmlspark blob")
+ val blobArtifactInfo = SettingKey[String]("blobArtifactInfo")
+ }
+
+ import autoImport._
+
+ override def requires: Plugins = sbt.Plugins.empty
+
+ override lazy val projectSettings: Seq[Setting[_]] = Seq(
+ publishBlob := {
+ publishM2.value
+ //TODO make this more general - "1.0" is a hack, and I'm not sure of a way to get this with sbt keys
+ val sourceArtifactName = s"${moduleName.value}_${scalaBinaryVersion.value}_1.0"
+ val destArtifactName = s"${moduleName.value}"
+ val repositoryDir = new File(new URI(Resolver.mavenLocal.root))
+ val orgDirs = organization.value.split(".".toCharArray.head)
+ val localPackageFolder = join(repositoryDir, orgDirs ++ Seq(sourceArtifactName, version.value):_*).toString
+ val blobMavenFolder = (orgDirs ++ Seq(destArtifactName, version.value)).mkString("/")
+ uploadToBlob(localPackageFolder, blobMavenFolder, "maven")
+ println(blobArtifactInfo.value)
+ },
+ blobArtifactInfo := {
+ s"""
+ |MMLSpark Build and Release Information
+ |---------------
+ |
+ |### Maven Coordinates
+ | `${organization.value}:${moduleName.value}:${version.value}`
+ |
+ |### Maven Resolver
+ | `https://mmlspark.azureedge.net/maven`
+ |""".stripMargin
+ }
+ )
+}
\ No newline at end of file
diff --git a/project/CodegenPlugin.scala b/project/CodegenPlugin.scala
new file mode 100644
index 00000000000..0f0270dd150
--- /dev/null
+++ b/project/CodegenPlugin.scala
@@ -0,0 +1,192 @@
+import java.io.File
+
+import BuildUtils.{join, runCmd, singleUploadToBlob, uploadToBlob, zipFolder}
+import CondaPlugin.autoImport.{activateCondaEnv, condaEnvLocation, createCondaEnvTask}
+import org.apache.commons.io.FileUtils
+import sbt.Keys._
+import sbt._
+import sbtbuildinfo.BuildInfoPlugin
+import sbtunidoc.ScalaUnidocPlugin
+
+//noinspection ScalaStyle
+object CodegenPlugin extends AutoPlugin {
+ override def trigger = allRequirements
+
+ override def requires: Plugins = CondaPlugin && BuildInfoPlugin
+
+ def rCmd(activateCondaEnv: Seq[String], cmd: Seq[String], wd: File, libPath: String): Unit = {
+ runCmd(activateCondaEnv ++ cmd, wd, Map("R_LIBS" -> libPath, "R_USER_LIBS" -> libPath))
+ }
+
+ object autoImport {
+ val pythonizedVersion = settingKey[String]("Pythonized version")
+ val rVersion = settingKey[String]("R version")
+ val genPackageNamespace = settingKey[String]("genPackageNamespace")
+ val genTestPackageNamespace = settingKey[String]("genTestPackageNamespace")
+ val genJarName = settingKey[Option[String]]("genJarName")
+
+ val targetDir = settingKey[File]("targetDir")
+ val codegenDir = settingKey[File]("codegenDir")
+
+ val codegen = TaskKey[Unit]("codegen", "Generate Code")
+ val testgen = TaskKey[Unit]("testgen", "Generate Tests")
+
+ val packageR = TaskKey[Unit]("packageR", "Generate roxygen docs and zip R package")
+ val publishR = TaskKey[Unit]("publishR", "publish R package to blob")
+ val testR = TaskKey[Unit]("testR", "Run testthat on R tests")
+
+ val packagePython = TaskKey[Unit]("packagePython", "Package python sdk")
+ val installPipPackage = TaskKey[Unit]("installPipPackage", "install python sdk")
+ val generatePythonDoc = TaskKey[Unit]("generatePythonDoc", "Generate sphinx docs for python")
+ val publishPython = TaskKey[Unit]("publishPython", "publish python wheel")
+ val testPython = TaskKey[Unit]("testPython", "test python sdk")
+ }
+
+ import autoImport._
+ import sbtbuildinfo.BuildInfoPlugin.autoImport._
+
+ override lazy val projectSettings: Seq[Setting[_]] = Seq(
+ buildInfoKeys ++= Seq[BuildInfoKey](
+ pythonizedVersion,
+ rVersion,
+ genPackageNamespace,
+ genTestPackageNamespace,
+ targetDir,
+ codegenDir,
+ name,
+ version,
+ scalaVersion,
+ sbtVersion,
+ baseDirectory
+ ),
+ genJarName := {
+ Some(artifactName.value(
+ ScalaVersion(scalaVersion.value, scalaBinaryVersion.value),
+ projectID.value,
+ artifact.value))
+ },
+ codegen := (Def.taskDyn {
+ (Compile / compile).value
+ (Test / compile).value
+ val arg = genJarName.value.map(s => " " + s).getOrElse("")
+ Def.task {
+ (Test / runMain).toTask(s" com.microsoft.ml.spark.codegen.CodeGen$arg").value
+ }
+ }.value),
+ testgen := (Def.taskDyn {
+ (Compile / compile).value
+ (Test / compile).value
+ val arg = genJarName.value.map(s => " " + s).getOrElse("")
+ Def.task {
+ (Test / runMain).toTask(s" com.microsoft.ml.spark.codegen.TestGen$arg").value
+ }
+ }.value),
+ pythonizedVersion := {
+ if (version.value.contains("-")) {
+ version.value.split("-".head).head + ".dev1"
+ } else {
+ version.value
+ }
+ },
+ rVersion := {
+ if (version.value.contains("-")) {
+ version.value.split("-".head).head
+ } else {
+ version.value
+ }
+ },
+ packageR := {
+ createCondaEnvTask.value
+ codegen.value
+ val rSrcDir = join(codegenDir.value, "src", "R", genPackageNamespace.value)
+ val rPackageDir = join(codegenDir.value, "package", "R")
+ val libPath = join(condaEnvLocation.value, "Lib", "R", "library").toString
+ rCmd(activateCondaEnv.value, Seq("R", "-q", "-e", "roxygen2::roxygenise()"), rSrcDir, libPath)
+ rPackageDir.mkdirs()
+ zipFolder(rSrcDir, new File(rPackageDir, s"${name.value}-${version.value}.zip"))
+ },
+ testR := {
+ packageR.value
+ publishLocal.value
+ val libPath = join(condaEnvLocation.value, "Lib", "R", "library").toString
+ val rSrcDir = join(codegenDir.value, "src", "R", genPackageNamespace.value)
+ rCmd(activateCondaEnv.value,
+ Seq("R", "CMD", "INSTALL", "--no-multiarch", "--with-keep.source", genPackageNamespace.value),
+ rSrcDir.getParentFile, libPath)
+ val testRunner = join("tools", "tests", "run_r_tests.R").getAbsolutePath
+ rCmd(activateCondaEnv.value,
+ Seq("Rscript", testRunner), rSrcDir, libPath)
+ },
+ publishR := {
+ codegen.value
+ packageR.value
+ val rPackageDir = join(codegenDir.value, "package", "R")
+ val rPackage = rPackageDir.listFiles().head
+ singleUploadToBlob(rPackage.toString, rPackage.getName, "rrr")
+ },
+ packagePython := {
+ codegen.value
+ createCondaEnvTask.value
+ val destPyDir = join(targetDir.value, "classes", genPackageNamespace.value)
+ val packageDir = join(codegenDir.value, "package", "python").absolutePath
+ val pythonSrcDir = join(codegenDir.value, "src", "python")
+ if (destPyDir.exists()) FileUtils.forceDelete(destPyDir)
+ val sourcePyDir = join(pythonSrcDir.getAbsolutePath, genPackageNamespace.value)
+ FileUtils.copyDirectory(sourcePyDir, destPyDir)
+ runCmd(
+ activateCondaEnv.value ++
+ Seq(s"python", "setup.py", "bdist_wheel", "--universal", "-d", packageDir),
+ pythonSrcDir)
+ },
+ installPipPackage := {
+ packagePython.value
+ publishLocal.value
+ runCmd(
+ activateCondaEnv.value ++ Seq("pip", "install", "-I",
+ s"${name.value.replace("-","_")}-${pythonizedVersion.value}-py2.py3-none-any.whl"),
+ join(codegenDir.value, "package", "python"))
+ },
+ generatePythonDoc := {
+ installPipPackage.value
+ val dir = join(codegenDir.value, "src", "python", genPackageNamespace.value)
+ runCmd(activateCondaEnv.value ++ Seq("sphinx-apidoc", "-f", "-o", "doc", "."), dir)
+ runCmd(activateCondaEnv.value ++ Seq("sphinx-build", "-b", "html", "doc", "../../../doc/pyspark"), dir)
+ },
+ publishPython := {
+ publishLocal.value
+ packagePython.value
+ val fn = s"${name.value.replace("-","_")}-${pythonizedVersion.value}-py2.py3-none-any.whl"
+ singleUploadToBlob(
+ join(codegenDir.value, "package", "python", fn).toString,
+ version.value + "/" + fn, "pip")
+ },
+ testPython := {
+ installPipPackage.value
+ testgen.value
+ runCmd(
+ activateCondaEnv.value ++ Seq("python",
+ "-m",
+ "pytest",
+ s"--cov=${genPackageNamespace.value}",
+ "--junitxml=../../../../python-test-results.xml",
+ "--cov-report=xml",
+ genTestPackageNamespace.value
+ ),
+ new File(codegenDir.value, "test/python/")
+ )
+ },
+ targetDir := {
+ artifactPath.in(packageBin).in(Compile).value.getParentFile
+ },
+ codegenDir := {
+ join(targetDir.value, "generated")
+ },
+ genPackageNamespace := {
+ "mmlspark"
+ },
+ genTestPackageNamespace := {
+ "mmlspark-test"
+ }
+
+ )
+}
\ No newline at end of file
diff --git a/project/CondaPlugin.scala b/project/CondaPlugin.scala
new file mode 100644
index 00000000000..66cc7c10e4f
--- /dev/null
+++ b/project/CondaPlugin.scala
@@ -0,0 +1,56 @@
+import BuildUtils.{osPrefix, runCmd}
+import sbt._
+import Keys._
+
+import scala.sys.process.Process
+
+//noinspection ScalaStyle
+object CondaPlugin extends AutoPlugin {
+ override def trigger = allRequirements
+
+ object autoImport {
+ val condaEnvName = settingKey[String]("Name of conda environment")
+ val cleanCondaEnvTask = TaskKey[Unit]("cleanCondaEnv", "create conda env")
+ val condaEnvLocation = TaskKey[File]("condaEnvLocation", "get install location of conda env")
+ val createCondaEnvTask = TaskKey[Unit]("createCondaEnv", "create conda env")
+ val activateCondaEnv = settingKey[Seq[String]]("commands to activate conda environment")
+ }
+
+ import autoImport._
+ override lazy val globalSettings: Seq[Setting[_]] = Seq(
+ condaEnvName := "mmlspark",
+ cleanCondaEnvTask := {
+ runCmd(Seq("conda", "env", "remove", "--name", condaEnvName.value, "-y"))
+ },
+ condaEnvLocation := {
+ createCondaEnvTask.value
+ new File(Process("conda env list").lineStream.toList
+ .map(_.split("\\s+"))
+ .map(l => (l.head, l.reverse.head))
+ .filter(p => p._1 == condaEnvName.value)
+ .head._2)
+ },
+ createCondaEnvTask := {
+ val hasEnv = Process("conda env list").lineStream.toList
+ .map(_.split("\\s+").head).contains(condaEnvName.value)
+ if (!hasEnv) {
+ runCmd(Seq("conda", "env", "create", "-f", "environment.yaml"))
+ } else {
+ println("Found conda env " + condaEnvName)
+ }
+ },
+ activateCondaEnv := {
+ if (sys.props("os.name").toLowerCase.contains("windows")) {
+ osPrefix ++ Seq("activate", condaEnvName.value, "&&")
+ } else {
+ Seq()
      //TODO figure out why this doesn't work
+ //Seq("/bin/bash", "-l", "-c", "source activate " + condaEnvName, "&&")
+ }
+ }
+ )
+
+ override def requires: Plugins = sbt.Plugins.empty
+
+ override lazy val projectSettings: Seq[Setting[_]] = Seq()
+}
\ No newline at end of file
diff --git a/project/build.scala b/project/build.scala
index f7816cd5d48..06a930e33d1 100644
--- a/project/build.scala
+++ b/project/build.scala
@@ -2,8 +2,12 @@ import java.io.File
import java.lang.ProcessBuilder.Redirect
object BuildUtils {
+ def join(root: File, folders: String*): File = {
+ folders.foldLeft(root) { case (f, s) => new File(f, s) }
+ }
+
def join(folders: String*): File = {
- folders.tail.foldLeft(new File(folders.head)) { case (f, s) => new File(f, s) }
+ join(new File(folders.head), folders.tail: _*)
}
def isWindows: Boolean = {
@@ -27,7 +31,7 @@ object BuildUtils {
.redirectError(Redirect.INHERIT)
.redirectOutput(Redirect.INHERIT)
val env = pb.environment()
- envVars.foreach(p =>env.put(p._1,p._2))
+ envVars.foreach(p => env.put(p._1, p._2))
assert(pb.start().waitFor() == 0)
}
@@ -56,6 +60,7 @@ object BuildUtils {
"--account-key", Secrets.storageKey)
runCmd(osPrefix ++ command)
}
+
def singleUploadToBlob(source: String,
dest: String,
container: String,
@@ -76,6 +81,7 @@ object BuildUtils {
val (dirs, files) = dir.listFiles.sorted.partition(_.isDirectory)
(if (pred == null) files else files.filter(pred)) ++ dirs.flatMap(loop)
}
+
loop(dir)
}
@@ -91,7 +97,9 @@ object BuildUtils {
zip.putNextEntry(new ZipEntry(file.toString.substring(prefixLen).replace(java.io.File.separator, "/")))
val in = new BufferedInputStream(new FileInputStream(file), bufferSize)
var b = 0
- while (b >= 0) { zip.write(data, 0, b); b = in.read(data, 0, bufferSize) }
+ while (b >= 0) {
+ zip.write(data, 0, b); b = in.read(data, 0, bufferSize)
+ }
in.close()
zip.closeEntry()
}
diff --git a/project/plugins.sbt b/project/plugins.sbt
index cc082cf59b0..6f4bd427f23 100644
--- a/project/plugins.sbt
+++ b/project/plugins.sbt
@@ -4,4 +4,4 @@ addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.9.0")
addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.8")
addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.1")
addSbtPlugin("com.dwijnand" % "sbt-dynver" % "4.0.0")
-addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.6.0")
\ No newline at end of file
+addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.6.0")
diff --git a/src/main/python/setup.py b/src/main/python/setup.py
deleted file mode 100644
index 3ba8474be22..00000000000
--- a/src/main/python/setup.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Copyright (C) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License. See LICENSE in project root for information.
-
-import os
-from setuptools import setup, find_packages
-import codecs
-import os.path
-
-
-def read(rel_path):
- here = os.path.abspath(os.path.dirname(__file__))
- with codecs.open(os.path.join(here, rel_path), "r") as fp:
- return fp.read()
-
-
-def get_version(rel_path):
- for line in read(rel_path).splitlines():
- if line.startswith("__version__"):
- delim = '"' if '"' in line else "'"
- return line.split(delim)[1]
- return "0.0.0"
-
-
-setup(
- name="mmlspark",
- version=get_version("mmlspark/__init__.py"),
- description="Microsoft ML for Spark",
- long_description="Microsoft ML for Apache Spark contains Microsoft's open source "
- + "contributions to the Apache Spark ecosystem",
- license="MIT",
- packages=find_packages(),
- url="https://github.com/Azure/mmlspark",
- author="Microsoft",
- author_email="mmlspark-support@microsoft.com",
- classifiers=[
- "Development Status :: 3 - Alpha",
- "Intended Audience :: Developers",
- "Intended Audience :: Data Scientists",
- "Topic :: Software Development :: Datascience Tools",
- "License :: OSI Approved :: MIT License",
- "Programming Language :: Python :: 2",
- "Programming Language :: Python :: 3",
- ],
- zip_safe=True,
- package_data={"mmlspark": ["../LICENSE.txt", "../README.txt"]},
-)
diff --git a/src/test/python/mmlsparktest/nn/__init__.py b/src/test/python/mmlsparktest/nn/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/src/test/python/mmlsparktest/recommendation/__init__.py b/src/test/python/mmlsparktest/recommendation/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/src/test/python/mmlsparktest/vw/__init__.py b/src/test/python/mmlsparktest/vw/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKTestUtils.scala b/src/test/scala/com/microsoft/ml/spark/cntk/CNTKTestUtils.scala
deleted file mode 100644
index 4981013301c..00000000000
--- a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKTestUtils.scala
+++ /dev/null
@@ -1,80 +0,0 @@
-// Copyright (C) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License. See LICENSE in project root for information.
-
-package com.microsoft.ml.spark.cntk
-
-import java.io.File
-
-import com.microsoft.ml.spark.build.BuildInfo
-import com.microsoft.ml.spark.core.env.FileUtilities
-import com.microsoft.ml.spark.core.test.base.TestBase
-import com.microsoft.ml.spark.image.UnrollImage
-import org.apache.spark.ml.linalg.DenseVector
-import org.apache.spark.sql._
-import com.microsoft.ml.spark.io.IOImplicits._
-
-trait CNTKTestUtils extends TestBase {
-
- val filesRoot = BuildInfo.datasetDir.toString
- val imagePath = FileUtilities.join(filesRoot, "Images", "CIFAR").toString
- val modelPath = FileUtilities.join(filesRoot, "CNTKModel", "ConvNet_CIFAR10.model").toString
- val inputCol = "cntk_images"
- val outputCol = "out"
- val labelCol = "labels"
-
- val featureVectorLength = 3 * 32 * 32
- lazy val saveFile = new File(tmpDir.toFile, "spark-z.model").toString
-
- def testModelDF(spark: SparkSession): DataFrame = {
- import spark.implicits._
- spark.sparkContext.parallelize(Seq(
- Array(1.32165250, -2.1215112, 0.63150704, 0.77315974, -1.28163720,
- -0.20210080, -2.2839167, -2.08691480, 5.08418200, -1.33741090),
- Array(3.44079640, 1.4877119, -0.74059330, -0.34381202, -2.48724990,
- -2.62866950, -3.1693816, -3.14182600, 4.76314800, 0.68712880),
- Array(-1.88747900, -4.7685330, 0.15169683, 6.80547570, -0.38405967,
- 3.41065170, 1.3302778, -0.87714905, -2.18046050, -4.16661830),
- Array(5.01010300, 3.9860306, -1.36795600, -0.89830830, -4.49545430,
- -4.19537070, -4.4045380, -5.81759450, 6.93805700, 1.49001510),
- Array(-4.70754600, -6.0414960, 1.20658250, 5.40738300, 1.07661690,
- 4.71566440, 4.3834330, -1.57187440, -2.96569730, -5.43208270),
- Array(-1.23873880, -3.2042341, 2.54533000, 5.51954800, 2.89042470,
- 0.12380804, 3.8639085, -4.79466800, -2.41463420, -5.17418430))).toDF
- }
-
- def testImages(spark: SparkSession): DataFrame = {
- val images = spark.read.image.load(imagePath)
-
- val unroll = new UnrollImage().setInputCol("image").setOutputCol(inputCol)
-
- unroll.transform(images).select(inputCol)
- }
-
- def makeFakeData(spark: SparkSession, rows: Int, size: Int, outputDouble: Boolean = false): DataFrame = {
- import spark.implicits._
- if (outputDouble) {
- List
- .fill(rows)(List.fill(size)(0.0).toArray)
- .zip(List.fill(rows)(0.0))
- .toDF(inputCol, labelCol)
- } else {
- List
- .fill(rows)(List.fill(size)(0.0.toFloat).toArray)
- .zip(List.fill(rows)(0.0))
- .toDF(inputCol, labelCol)
- }
- }
-
- protected def compareToTestModel(result: DataFrame) = {
- //TODO improve checks
- assert(result.columns.toSet == Set(inputCol, outputCol))
- assert(result.count() == testModelDF(result.sparkSession).count())
- val max = result
- .select(outputCol)
- .collect()
- .map(row => row.getAs[DenseVector](0).toArray.max)
- .max
- assert(max < 10 & max > -10)
- }
-
-}
diff --git a/src/main/python/mmlspark/vw/VowpalWabbitClassificationModel.py b/vw/src/main/python/mmlspark/vw/VowpalWabbitClassificationModel.py
similarity index 100%
rename from src/main/python/mmlspark/vw/VowpalWabbitClassificationModel.py
rename to vw/src/main/python/mmlspark/vw/VowpalWabbitClassificationModel.py
diff --git a/src/main/python/mmlspark/vw/VowpalWabbitClassifier.py b/vw/src/main/python/mmlspark/vw/VowpalWabbitClassifier.py
similarity index 97%
rename from src/main/python/mmlspark/vw/VowpalWabbitClassifier.py
rename to vw/src/main/python/mmlspark/vw/VowpalWabbitClassifier.py
index ba9d72dc1ee..ac33082148c 100644
--- a/src/main/python/mmlspark/vw/VowpalWabbitClassifier.py
+++ b/vw/src/main/python/mmlspark/vw/VowpalWabbitClassifier.py
@@ -1,14 +1,14 @@
-# Copyright (C) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License. See LICENSE in project root for information.
-
-from mmlspark.vw._VowpalWabbitClassifier import _VowpalWabbitClassifier
-from pyspark.ml.common import inherit_doc
-
-@inherit_doc
-class VowpalWabbitClassifier(_VowpalWabbitClassifier):
-
- def setInitialModel(self, model):
- """
- Initialize the estimator with a previously trained model.
- """
- self._java_obj.setInitialModel(model._java_obj.getModel())
+# Copyright (C) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See LICENSE in project root for information.
+
+from mmlspark.vw._VowpalWabbitClassifier import _VowpalWabbitClassifier
+from pyspark.ml.common import inherit_doc
+
+@inherit_doc
+class VowpalWabbitClassifier(_VowpalWabbitClassifier):
+
+ def setInitialModel(self, model):
+ """
+ Initialize the estimator with a previously trained model.
+ """
+ self._java_obj.setInitialModel(model._java_obj.getModel())
diff --git a/src/main/python/mmlspark/vw/VowpalWabbitContextualBandit.py b/vw/src/main/python/mmlspark/vw/VowpalWabbitContextualBandit.py
similarity index 100%
rename from src/main/python/mmlspark/vw/VowpalWabbitContextualBandit.py
rename to vw/src/main/python/mmlspark/vw/VowpalWabbitContextualBandit.py
diff --git a/src/main/python/mmlspark/vw/VowpalWabbitContextualBanditModel.py b/vw/src/main/python/mmlspark/vw/VowpalWabbitContextualBanditModel.py
similarity index 100%
rename from src/main/python/mmlspark/vw/VowpalWabbitContextualBanditModel.py
rename to vw/src/main/python/mmlspark/vw/VowpalWabbitContextualBanditModel.py
diff --git a/src/main/python/mmlspark/vw/VowpalWabbitRegressionModel.py b/vw/src/main/python/mmlspark/vw/VowpalWabbitRegressionModel.py
similarity index 100%
rename from src/main/python/mmlspark/vw/VowpalWabbitRegressionModel.py
rename to vw/src/main/python/mmlspark/vw/VowpalWabbitRegressionModel.py
diff --git a/src/main/python/mmlspark/vw/VowpalWabbitRegressor.py b/vw/src/main/python/mmlspark/vw/VowpalWabbitRegressor.py
similarity index 100%
rename from src/main/python/mmlspark/vw/VowpalWabbitRegressor.py
rename to vw/src/main/python/mmlspark/vw/VowpalWabbitRegressor.py
diff --git a/src/test/python/mmlsparktest/cyber/feature/__init__.py b/vw/src/main/python/mmlspark/vw/__init__.py
similarity index 100%
rename from src/test/python/mmlsparktest/cyber/feature/__init__.py
rename to vw/src/main/python/mmlspark/vw/__init__.py
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/HasNumBits.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/HasNumBits.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/vw/HasNumBits.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/HasNumBits.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/HasSumcollisions.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/HasSumcollisions.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/vw/HasSumcollisions.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/HasSumcollisions.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VectorUtils.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VectorUtils.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/vw/VectorUtils.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VectorUtils.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VectorZipper.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VectorZipper.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/vw/VectorZipper.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VectorZipper.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBase.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBase.scala
similarity index 99%
rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBase.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBase.scala
index 401daeadd24..59c983aac1b 100644
--- a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBase.scala
+++ b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBase.scala
@@ -9,7 +9,7 @@ import com.microsoft.ml.spark.codegen.Wrappable
import com.microsoft.ml.spark.core.contracts.HasWeightCol
import com.microsoft.ml.spark.core.env.StreamUtilities
import com.microsoft.ml.spark.core.utils.{ClusterUtil, StopWatch}
-import com.microsoft.ml.spark.downloader.FaultToleranceUtils
+import com.microsoft.ml.spark.core.utils.FaultToleranceUtils
import org.apache.spark.TaskContext
import org.apache.spark.internal._
import org.apache.spark.ml.param._
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBaseModel.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBaseModel.scala
similarity index 94%
rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBaseModel.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBaseModel.scala
index 46b85505e73..d0208829915 100644
--- a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBaseModel.scala
+++ b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBaseModel.scala
@@ -4,15 +4,13 @@
package com.microsoft.ml.spark.vw
import com.microsoft.ml.spark.core.env.StreamUtilities
-import com.microsoft.ml.spark.downloader.FaultToleranceUtils
+import com.microsoft.ml.spark.core.utils.FaultToleranceUtils
import org.apache.spark.binary.BinaryFileFormat
-import org.apache.spark.ml.ComplexParamsWritable
-import org.apache.spark.ml.linalg.{DenseVector, SparseVector}
import org.apache.spark.ml.param.{ByteArrayParam, DataFrameParam, Param}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.functions.{col, struct, udf}
import org.apache.spark.sql.types.StructType
-import org.vowpalwabbit.spark.{VowpalWabbitArguments, VowpalWabbitExample, VowpalWabbitMurmur, VowpalWabbitNative}
+import org.vowpalwabbit.spark.{VowpalWabbitArguments, VowpalWabbitExample, VowpalWabbitNative}
import org.vowpalwabbit.spark.prediction.ScalarPrediction
import scala.io.Source
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitClassifier.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitClassifier.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitClassifier.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitClassifier.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitContextualBandit.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitContextualBandit.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitContextualBandit.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitContextualBandit.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitFeaturizer.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitFeaturizer.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitFeaturizer.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitInteractions.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitInteractions.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitInteractions.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitInteractions.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitMurmurWithPrefix.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitMurmurWithPrefix.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitMurmurWithPrefix.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitMurmurWithPrefix.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitRegressor.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitRegressor.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitRegressor.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitRegressor.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitUtil.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitUtil.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitUtil.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitUtil.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/BooleanFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/BooleanFeaturizer.scala
similarity index 97%
rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/BooleanFeaturizer.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/BooleanFeaturizer.scala
index 75dd1d651ae..7ae43e536d0 100644
--- a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/BooleanFeaturizer.scala
+++ b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/BooleanFeaturizer.scala
@@ -1,52 +1,52 @@
-// Copyright (C) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License. See LICENSE in project root for information.
-
-package com.microsoft.ml.spark.vw.featurizer
-
-import org.apache.spark.sql.Row
-import org.vowpalwabbit.spark.VowpalWabbitMurmur
-
-import scala.collection.mutable
-
-/**
- * Featurize boolean value into native VW structure. (True = hash(feature name):1, False ignored).
- * @param fieldIdx input field index.
- * @param columnName used as feature name.
- * @param namespaceHash pre-hashed namespace.
- * @param mask bit mask applied to final hash.
- */
-private[ml] class BooleanFeaturizer(override val fieldIdx: Int,
- override val columnName: String,
- namespaceHash: Int, mask: Int)
- extends Featurizer(fieldIdx) with ElementFeaturizer[Boolean] {
-
- /**
- * Pre-hashed feature index.
- */
- val featureIdx: Int = mask & VowpalWabbitMurmur.hash(columnName, namespaceHash)
-
- /**
- * Featurize a single row.
- * @param row input row.
- * @param indices output indices.
- * @param values output values.
- * @note this interface isn't very Scala idiomatic, but it avoids lots of allocation.
- * Also due to SparseVector limitations we don't support 64bit indices (e.g. indices are signed 32bit ints)
- */
- override def featurize(row: Row,
- indices: mutable.ArrayBuilder[Int],
- values: mutable.ArrayBuilder[Double]): Unit = {
-
- featurize(0, row.getBoolean(fieldIdx), indices, values)
- }
-
- def featurize(idx: Int,
- value: Boolean,
- indices: mutable.ArrayBuilder[Int],
- values: mutable.ArrayBuilder[Double]): Unit = {
- if (value) {
- indices += featureIdx + idx
- values += 1.0
- }
- }
-}
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
+
+package com.microsoft.ml.spark.vw.featurizer
+
+import org.apache.spark.sql.Row
+import org.vowpalwabbit.spark.VowpalWabbitMurmur
+
+import scala.collection.mutable
+
+/**
+ * Featurize boolean value into native VW structure. (True = hash(feature name):1, False ignored).
+ * @param fieldIdx input field index.
+ * @param columnName used as feature name.
+ * @param namespaceHash pre-hashed namespace.
+ * @param mask bit mask applied to final hash.
+ */
+private[ml] class BooleanFeaturizer(override val fieldIdx: Int,
+ override val columnName: String,
+ namespaceHash: Int, mask: Int)
+ extends Featurizer(fieldIdx) with ElementFeaturizer[Boolean] {
+
+ /**
+ * Pre-hashed feature index.
+ */
+ val featureIdx: Int = mask & VowpalWabbitMurmur.hash(columnName, namespaceHash)
+
+ /**
+ * Featurize a single row.
+ * @param row input row.
+ * @param indices output indices.
+ * @param values output values.
+ * @note this interface isn't very Scala idiomatic, but it avoids lots of allocation.
+ * Also due to SparseVector limitations we don't support 64bit indices (e.g. indices are signed 32bit ints)
+ */
+ override def featurize(row: Row,
+ indices: mutable.ArrayBuilder[Int],
+ values: mutable.ArrayBuilder[Double]): Unit = {
+
+ featurize(0, row.getBoolean(fieldIdx), indices, values)
+ }
+
+ def featurize(idx: Int,
+ value: Boolean,
+ indices: mutable.ArrayBuilder[Int],
+ values: mutable.ArrayBuilder[Double]): Unit = {
+ if (value) {
+ indices += featureIdx + idx
+ values += 1.0
+ }
+ }
+}
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/ElementFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/ElementFeaturizer.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/ElementFeaturizer.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/ElementFeaturizer.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/Featurizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/Featurizer.scala
similarity index 97%
rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/Featurizer.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/Featurizer.scala
index a8d6bf1353e..deceb8ddd7a 100644
--- a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/Featurizer.scala
+++ b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/Featurizer.scala
@@ -1,29 +1,29 @@
-// Copyright (C) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License. See LICENSE in project root for information.
-
-package com.microsoft.ml.spark.vw.featurizer
-
-import com.microsoft.ml.spark.vw.VowpalWabbitMurmurWithPrefix
-import org.apache.spark.sql.Row
-
-import scala.collection.mutable
-
-private[ml] abstract class Featurizer(val fieldIdx: Int) extends Serializable {
-
- val columnName: String
-
- /**
- * Initialize hasher that already pre-hashes the column prefix.
- */
- protected lazy val hasher: VowpalWabbitMurmurWithPrefix = new VowpalWabbitMurmurWithPrefix(columnName)
-
- /**
- * Featurize a single row.
- * @param row input row.
- * @param indices output indices.
- * @param values output values.
- * @note this interface isn't very Scala idiomatic, but it avoids lots of allocation.
- * Also due to SparseVector limitations we don't support 64bit indices (e.g. indices are signed 32bit ints)
- */
- def featurize(row: Row, indices: mutable.ArrayBuilder[Int], values: mutable.ArrayBuilder[Double]): Unit
-}
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
+
+package com.microsoft.ml.spark.vw.featurizer
+
+import com.microsoft.ml.spark.vw.VowpalWabbitMurmurWithPrefix
+import org.apache.spark.sql.Row
+
+import scala.collection.mutable
+
+private[ml] abstract class Featurizer(val fieldIdx: Int) extends Serializable {
+
+ val columnName: String
+
+ /**
+ * Initialize hasher that already pre-hashes the column prefix.
+ */
+ protected lazy val hasher: VowpalWabbitMurmurWithPrefix = new VowpalWabbitMurmurWithPrefix(columnName)
+
+ /**
+ * Featurize a single row.
+ * @param row input row.
+ * @param indices output indices.
+ * @param values output values.
+ * @note this interface isn't very Scala idiomatic, but it avoids lots of allocation.
+ * Also due to SparseVector limitations we don't support 64bit indices (e.g. indices are signed 32bit ints)
+ */
+ def featurize(row: Row, indices: mutable.ArrayBuilder[Int], values: mutable.ArrayBuilder[Double]): Unit
+}
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapFeaturizer.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapFeaturizer.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapFeaturizer.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapStringFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapStringFeaturizer.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapStringFeaturizer.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapStringFeaturizer.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/NumericFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/NumericFeaturizer.scala
similarity index 97%
rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/NumericFeaturizer.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/NumericFeaturizer.scala
index c7ade02c07c..cc56a1081b3 100644
--- a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/NumericFeaturizer.scala
+++ b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/NumericFeaturizer.scala
@@ -1,61 +1,61 @@
-// Copyright (C) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License. See LICENSE in project root for information.
-
-package com.microsoft.ml.spark.vw.featurizer
-
-import org.apache.spark.sql.Row
-import org.vowpalwabbit.spark.VowpalWabbitMurmur
-
-import scala.collection.mutable
-
-/**
- * Featurize numeric values into native VW structure. ((hash(column name):value)
- * @param fieldIdx input field index.
- * @param columnName used as feature name prefix.
- * @param namespaceHash pre-hashed namespace.
- * @param mask bit mask applied to final hash.
- */
-private[ml] class NumericFeaturizer[T: Numeric](override val fieldIdx: Int,
- override val columnName: String,
- val namespaceHash: Int,
- val mask: Int,
- val zero: Numeric[T])
- extends Featurizer(fieldIdx) with ElementFeaturizer[T] {
-
- /**
- * Pre-hashed feature index.
- */
- val featureIdx: Int = mask & VowpalWabbitMurmur.hash(columnName, namespaceHash)
-
- override def featurize(row: Row,
- indices: mutable.ArrayBuilder[Int],
- values: mutable.ArrayBuilder[Double]): Unit = {
- featurize(0, row.getAs[T](fieldIdx), indices, values)
- }
-
- def featurize(idx: Int,
- value: T,
- indices: mutable.ArrayBuilder[Int],
- values: mutable.ArrayBuilder[Double]): Unit = {
- // Note: 0 valued features are always filtered.
- if (value != zero.zero) {
- indices += featureIdx + idx
- // This is weird but zero is a numeric typeclass that is used to convert the generic T to a double.
- values += zero.toDouble(value)
- }
- ()
- }
-}
-
-class NullableNumericFeaturizer[T: Numeric](override val fieldIdx: Int,
- override val columnName: String,
- override val namespaceHash: Int,
- override val mask: Int,
- override val zero: Numeric[T])
- extends NumericFeaturizer[T](fieldIdx, columnName, namespaceHash, mask, zero) {
- override def featurize(row: Row,
- indices: mutable.ArrayBuilder[Int],
- values: mutable.ArrayBuilder[Double]): Unit =
- if (!row.isNullAt(fieldIdx))
- super.featurize(row, indices, values)
-}
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
+
+package com.microsoft.ml.spark.vw.featurizer
+
+import org.apache.spark.sql.Row
+import org.vowpalwabbit.spark.VowpalWabbitMurmur
+
+import scala.collection.mutable
+
+/**
+ * Featurize numeric values into native VW structure. ((hash(column name):value)
+ * @param fieldIdx input field index.
+ * @param columnName used as feature name prefix.
+ * @param namespaceHash pre-hashed namespace.
+ * @param mask bit mask applied to final hash.
+ */
+private[ml] class NumericFeaturizer[T: Numeric](override val fieldIdx: Int,
+ override val columnName: String,
+ val namespaceHash: Int,
+ val mask: Int,
+ val zero: Numeric[T])
+ extends Featurizer(fieldIdx) with ElementFeaturizer[T] {
+
+ /**
+ * Pre-hashed feature index.
+ */
+ val featureIdx: Int = mask & VowpalWabbitMurmur.hash(columnName, namespaceHash)
+
+ override def featurize(row: Row,
+ indices: mutable.ArrayBuilder[Int],
+ values: mutable.ArrayBuilder[Double]): Unit = {
+ featurize(0, row.getAs[T](fieldIdx), indices, values)
+ }
+
+ def featurize(idx: Int,
+ value: T,
+ indices: mutable.ArrayBuilder[Int],
+ values: mutable.ArrayBuilder[Double]): Unit = {
+ // Note: 0 valued features are always filtered.
+ if (value != zero.zero) {
+ indices += featureIdx + idx
+ // This is weird but zero is a numeric typeclass that is used to convert the generic T to a double.
+ values += zero.toDouble(value)
+ }
+ ()
+ }
+}
+
+class NullableNumericFeaturizer[T: Numeric](override val fieldIdx: Int,
+ override val columnName: String,
+ override val namespaceHash: Int,
+ override val mask: Int,
+ override val zero: Numeric[T])
+ extends NumericFeaturizer[T](fieldIdx, columnName, namespaceHash, mask, zero) {
+ override def featurize(row: Row,
+ indices: mutable.ArrayBuilder[Int],
+ values: mutable.ArrayBuilder[Double]): Unit =
+ if (!row.isNullAt(fieldIdx))
+ super.featurize(row, indices, values)
+}
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/SeqFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/SeqFeaturizer.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/SeqFeaturizer.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/SeqFeaturizer.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringFeaturizer.scala
similarity index 97%
rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringFeaturizer.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringFeaturizer.scala
index 804f6b482f2..d5821415228 100644
--- a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringFeaturizer.scala
+++ b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringFeaturizer.scala
@@ -1,47 +1,47 @@
-// Copyright (C) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License. See LICENSE in project root for information.
-
-package com.microsoft.ml.spark.vw.featurizer
-
-import org.apache.spark.sql.Row
-
-import scala.collection.mutable
-
-/**
- * Featurize string into native VW structure. (hash(column name + value):1)
- * @param fieldIdx input field index.
- * @param columnName used as feature name prefix.
- * @param namespaceHash pre-hashed namespace.
- * @param mask bit mask applied to final hash.
- */
-private[ml] class StringFeaturizer(override val fieldIdx: Int,
- override val columnName: String,
- val namespaceHash: Int,
- val mask: Int)
- extends Featurizer(fieldIdx) with ElementFeaturizer[String] {
-
- /**
- * Featurize a single row.
- * @param row input row.
- * @param indices output indices.
- * @param values output values.
- * @note this interface isn't very Scala-esce, but it avoids lots of allocation.
- * Also due to SparseVector limitations we don't support 64bit indices (e.g. indices are signed 32bit ints)
- */
- override def featurize(row: Row, indices: mutable.ArrayBuilder[Int], values: mutable.ArrayBuilder[Double]): Unit = {
- featurize(0, row.getString(fieldIdx), indices, values)
-
- ()
- }
-
- def featurize(idx: Int,
- value: String,
- indices: mutable.ArrayBuilder[Int],
- values: mutable.ArrayBuilder[Double]): Unit = {
-
- if (value != null && !value.isEmpty) {
- indices += mask & hasher.hash(value, namespaceHash)
- values += 1.0
- }
- }
-}
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
+
+package com.microsoft.ml.spark.vw.featurizer
+
+import org.apache.spark.sql.Row
+
+import scala.collection.mutable
+
+/**
+ * Featurize string into native VW structure. (hash(column name + value):1)
+ * @param fieldIdx input field index.
+ * @param columnName used as feature name prefix.
+ * @param namespaceHash pre-hashed namespace.
+ * @param mask bit mask applied to final hash.
+ */
+private[ml] class StringFeaturizer(override val fieldIdx: Int,
+ override val columnName: String,
+ val namespaceHash: Int,
+ val mask: Int)
+ extends Featurizer(fieldIdx) with ElementFeaturizer[String] {
+
+ /**
+ * Featurize a single row.
+ * @param row input row.
+ * @param indices output indices.
+ * @param values output values.
+ * @note this interface isn't very Scala-esque, but it avoids lots of allocation.
+ * Also due to SparseVector limitations we don't support 64bit indices (e.g. indices are signed 32bit ints)
+ */
+ override def featurize(row: Row, indices: mutable.ArrayBuilder[Int], values: mutable.ArrayBuilder[Double]): Unit = {
+ featurize(0, row.getString(fieldIdx), indices, values)
+
+ ()
+ }
+
+ def featurize(idx: Int,
+ value: String,
+ indices: mutable.ArrayBuilder[Int],
+ values: mutable.ArrayBuilder[Double]): Unit = {
+
+ if (value != null && !value.isEmpty) {
+ indices += mask & hasher.hash(value, namespaceHash)
+ values += 1.0
+ }
+ }
+}
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringSplitFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringSplitFeaturizer.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringSplitFeaturizer.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringSplitFeaturizer.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StructFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StructFeaturizer.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/StructFeaturizer.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StructFeaturizer.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/VectorFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/VectorFeaturizer.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/VectorFeaturizer.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/VectorFeaturizer.scala
diff --git a/src/test/python/mmlsparktest/cyber/utils/__init__.py b/vw/src/test/python/mmlsparktest/vw/__init__.py
similarity index 100%
rename from src/test/python/mmlsparktest/cyber/utils/__init__.py
rename to vw/src/test/python/mmlsparktest/vw/__init__.py
diff --git a/src/test/python/mmlsparktest/vw/test_vw.py b/vw/src/test/python/mmlsparktest/vw/test_vw.py
similarity index 100%
rename from src/test/python/mmlsparktest/vw/test_vw.py
rename to vw/src/test/python/mmlsparktest/vw/test_vw.py
diff --git a/src/test/python/mmlsparktest/vw/test_vw_cb.py b/vw/src/test/python/mmlsparktest/vw/test_vw_cb.py
similarity index 100%
rename from src/test/python/mmlsparktest/vw/test_vw_cb.py
rename to vw/src/test/python/mmlsparktest/vw/test_vw_cb.py
diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VWContextualBandidSpec.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VWContextualBandidSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/vw/VWContextualBandidSpec.scala
rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VWContextualBandidSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VerifyVectorZipper.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVectorZipper.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/vw/VerifyVectorZipper.scala
rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVectorZipper.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitClassifier.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitClassifier.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitClassifier.scala
rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitClassifier.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitFeaturizer.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitFeaturizer.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitFeaturizer.scala
rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitFeaturizer.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitInteractions.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitInteractions.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitInteractions.scala
rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitInteractions.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitMurmurWithPrefix.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitMurmurWithPrefix.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitMurmurWithPrefix.scala
rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitMurmurWithPrefix.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitRegressor.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitRegressor.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitRegressor.scala
rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitRegressor.scala