From 6a858b26e5c2d6827a726061a271fcfe557bbc0a Mon Sep 17 00:00:00 2001 From: Jimmy Lu Date: Thu, 9 Jan 2025 15:12:49 -0800 Subject: [PATCH] fix(wave): NVRTC remote execution Differential Revision: D67990177 --- velox/experimental/wave/common/CMakeLists.txt | 2 + velox/experimental/wave/common/Compile.cu | 73 ++++--------------- velox/experimental/wave/common/StringView.h | 4 +- velox/experimental/wave/exec/Aggregation.cpp | 2 +- velox/experimental/wave/exec/ToWave.h | 8 +- velox/experimental/wave/exec/Wave.cpp | 6 +- velox/experimental/wave/exec/Wave.h | 2 +- velox/experimental/wave/jit/Headers.h | 4 +- 8 files changed, 26 insertions(+), 75 deletions(-) diff --git a/velox/experimental/wave/common/CMakeLists.txt b/velox/experimental/wave/common/CMakeLists.txt index be2d0763161a..cb20c7c5840e 100644 --- a/velox/experimental/wave/common/CMakeLists.txt +++ b/velox/experimental/wave/common/CMakeLists.txt @@ -32,6 +32,8 @@ target_link_libraries( CUDA::nvrtc CUDA::cudart) +target_compile_definitions(velox_wave_common PRIVATE VELOX_OSS_BUILD=1) + if(${VELOX_BUILD_TESTING}) add_subdirectory(tests) endif() diff --git a/velox/experimental/wave/common/Compile.cu b/velox/experimental/wave/common/Compile.cu index 39b2575602ee..c461ee5ba19c 100644 --- a/velox/experimental/wave/common/Compile.cu +++ b/velox/experimental/wave/common/Compile.cu @@ -27,6 +27,10 @@ #include "velox/experimental/wave/jit/Headers.h" #include "velox/external/jitify/jitify.hpp" +#ifndef VELOX_OSS_BUILD +#include "velox/facebook/NvrtcUtil.h" +#endif + namespace facebook::velox::wave { void nvrtcCheck(nvrtcResult result) { @@ -75,6 +79,14 @@ void addFlag( data.push_back(std::move(str)); } +#ifdef VELOX_OSS_BUILD +void getDefaultNvrtcOptions(std::vector& data) { + constexpr const char* kUsrLocalCuda = "/usr/local/cuda/include"; + LOG(INFO) << "Using " << kUsrLocalCuda; + addFlag("-I", kUsrLocalCuda, strlen(kUsrLocalCuda), data); +} +#endif + // Gets compiler options from the environment and appends them to 'data'. void getNvrtcOptions(std::vector& data) { const char* includes = getenv("WAVE_NVRTC_INCLUDE_PATH"); @@ -90,66 +102,7 @@ void getNvrtcOptions(std::vector& data) { includes = end + 1; } } else { - std::string currentPath = std::filesystem::current_path().c_str(); - LOG(INFO) << "Looking for Cuda includes. cwd=" << currentPath - << " Cuda=" << __CUDA_API_VER_MAJOR__ << "." - << __CUDA_API_VER_MINOR__; - auto pathCStr = currentPath.c_str(); - if (auto fbsource = strstr(pathCStr, "fbsource")) { - // fbcode has cuda includes in fbsource/third-party/cuda/... - try { - auto fbsourcePath = - std::string(pathCStr, fbsource - pathCStr + strlen("fbsource")) + - "/third-party/cuda"; - LOG(INFO) << "Guessing fbsource path =" << fbsourcePath; - auto tempPath = fmt::format("/tmp/cuda.{}", getpid()); - auto command = fmt::format( - "(cd {}; du |grep \"{}\\.{}.*x64-linux.*/cuda$\" |grep -v thrust) >{}", - fbsourcePath, - __CUDA_API_VER_MAJOR__, - __CUDA_API_VER_MINOR__, - tempPath); - LOG(INFO) << "Running " << command; - system(command.c_str()); - std::ifstream result(tempPath); - std::string line; - if (!std::getline(result, line)) { - LOG(ERROR) - << "Cuda includes matching build version not found in fbcode/third-party. Looking for latest cuda."; - command = fmt::format( - "(cd {}; du |grep \"{}\.*x64-linux.*/cuda$\" |grep -v thrust | sort -r) >{}", - fbsourcePath, - __CUDA_API_VER_MAJOR__, - tempPath); - LOG(INFO) << "Running " << command; - system(command.c_str()); - std::ifstream result(tempPath); - if (!std::getline(result, line)) { - LOG(ERROR) << "Did not find any cuda with the same major version"; - return; - } - } - - LOG(INFO) << "Got cuda line: " << line; - // Now trim the size and the trailing /cuda from the line. - const char* start = strstr(line.c_str(), "./"); - if (!start) { - LOG(ERROR) << "Line " << line << " does not have ./"; - return; - } - auto path = fbsourcePath + "/" + (start + 2); - // We add the cwd + the found path minus the trailing /cuda. - addFlag("-I", path.c_str(), path.size() - 5, data); - } catch (const std::exception& e) { - LOG(ERROR) << "Failed to infer fbcode Cuda include path: " << e.what(); - } - } else { - addFlag( - "-I", - "/usr/local/cuda/include", - strlen("/usr/local/cuda/include"), - data); - } + getDefaultNvrtcOptions(data); } const char* flags = getenv("WAVE_NVRTC_FLAGS"); if (flags && strlen(flags)) { diff --git a/velox/experimental/wave/common/StringView.h b/velox/experimental/wave/common/StringView.h index 939f9548f7e8..1a68ed116f9b 100644 --- a/velox/experimental/wave/common/StringView.h +++ b/velox/experimental/wave/common/StringView.h @@ -16,8 +16,8 @@ #pragma once -#include -#include +#include +#include #include "velox/experimental/wave/common/CompilerDefines.h" namespace facebook::velox::wave { diff --git a/velox/experimental/wave/exec/Aggregation.cpp b/velox/experimental/wave/exec/Aggregation.cpp index 66d9d84d7882..57aebb991289 100644 --- a/velox/experimental/wave/exec/Aggregation.cpp +++ b/velox/experimental/wave/exec/Aggregation.cpp @@ -81,7 +81,7 @@ Aggregation::Aggregation( const std::shared_ptr& functionRegistry) : WaveOperator(state, node.outputType(), node.id()), - arena_(&state.arena()), + arena_(state.arena()), functionRegistry_(functionRegistry) { VELOX_CHECK(node.step() == core::AggregationNode::Step::kSingle); VELOX_CHECK(node.preGroupedKeys().empty()); diff --git a/velox/experimental/wave/exec/ToWave.h b/velox/experimental/wave/exec/ToWave.h index d9d181cf5a46..df3900cd185d 100644 --- a/velox/experimental/wave/exec/ToWave.h +++ b/velox/experimental/wave/exec/ToWave.h @@ -745,18 +745,14 @@ class CompileState { addExprSet(const exec::ExprSet& set, int32_t begin, int32_t end); std::vector> makeLevels(int32_t startIndex); - GpuArena& arena() const { - return *arena_; + GpuArena* arena() const { + return arena_.get(); } int numOperators() const { return operators_.size(); } - GpuArena& arena() { - return *arena_; - } - std::stringstream& generated() { return generated_; } diff --git a/velox/experimental/wave/exec/Wave.cpp b/velox/experimental/wave/exec/Wave.cpp index 073f9f29c1f9..235ba87840dc 100644 --- a/velox/experimental/wave/exec/Wave.cpp +++ b/velox/experimental/wave/exec/Wave.cpp @@ -1175,8 +1175,8 @@ void Program::callUpdateStatus(WaveStream& stream, AdvanceResult& advance) { #define IN_OPERAND(member) \ physicalInst->member = operandIndex(abstractInst->member) -void Program::prepareForDevice(GpuArena& arena) { - arena_ = &arena; +void Program::prepareForDevice(GpuArena* arena) { + arena_ = arena; if (kernel_) { return; } @@ -1270,7 +1270,7 @@ void Program::prepareForDevice(GpuArena& arena) { "OpCode {}", static_cast(instruction->opCode)); } sortSlots(); - deviceData_ = arena.allocate( + deviceData_ = arena->allocate( codeSize + literalArea_.size() + sizeof(ThreadBlockProgram)); uintptr_t end = reinterpret_cast( deviceData_->as() + deviceData_->size()); diff --git a/velox/experimental/wave/exec/Wave.h b/velox/experimental/wave/exec/Wave.h index 2dcad0bb155a..c55f75cb9242 100644 --- a/velox/experimental/wave/exec/Wave.h +++ b/velox/experimental/wave/exec/Wave.h @@ -524,7 +524,7 @@ class Program : public std::enable_shared_from_this { // Initializes executableImage and relocation information and places // the result on device. - void prepareForDevice(GpuArena& arena); + void prepareForDevice(GpuArena* arena); std::unique_ptr getExecutable( int32_t maxRows, diff --git a/velox/experimental/wave/jit/Headers.h b/velox/experimental/wave/jit/Headers.h index ffdefa6125ae..4cefcd17eaa6 100644 --- a/velox/experimental/wave/jit/Headers.h +++ b/velox/experimental/wave/jit/Headers.h @@ -2178,8 +2178,8 @@ const char* velox_experimental_wave_common_StringView_h = "\n" "#pragma once\n" "\n" - "#include \n" - "#include \n" + "#include \n" + "#include \n" "#include \"velox/experimental/wave/common/CompilerDefines.h\"\n" "\n" "namespace facebook::velox::wave {\n"