Skip to content

Commit ff6bc97

Browse files
inumanaga and arshajii authored
SIMD & GPU updates (#704)
* Update getters/setters * Incorporate Vectron SIMD changes; Add support for parametrized attributes * Support for parametrized attributes; Fix few SIMD and Int helpers * Fix SIMD loading alignment * Auto-load plugins on import * New runtime (dlsym) GPU mode; Remove compile-time GPU dependency; dlsym improvements * Fix tests and documentation * Fix importVar timing * New SIMD library; Add __file__; Add support for string/bool compile-time defines; Fix minor issues * Fix issues with whitespace delimiters in NumPy I/O * clang-format * Refactor GPU-related standard library code * SIMD library tests & updates * Fix __file__; Move numpy outside of GPU module * Fix context fetching from another modules * Embed PTX code in LLVM module * Fix sys import * Fix macOS x86 CI target * Fix macOS x86 CI target * Fix NumPy bounds check elision optimization * Remove unused vars * Support optional output to PTX file * Remove Int.__eq__(int); add SIMD tests * Fix doc generation * Add SIMD docs * Fix SIMD tests --------- Co-authored-by: A. R. Shajii <[email protected]>
1 parent e3badf0 commit ff6bc97

Some content is hidden

Large commits have some of their content hidden by default. Use the search box below to find content that may be hidden.

50 files changed

+2942
-2023
lines changed

.github/build-linux/entrypoint.sh

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,19 +32,12 @@ python -m pip install cython wheel astunparse
3232
python -m pip install --force-reinstall -v "numpy==2.0.2"
3333

3434
# Build Codon
35-
CODON_EXTRA=""
36-
if command -v nvcc &> /dev/null; then
37-
# Enable GPU support if CUDA is detected
38-
nvcc_version=$(nvcc --version | grep "release" | awk '{print $NF}')
39-
echo "CUDA Version: $nvcc_version"
40-
CODON_EXTRA="-DCODON_GPU=ON"
41-
fi
4235
cmake -S . -B build-${ARCH} \
4336
-G Ninja \
4437
-DCMAKE_BUILD_TYPE=Release \
4538
-DCMAKE_C_COMPILER=${COMPILER_PREFIX}clang \
4639
-DCMAKE_CXX_COMPILER=${COMPILER_PREFIX}clang++ \
47-
-DLLVM_DIR=/opt/llvm-codon/lib/cmake/llvm ${CODON_EXTRA}
40+
-DLLVM_DIR=/opt/llvm-codon/lib/cmake/llvm
4841
cmake --build build-${ARCH}
4942
cmake --install build-${ARCH} --prefix=${CODON_DIR}
5043

.github/workflows/ci.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ jobs:
3737
arch: linux-aarch64
3838
- os: ubuntu-24.04-arm
3939
arch: manylinux2014-aarch64
40-
- os: macos-13
40+
- os: macos-15-intel
4141
arch: darwin-x86_64
4242
- os: macos-14
4343
arch: darwin-arm64
@@ -50,6 +50,11 @@ jobs:
5050
steps:
5151
- uses: actions/checkout@v5
5252

53+
- name: Set up Python
54+
uses: actions/setup-python@v5
55+
with:
56+
python-version: '3.11'
57+
5358
- name: Build (Ubuntu)
5459
if: startsWith(matrix.os, 'ubuntu')
5560
run: |

CMakeLists.txt

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,6 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
1414
cmake_policy(SET CMP0135 NEW)
1515
endif()
1616

17-
option(CODON_GPU "build Codon GPU backend" OFF)
18-
1917
set(CMAKE_CXX_STANDARD 20)
2018
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
2119
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
@@ -99,7 +97,7 @@ target_compile_definitions(codonfloat PRIVATE COMPILER_RT_HAS_FLOAT16)
9997

10098
set(CODONRT_FILES codon/runtime/lib.h codon/runtime/lib.cpp
10199
codon/runtime/re.cpp codon/runtime/exc.cpp
102-
codon/runtime/gpu.cpp codon/runtime/numpy/sort.cpp
100+
codon/runtime/numpy/sort.cpp
103101
codon/runtime/numpy/loops.cpp codon/runtime/numpy/zmath.cpp)
104102
add_library(codonrt SHARED ${CODONRT_FILES})
105103
add_dependencies(codonrt zlibstatic gc backtrace bz2 liblzma
@@ -167,11 +165,6 @@ if(ASAN)
167165
codonrt PRIVATE "-fno-omit-frame-pointer" "-fsanitize=address"
168166
"-fsanitize-recover=address")
169167
endif()
170-
if(CODON_GPU)
171-
add_compile_definitions(CODON_GPU)
172-
find_package(CUDAToolkit REQUIRED)
173-
target_link_libraries(codonrt PRIVATE CUDA::cudart_static CUDA::cuda_driver)
174-
endif()
175168
add_custom_command(
176169
TARGET codonrt
177170
POST_BUILD

codon/cir/llvm/gpu.cpp

Lines changed: 65 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ const std::string GPU_DL =
1919
llvm::cl::opt<std::string>
2020
libdevice("libdevice", llvm::cl::desc("libdevice path for GPU kernels"),
2121
llvm::cl::init("/usr/local/cuda/nvvm/libdevice/libdevice.10.bc"));
22+
llvm::cl::opt<std::string> ptxOutput("ptx",
23+
llvm::cl::desc("Output PTX to specified file"));
2224

2325
// Adapted from LLVM's GVExtractorPass, which is not externally available
2426
// as a pass for the new pass manager.
@@ -684,10 +686,9 @@ getRequiredGVs(const std::vector<llvm::GlobalValue *> &kernels) {
684686
return std::vector<llvm::GlobalValue *>(keep.begin(), keep.end());
685687
}
686688

687-
void moduleToPTX(llvm::Module *M, const std::string &filename,
688-
std::vector<llvm::GlobalValue *> &kernels,
689-
const std::string &cpuStr = "sm_30",
690-
const std::string &featuresStr = "+ptx42") {
689+
std::string moduleToPTX(llvm::Module *M, std::vector<llvm::GlobalValue *> &kernels,
690+
const std::string &cpuStr = "sm_30",
691+
const std::string &featuresStr = "+ptx42") {
691692
llvm::Triple triple(llvm::Triple::normalize(GPU_TRIPLE));
692693
llvm::TargetLibraryInfoImpl tlii(triple);
693694

@@ -792,56 +793,25 @@ void moduleToPTX(llvm::Module *M, const std::string &filename,
792793
}
793794
}
794795

795-
// Generate PTX file.
796+
// Generate PTX code.
796797
{
797-
std::error_code errcode;
798-
auto out = std::make_unique<llvm::ToolOutputFile>(filename, errcode,
799-
llvm::sys::fs::OF_Text);
800-
if (errcode)
801-
compilationError(errcode.message());
802-
llvm::raw_pwrite_stream *os = &out->os();
798+
llvm::SmallVector<char, 1024> ptx;
799+
llvm::raw_svector_ostream os(ptx);
803800

804801
auto *mmiwp = new llvm::MachineModuleInfoWrapperPass(machine.get());
805802
llvm::legacy::PassManager pm;
806803

807804
pm.add(new llvm::TargetLibraryInfoWrapperPass(tlii));
808-
seqassertn(!machine->addPassesToEmitFile(pm, *os, nullptr,
805+
bool fail = machine->addPassesToEmitFile(pm, os, nullptr,
809806
llvm::CodeGenFileType::AssemblyFile,
810-
/*DisableVerify=*/false, mmiwp),
811-
"could not add passes");
807+
/*DisableVerify=*/false, mmiwp);
808+
seqassertn(!fail, "could not add passes");
809+
812810
const_cast<llvm::TargetLoweringObjectFile *>(machine->getObjFileLowering())
813811
->Initialize(mmiwp->getMMI().getContext(), *machine);
814-
pm.run(*M);
815-
out->keep();
816-
}
817-
}
818-
819-
void addInitCall(llvm::Module *M, const std::string &filename) {
820-
llvm::LLVMContext &context = M->getContext();
821-
llvm::IRBuilder<> B(context);
822-
auto f = M->getOrInsertFunction("seq_nvptx_load_module", B.getVoidTy(), B.getPtrTy());
823-
auto *g = llvm::cast<llvm::Function>(f.getCallee());
824-
g->setDoesNotThrow();
825-
826-
auto *filenameVar = new llvm::GlobalVariable(
827-
*M, llvm::ArrayType::get(llvm::Type::getInt8Ty(context), filename.length() + 1),
828-
/*isConstant=*/true, llvm::GlobalValue::PrivateLinkage,
829-
llvm::ConstantDataArray::getString(context, filename), ".nvptx.filename");
830-
filenameVar->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
831-
832-
if (auto *init = M->getFunction("seq_init")) {
833-
seqassertn(init->hasOneUse(), "seq_init used more than once");
834-
auto *use = llvm::dyn_cast<llvm::CallBase>(init->use_begin()->getUser());
835-
seqassertn(use, "seq_init use was not a call");
836-
B.SetInsertPoint(use->getNextNode());
837-
B.CreateCall(g, B.CreateBitCast(filenameVar, B.getPtrTy()));
838-
}
839812

840-
for (auto &F : M->functions()) {
841-
if (F.hasFnAttribute("jit")) {
842-
B.SetInsertPoint(F.getEntryBlock().getFirstNonPHI());
843-
B.CreateCall(g, B.CreateBitCast(filenameVar, B.getPtrTy()));
844-
}
813+
pm.run(*M);
814+
return std::string(ptx.data(), ptx.size());
845815
}
846816
}
847817

@@ -894,16 +864,58 @@ void applyGPUTransformations(llvm::Module *M, const std::string &ptxFilename) {
894864
if (kernels.empty())
895865
return;
896866

897-
std::string filename = ptxFilename.empty() ? M->getSourceFileName() : ptxFilename;
898-
if (filename.empty() || filename[0] == '<')
899-
filename = "kernel";
900-
llvm::SmallString<128> path(filename);
901-
llvm::sys::path::replace_extension(path, "ptx");
902-
filename = path.str();
903-
904-
moduleToPTX(clone.get(), filename, kernels);
867+
auto ptx = moduleToPTX(clone.get(), kernels);
905868
cleanUpIntrinsics(M);
906-
addInitCall(M, filename);
869+
870+
if (ptxOutput.getNumOccurrences() > 0) {
871+
std::error_code err;
872+
llvm::ToolOutputFile out(ptxOutput, err, llvm::sys::fs::OF_Text);
873+
seqassertn(!err, "Could not open file: {}", err.message());
874+
llvm::raw_ostream &os = out.os();
875+
os << ptx;
876+
os.flush();
877+
out.keep();
878+
}
879+
880+
// Add ptx code as a global var
881+
auto *ptxVar = new llvm::GlobalVariable(
882+
*M, llvm::ArrayType::get(llvm::Type::getInt8Ty(context), ptx.length() + 1),
883+
/*isConstant=*/true, llvm::GlobalValue::PrivateLinkage,
884+
llvm::ConstantDataArray::getString(context, ptx), ".ptx");
885+
886+
ptxVar->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
887+
888+
// Find and patch direct calls to cuModuleLoadData()
889+
const std::string ptxTarget = "__codon_ptx__"; // must match gpu.codon name
890+
llvm::SmallVector<llvm::Instruction *, 1> callsToReplace;
891+
for (auto &F : *M) {
892+
for (auto &BB : F) {
893+
for (auto &I : BB) {
894+
auto *call = llvm::dyn_cast<llvm::CallBase>(&I);
895+
if (!call)
896+
continue;
897+
898+
auto *callee = call->getCalledFunction();
899+
if (!callee)
900+
continue;
901+
902+
if (callee->getName() == ptxTarget && call->arg_size() == 0)
903+
callsToReplace.push_back(call);
904+
}
905+
}
906+
}
907+
908+
for (auto *call : callsToReplace) {
909+
call->replaceAllUsesWith(ptxVar);
910+
call->dropAllReferences();
911+
call->eraseFromParent();
912+
}
913+
914+
// Delete __codon_ptx__() stub
915+
if (auto *F = M->getFunction(ptxTarget)) {
916+
seqassertn(F->use_empty(), "some __codon_ptx__() calls not replaced in module");
917+
F->eraseFromParent();
918+
}
907919
}
908920

909921
} // namespace ir

codon/cir/llvm/llvisitor.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ const std::string INLINE_ATTR =
2828
ast::getMangledFunc("std.internal.attributes", "inline");
2929
const std::string NOINLINE_ATTR =
3030
ast::getMangledFunc("std.internal.attributes", "noinline");
31-
const std::string GPU_KERNEL_ATTR = ast::getMangledFunc("std.gpu", "kernel");
31+
const std::string GPU_KERNEL_ATTR = ast::getMangledFunc("std.internal.gpu", "kernel");
3232

3333
const std::string MAIN_UNCLASH = ".main.unclash";
3434
const std::string MAIN_CTOR = ".main.ctor";

codon/cir/module.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -318,8 +318,7 @@ types::Type *Module::getIntNType(unsigned int len, bool sign) {
318318
}
319319

320320
types::Type *Module::getVectorType(unsigned count, types::Type *base) {
321-
return getOrRealizeType(ast::getMangledClass("std.experimental.simd", "Vec"),
322-
{base, count});
321+
return getOrRealizeType(ast::getMangledClass("std.simd", "Vec"), {base, count});
323322
}
324323

325324
types::Type *Module::getTupleType(std::vector<types::Type *> args) {

codon/cir/transform/numpy/indexing.cpp

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ namespace ir {
1313
namespace transform {
1414
namespace numpy {
1515
namespace {
16+
const std::string FUSION_MODULE = "std.numpy.fusion";
1617

1718
struct Term {
1819
enum Kind { INT, VAR, LEN } kind;
@@ -255,21 +256,24 @@ struct FindArrayIndex : public util::Operator {
255256

256257
void elideBoundsCheck(IndexInfo &index) {
257258
auto *M = index.orig->getModule();
258-
auto *data = M->Nr<ExtractInstr>(M->Nr<VarValue>(index.arr->getVar()), "_data");
259259
util::CloneVisitor cv(M);
260260

261261
if (index.item) {
262-
auto *setitem = M->getOrRealizeMethod(
263-
data->getType(), Module::SETITEM_MAGIC_NAME,
264-
{data->getType(), M->getIntType(), index.item->getType()});
265-
seqassertn(setitem, "setitem method not found");
262+
auto *setitem = M->getOrRealizeFunc(
263+
"_array1d_set_nocheck",
264+
{index.arr->getType(), M->getIntType(), index.item->getType()}, {},
265+
FUSION_MODULE);
266+
seqassertn(setitem, "setitem function not found");
266267
index.orig->replaceAll(
267-
util::call(setitem, {data, cv.clone(index.idx), cv.clone(index.item)}));
268+
util::call(setitem, {M->Nr<VarValue>(index.arr->getVar()), cv.clone(index.idx),
269+
cv.clone(index.item)}));
268270
} else {
269-
auto *getitem = M->getOrRealizeMethod(data->getType(), Module::GETITEM_MAGIC_NAME,
270-
{data->getType(), M->getIntType()});
271-
seqassertn(getitem, "getitem method not found");
272-
index.orig->replaceAll(util::call(getitem, {data, cv.clone(index.idx)}));
271+
auto *getitem =
272+
M->getOrRealizeFunc("_array1d_get_nocheck",
273+
{index.arr->getType(), M->getIntType()}, {}, FUSION_MODULE);
274+
seqassertn(getitem, "getitem function not found");
275+
index.orig->replaceAll(util::call(
276+
getitem, {M->Nr<VarValue>(index.arr->getVar()), cv.clone(index.idx)}));
273277
}
274278
}
275279

codon/cir/transform/parallel/openmp.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ namespace transform {
1818
namespace parallel {
1919
namespace {
2020
const std::string ompModule = "std.openmp";
21-
const std::string gpuModule = "std.gpu";
21+
const std::string gpuModule = "std.internal.gpu";
2222
const std::string builtinModule = "std.internal.builtin";
2323

2424
void warn(const std::string &msg, const Value *v) {
@@ -1560,7 +1560,7 @@ void OpenMPPass::handle(ImperativeForFlow *v) {
15601560

15611561
if (sched->gpu) {
15621562
std::unordered_set<id_t> kernels;
1563-
const std::string gpuAttr = ast::getMangledFunc("std.gpu", "kernel");
1563+
const std::string gpuAttr = ast::getMangledFunc("std.internal.gpu", "kernel");
15641564
for (auto *var : *M) {
15651565
if (auto *func = cast<BodiedFunc>(var)) {
15661566
if (util::hasAttribute(func, gpuAttr)) {

codon/cir/var.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ namespace ir {
99

1010
const char Var::NodeId = 0;
1111

12+
Var::Var(types::Type *type, bool global, bool external, std::string name)
13+
: ReplaceableNodeBase(std::move(name)), type(type), global(global),
14+
external(external) {}
15+
1216
int Var::doReplaceUsedType(const std::string &name, types::Type *newType) {
1317
if (type->getName() == name) {
1418
type = newType;

codon/cir/var.h

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,7 @@ class Var : public ReplaceableNodeBase<Var>, public IdMixin {
3838
/// @param external true if the variable is external
3939
/// @param name the variable's name
4040
explicit Var(types::Type *type, bool global = false, bool external = false,
41-
std::string name = "")
42-
: ReplaceableNodeBase(std::move(name)), type(type), global(global),
43-
external(external) {}
44-
41+
std::string name = "");
4542
virtual ~Var() noexcept = default;
4643

4744
std::vector<Value *> getUsedValues() final { return getActual()->doGetUsedValues(); }

0 commit comments

Comments (0)