Skip to content

Commit ff6bc97

Browse files
inumanaga and arshajii authored
SIMD & GPU updates (#704)
* Update getters/setters * Incorporate Vectron SIMD changes; Add support for parametrized attributes * Support for parametrized attributes; Fix few SIMD and Int helpers * Fix SIMD loading alignment * Auto-load plugins on import * New runtime (dlsym) GPU mode; Remove compile-time GPU dependency; dlsym improvements * Fix tests and documentation * Fix importVar timing * New SIMD library; Add __file__; Add support for string/bool compile-time defines; Fix minor issues * Fix issues with whitespace delimiters in NumPy I/O * clang-format * Refactor GPU-related standard library code * SIMD library tests & updates * Fix __file__; Move numpy outside of GPU module * Fix context fetching from another modules * Embed PTX code in LLVM module * Fix sys import * Fix macOS x86 CI target * Fix macOS x86 CI target * Fix NumPy bounds check elision optimization * Remove unused vars * Support optional output to PTX file * Remove Int.__eq__(int); add SIMD tests * Fix doc generation * Add SIMD docs * Fix SIMD tests --------- Co-authored-by: A. R. Shajii <[email protected]>
1 parent e3badf0 commit ff6bc97

Some content is hidden

Large commits have some of their content hidden by default. Use the search box below to find content that may be hidden.

50 files changed

+2942
-2023
lines changed

.github/build-linux/entrypoint.sh

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,19 +32,12 @@ python -m pip install cython wheel astunparse
3232
python -m pip install --force-reinstall -v "numpy==2.0.2"
3333

3434
# Build Codon
35-
CODON_EXTRA=""
36-
if command -v nvcc &> /dev/null; then
37-
# Enable GPU support if CUDA is detected
38-
nvcc_version=$(nvcc --version | grep "release" | awk '{print $NF}')
39-
echo "CUDA Version: $nvcc_version"
40-
CODON_EXTRA="-DCODON_GPU=ON"
41-
fi
4235
cmake -S . -B build-${ARCH} \
4336
-G Ninja \
4437
-DCMAKE_BUILD_TYPE=Release \
4538
-DCMAKE_C_COMPILER=${COMPILER_PREFIX}clang \
4639
-DCMAKE_CXX_COMPILER=${COMPILER_PREFIX}clang++ \
47-
-DLLVM_DIR=/opt/llvm-codon/lib/cmake/llvm ${CODON_EXTRA}
40+
-DLLVM_DIR=/opt/llvm-codon/lib/cmake/llvm
4841
cmake --build build-${ARCH}
4942
cmake --install build-${ARCH} --prefix=${CODON_DIR}
5043

.github/workflows/ci.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ jobs:
3737
arch: linux-aarch64
3838
- os: ubuntu-24.04-arm
3939
arch: manylinux2014-aarch64
40-
- os: macos-13
40+
- os: macos-15-intel
4141
arch: darwin-x86_64
4242
- os: macos-14
4343
arch: darwin-arm64
@@ -50,6 +50,11 @@ jobs:
5050
steps:
5151
- uses: actions/checkout@v5
5252

53+
- name: Set up Python
54+
uses: actions/setup-python@v5
55+
with:
56+
python-version: '3.11'
57+
5358
- name: Build (Ubuntu)
5459
if: startsWith(matrix.os, 'ubuntu')
5560
run: |

CMakeLists.txt

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,6 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
1414
cmake_policy(SET CMP0135 NEW)
1515
endif()
1616

17-
option(CODON_GPU "build Codon GPU backend" OFF)
18-
1917
set(CMAKE_CXX_STANDARD 20)
2018
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
2119
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
@@ -99,7 +97,7 @@ target_compile_definitions(codonfloat PRIVATE COMPILER_RT_HAS_FLOAT16)
9997

10098
set(CODONRT_FILES codon/runtime/lib.h codon/runtime/lib.cpp
10199
codon/runtime/re.cpp codon/runtime/exc.cpp
102-
codon/runtime/gpu.cpp codon/runtime/numpy/sort.cpp
100+
codon/runtime/numpy/sort.cpp
103101
codon/runtime/numpy/loops.cpp codon/runtime/numpy/zmath.cpp)
104102
add_library(codonrt SHARED ${CODONRT_FILES})
105103
add_dependencies(codonrt zlibstatic gc backtrace bz2 liblzma
@@ -167,11 +165,6 @@ if(ASAN)
167165
codonrt PRIVATE "-fno-omit-frame-pointer" "-fsanitize=address"
168166
"-fsanitize-recover=address")
169167
endif()
170-
if(CODON_GPU)
171-
add_compile_definitions(CODON_GPU)
172-
find_package(CUDAToolkit REQUIRED)
173-
target_link_libraries(codonrt PRIVATE CUDA::cudart_static CUDA::cuda_driver)
174-
endif()
175168
add_custom_command(
176169
TARGET codonrt
177170
POST_BUILD

codon/cir/llvm/gpu.cpp

Lines changed: 65 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ const std::string GPU_DL =
1919
llvm::cl::opt<std::string>
2020
libdevice("libdevice", llvm::cl::desc("libdevice path for GPU kernels"),
2121
llvm::cl::init("/usr/local/cuda/nvvm/libdevice/libdevice.10.bc"));
22+
llvm::cl::opt<std::string> ptxOutput("ptx",
23+
llvm::cl::desc("Output PTX to specified file"));
2224

2325
// Adapted from LLVM's GVExtractorPass, which is not externally available
2426
// as a pass for the new pass manager.
@@ -684,10 +686,9 @@ getRequiredGVs(const std::vector<llvm::GlobalValue *> &kernels) {
684686
return std::vector<llvm::GlobalValue *>(keep.begin(), keep.end());
685687
}
686688

687-
void moduleToPTX(llvm::Module *M, const std::string &filename,
688-
std::vector<llvm::GlobalValue *> &kernels,
689-
const std::string &cpuStr = "sm_30",
690-
const std::string &featuresStr = "+ptx42") {
689+
std::string moduleToPTX(llvm::Module *M, std::vector<llvm::GlobalValue *> &kernels,
690+
const std::string &cpuStr = "sm_30",
691+
const std::string &featuresStr = "+ptx42") {
691692
llvm::Triple triple(llvm::Triple::normalize(GPU_TRIPLE));
692693
llvm::TargetLibraryInfoImpl tlii(triple);
693694

@@ -792,56 +793,25 @@ void moduleToPTX(llvm::Module *M, const std::string &filename,
792793
}
793794
}
794795

795-
// Generate PTX file.
796+
// Generate PTX code.
796797
{
797-
std::error_code errcode;
798-
auto out = std::make_unique<llvm::ToolOutputFile>(filename, errcode,
799-
llvm::sys::fs::OF_Text);
800-
if (errcode)
801-
compilationError(errcode.message());
802-
llvm::raw_pwrite_stream *os = &out->os();
798+
llvm::SmallVector<char, 1024> ptx;
799+
llvm::raw_svector_ostream os(ptx);
803800

804801
auto *mmiwp = new llvm::MachineModuleInfoWrapperPass(machine.get());
805802
llvm::legacy::PassManager pm;
806803

807804
pm.add(new llvm::TargetLibraryInfoWrapperPass(tlii));
808-
seqassertn(!machine->addPassesToEmitFile(pm, *os, nullptr,
805+
bool fail = machine->addPassesToEmitFile(pm, os, nullptr,
809806
llvm::CodeGenFileType::AssemblyFile,
810-
/*DisableVerify=*/false, mmiwp),
811-
"could not add passes");
807+
/*DisableVerify=*/false, mmiwp);
808+
seqassertn(!fail, "could not add passes");
809+
812810
const_cast<llvm::TargetLoweringObjectFile *>(machine->getObjFileLowering())
813811
->Initialize(mmiwp->getMMI().getContext(), *machine);
814-
pm.run(*M);
815-
out->keep();
816-
}
817-
}
818-
819-
void addInitCall(llvm::Module *M, const std::string &filename) {
820-
llvm::LLVMContext &context = M->getContext();
821-
llvm::IRBuilder<> B(context);
822-
auto f = M->getOrInsertFunction("seq_nvptx_load_module", B.getVoidTy(), B.getPtrTy());
823-
auto *g = llvm::cast<llvm::Function>(f.getCallee());
824-
g->setDoesNotThrow();
825-
826-
auto *filenameVar = new llvm::GlobalVariable(
827-
*M, llvm::ArrayType::get(llvm::Type::getInt8Ty(context), filename.length() + 1),
828-
/*isConstant=*/true, llvm::GlobalValue::PrivateLinkage,
829-
llvm::ConstantDataArray::getString(context, filename), ".nvptx.filename");
830-
filenameVar->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
831-
832-
if (auto *init = M->getFunction("seq_init")) {
833-
seqassertn(init->hasOneUse(), "seq_init used more than once");
834-
auto *use = llvm::dyn_cast<llvm::CallBase>(init->use_begin()->getUser());
835-
seqassertn(use, "seq_init use was not a call");
836-
B.SetInsertPoint(use->getNextNode());
837-
B.CreateCall(g, B.CreateBitCast(filenameVar, B.getPtrTy()));
838-
}
839812

840-
for (auto &F : M->functions()) {
841-
if (F.hasFnAttribute("jit")) {
842-
B.SetInsertPoint(F.getEntryBlock().getFirstNonPHI());
843-
B.CreateCall(g, B.CreateBitCast(filenameVar, B.getPtrTy()));
844-
}
813+
pm.run(*M);
814+
return std::string(ptx.data(), ptx.size());
845815
}
846816
}
847817

@@ -894,16 +864,58 @@ void applyGPUTransformations(llvm::Module *M, const std::string &ptxFilename) {
894864
if (kernels.empty())
895865
return;
896866

897-
std::string filename = ptxFilename.empty() ? M->getSourceFileName() : ptxFilename;
898-
if (filename.empty() || filename[0] == '<')
899-
filename = "kernel";
900-
llvm::SmallString<128> path(filename);
901-
llvm::sys::path::replace_extension(path, "ptx");
902-
filename = path.str();
903-
904-
moduleToPTX(clone.get(), filename, kernels);
867+
auto ptx = moduleToPTX(clone.get(), kernels);
905868
cleanUpIntrinsics(M);
906-
addInitCall(M, filename);
869+
870+
if (ptxOutput.getNumOccurrences() > 0) {
871+
std::error_code err;
872+
llvm::ToolOutputFile out(ptxOutput, err, llvm::sys::fs::OF_Text);
873+
seqassertn(!err, "Could not open file: {}", err.message());
874+
llvm::raw_ostream &os = out.os();
875+
os << ptx;
876+
os.flush();
877+
out.keep();
878+
}
879+
880+
// Add ptx code as a global var
881+
auto *ptxVar = new llvm::GlobalVariable(
882+
*M, llvm::ArrayType::get(llvm::Type::getInt8Ty(context), ptx.length() + 1),
883+
/*isConstant=*/true, llvm::GlobalValue::PrivateLinkage,
884+
llvm::ConstantDataArray::getString(context, ptx), ".ptx");
885+
886+
ptxVar->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
887+
888+
// Find and patch direct calls to cuModuleLoadData()
889+
const std::string ptxTarget = "__codon_ptx__"; // must match gpu.codon name
890+
llvm::SmallVector<llvm::Instruction *, 1> callsToReplace;
891+
for (auto &F : *M) {
892+
for (auto &BB : F) {
893+
for (auto &I : BB) {
894+
auto *call = llvm::dyn_cast<llvm::CallBase>(&I);
895+
if (!call)
896+
continue;
897+
898+
auto *callee = call->getCalledFunction();
899+
if (!callee)
900+
continue;
901+
902+
if (callee->getName() == ptxTarget && call->arg_size() == 0)
903+
callsToReplace.push_back(call);
904+
}
905+
}
906+
}
907+
908+
for (auto *call : callsToReplace) {
909+
call->replaceAllUsesWith(ptxVar);
910+
call->dropAllReferences();
911+
call->eraseFromParent();
912+
}
913+
914+
// Delete __codon_ptx__() stub
915+
if (auto *F = M->getFunction(ptxTarget)) {
916+
seqassertn(F->use_empty(), "some __codon_ptx__() calls not replaced in module");
917+
F->eraseFromParent();
918+
}
907919
}
908920

909921
} // namespace ir

codon/cir/llvm/llvisitor.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ const std::string INLINE_ATTR =
2828
ast::getMangledFunc("std.internal.attributes", "inline");
2929
const std::string NOINLINE_ATTR =
3030
ast::getMangledFunc("std.internal.attributes", "noinline");
31-
const std::string GPU_KERNEL_ATTR = ast::getMangledFunc("std.gpu", "kernel");
31+
const std::string GPU_KERNEL_ATTR = ast::getMangledFunc("std.internal.gpu", "kernel");
3232

3333
const std::string MAIN_UNCLASH = ".main.unclash";
3434
const std::string MAIN_CTOR = ".main.ctor";

codon/cir/module.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -318,8 +318,7 @@ types::Type *Module::getIntNType(unsigned int len, bool sign) {
318318
}
319319

320320
types::Type *Module::getVectorType(unsigned count, types::Type *base) {
321-
return getOrRealizeType(ast::getMangledClass("std.experimental.simd", "Vec"),
322-
{base, count});
321+
return getOrRealizeType(ast::getMangledClass("std.simd", "Vec"), {base, count});
323322
}
324323

325324
types::Type *Module::getTupleType(std::vector<types::Type *> args) {

codon/cir/transform/numpy/indexing.cpp

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ namespace ir {
1313
namespace transform {
1414
namespace numpy {
1515
namespace {
16+
const std::string FUSION_MODULE = "std.numpy.fusion";
1617

1718
struct Term {
1819
enum Kind { INT, VAR, LEN } kind;
@@ -255,21 +256,24 @@ struct FindArrayIndex : public util::Operator {
255256

256257
void elideBoundsCheck(IndexInfo &index) {
257258
auto *M = index.orig->getModule();
258-
auto *data = M->Nr<ExtractInstr>(M->Nr<VarValue>(index.arr->getVar()), "_data");
259259
util::CloneVisitor cv(M);
260260

261261
if (index.item) {
262-
auto *setitem = M->getOrRealizeMethod(
263-
data->getType(), Module::SETITEM_MAGIC_NAME,
264-
{data->getType(), M->getIntType(), index.item->getType()});
265-
seqassertn(setitem, "setitem method not found");
262+
auto *setitem = M->getOrRealizeFunc(
263+
"_array1d_set_nocheck",
264+
{index.arr->getType(), M->getIntType(), index.item->getType()}, {},
265+
FUSION_MODULE);
266+
seqassertn(setitem, "setitem function not found");
266267
index.orig->replaceAll(
267-
util::call(setitem, {data, cv.clone(index.idx), cv.clone(index.item)}));
268+
util::call(setitem, {M->Nr<VarValue>(index.arr->getVar()), cv.clone(index.idx),
269+
cv.clone(index.item)}));
268270
} else {
269-
auto *getitem = M->getOrRealizeMethod(data->getType(), Module::GETITEM_MAGIC_NAME,
270-
{data->getType(), M->getIntType()});
271-
seqassertn(getitem, "getitem method not found");
272-
index.orig->replaceAll(util::call(getitem, {data, cv.clone(index.idx)}));
271+
auto *getitem =
272+
M->getOrRealizeFunc("_array1d_get_nocheck",
273+
{index.arr->getType(), M->getIntType()}, {}, FUSION_MODULE);
274+
seqassertn(getitem, "getitem function not found");
275+
index.orig->replaceAll(util::call(
276+
getitem, {M->Nr<VarValue>(index.arr->getVar()), cv.clone(index.idx)}));
273277
}
274278
}
275279

codon/cir/transform/parallel/openmp.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ namespace transform {
1818
namespace parallel {
1919
namespace {
2020
const std::string ompModule = "std.openmp";
21-
const std::string gpuModule = "std.gpu";
21+
const std::string gpuModule = "std.internal.gpu";
2222
const std::string builtinModule = "std.internal.builtin";
2323

2424
void warn(const std::string &msg, const Value *v) {
@@ -1560,7 +1560,7 @@ void OpenMPPass::handle(ImperativeForFlow *v) {
15601560

15611561
if (sched->gpu) {
15621562
std::unordered_set<id_t> kernels;
1563-
const std::string gpuAttr = ast::getMangledFunc("std.gpu", "kernel");
1563+
const std::string gpuAttr = ast::getMangledFunc("std.internal.gpu", "kernel");
15641564
for (auto *var : *M) {
15651565
if (auto *func = cast<BodiedFunc>(var)) {
15661566
if (util::hasAttribute(func, gpuAttr)) {

codon/cir/var.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ namespace ir {
99

1010
const char Var::NodeId = 0;
1111

12+
Var::Var(types::Type *type, bool global, bool external, std::string name)
13+
: ReplaceableNodeBase(std::move(name)), type(type), global(global),
14+
external(external) {}
15+
1216
int Var::doReplaceUsedType(const std::string &name, types::Type *newType) {
1317
if (type->getName() == name) {
1418
type = newType;

codon/cir/var.h

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,7 @@ class Var : public ReplaceableNodeBase<Var>, public IdMixin {
3838
/// @param external true if the variable is external
3939
/// @param name the variable's name
4040
explicit Var(types::Type *type, bool global = false, bool external = false,
41-
std::string name = "")
42-
: ReplaceableNodeBase(std::move(name)), type(type), global(global),
43-
external(external) {}
44-
41+
std::string name = "");
4542
virtual ~Var() noexcept = default;
4643

4744
std::vector<Value *> getUsedValues() final { return getActual()->doGetUsedValues(); }

0 commit comments

Comments (0)