diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -561,6 +561,17 @@
   // instruction, so we say that ctlz is cheap to speculate.
   bool isCheapToSpeculateCtlz() const override { return true; }
 
+  AtomicExpansionKind shouldCastAtomicLoadInIR(LoadInst *LI) const override {
+    return AtomicExpansionKind::None;
+  }
+
+  AtomicExpansionKind shouldCastAtomicStoreInIR(StoreInst *SI) const override {
+    return AtomicExpansionKind::None;
+  }
+
+  AtomicExpansionKind
+  shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+
 private:
   const NVPTXSubtarget &STI; // cache the subtarget here
   SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5125,6 +5125,61 @@
   }
 }
 
+NVPTXTargetLowering::AtomicExpansionKind
+NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+  Type *Ty = AI->getValOperand()->getType();
+
+  if (AI->isFloatingPointOperation()) {
+    if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
+      if (Ty->isFloatTy())
+        return AtomicExpansionKind::None;
+      if (Ty->isDoubleTy() && STI.hasAtomAddF64())
+        return AtomicExpansionKind::None;
+    }
+    return AtomicExpansionKind::CmpXChg;
+  }
+
+  assert(Ty->isIntegerTy() && "Ty should be integer at this point");
+  auto ITy = cast<IntegerType>(Ty);
+
+  switch (AI->getOperation()) {
+  default:
+    return AtomicExpansionKind::CmpXChg;
+  case AtomicRMWInst::BinOp::And:
+  case AtomicRMWInst::BinOp::Or:
+  case AtomicRMWInst::BinOp::Xor:
+  case AtomicRMWInst::BinOp::Xchg:
+    switch (ITy->getBitWidth()) {
+    case 32:
+      return AtomicExpansionKind::None;
+    case 64:
+      if (STI.hasAtomBitwise64())
+        return AtomicExpansionKind::None;
+      return AtomicExpansionKind::CmpXChg;
+    default:
+      llvm_unreachable("unsupported width encountered");
+    }
+  case AtomicRMWInst::BinOp::Add:
+  case AtomicRMWInst::BinOp::Sub:
+  case AtomicRMWInst::BinOp::Max:
+  case AtomicRMWInst::BinOp::Min:
+  case AtomicRMWInst::BinOp::UMax:
+  case AtomicRMWInst::BinOp::UMin:
+    switch (ITy->getBitWidth()) {
+    case 32:
+      return AtomicExpansionKind::None;
+    case 64:
+      if (STI.hasAtomMinMax64())
+        return AtomicExpansionKind::None;
+      return AtomicExpansionKind::CmpXChg;
+    default:
+      llvm_unreachable("unsupported width encountered");
+    }
+  }
+
+  return AtomicExpansionKind::CmpXChg;
+}
+
 // Pin NVPTXTargetObjectFile's vtables to this file.
 NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -330,6 +330,8 @@
     addStraightLineScalarOptimizationPasses();
   }
 
+  addPass(createAtomicExpandPass());
+
   // === LSR and other generic IR passes ===
   TargetPassConfig::addIRPasses();
   // EarlyCSE is not always strong enough to clean up what LSR produces. For
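For reviewers, a quick illustration of what the hook above controls (not part of the patch): returning AtomicExpansionKind::None leaves the atomicrmw for the NVPTX selector to lower directly, while CmpXChg makes the generic AtomicExpand pass (now scheduled in addIRPasses) rewrite the operation into a compare-and-swap retry loop before instruction selection. Below is a minimal sketch of that rewrite for a 64-bit atomic AND on a subtarget without hasAtomBitwise64() (e.g. sm_30); the value names, block labels, and exact loop shape are illustrative rather than the pass's literal output.

; Input IR: sm_30 cannot lower this natively, so the hook returns CmpXChg.
define i64 @and_i64(ptr %p, i64 %v) {
  %old = atomicrmw and ptr %p, i64 %v monotonic, align 8
  ret i64 %old
}

; Approximately what AtomicExpand leaves behind: a cmpxchg retry loop,
; which the backend then lowers with atom.cas.b64.
define i64 @and_i64_expanded(ptr %p, i64 %v) {
entry:
  %init = load i64, ptr %p, align 8
  br label %loop

loop:
  %cur = phi i64 [ %init, %entry ], [ %old, %loop ]
  %new = and i64 %cur, %v
  %pair = cmpxchg ptr %p, i64 %cur, i64 %new monotonic monotonic, align 8
  %old = extractvalue { i64, i1 } %pair, 0
  %ok = extractvalue { i64, i1 } %pair, 1
  br i1 %ok, label %done, label %loop

done:
  ret i64 %old
}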
diff --git a/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll b/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll
@@ -0,0 +1,127 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefixes=ALL,SM30
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 | FileCheck %s --check-prefixes=ALL,SM60
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 | %ptxas-verify %}
+
+; CHECK-LABEL: fadd_double
+define void @fadd_double(ptr %0, double %1) {
+entry:
+  ; SM30: atom.cas.b64
+  ; SM60: atom.add.f64
+  %2 = atomicrmw fadd ptr %0, double %1 monotonic, align 8
+  ret void
+}
+
+; CHECK-LABEL: fadd_float
+define void @fadd_float(ptr %0, float %1) {
+entry:
+  ; ALL: atom.add.f32
+  %2 = atomicrmw fadd ptr %0, float %1 monotonic, align 4
+  ret void
+}
+
+; CHECK-LABEL: bitwise_i32
+define void @bitwise_i32(ptr %0, i32 %1) {
+entry:
+  ; ALL: atom.and.b32
+  %2 = atomicrmw and ptr %0, i32 %1 monotonic, align 4
+  ; ALL: atom.or.b32
+  %3 = atomicrmw or ptr %0, i32 %1 monotonic, align 4
+  ; ALL: atom.xor.b32
+  %4 = atomicrmw xor ptr %0, i32 %1 monotonic, align 4
+  ; ALL: atom.exch.b32
+  %5 = atomicrmw xchg ptr %0, i32 %1 monotonic, align 4
+  ret void
+}
+
+; CHECK-LABEL: bitwise_i64
+define void @bitwise_i64(ptr %0, i64 %1) {
+entry:
+  ; SM30: atom.cas.b64
+  ; SM60: atom.and.b64
+  %2 = atomicrmw and ptr %0, i64 %1 monotonic, align 8
+  ; SM30: atom.cas.b64
+  ; SM60: atom.or.b64
+  %3 = atomicrmw or ptr %0, i64 %1 monotonic, align 8
+  ; SM30: atom.cas.b64
+  ; SM60: atom.xor.b64
+  %4 = atomicrmw xor ptr %0, i64 %1 monotonic, align 8
+  ; SM30: atom.cas.b64
+  ; SM60: atom.exch.b64
+  %5 = atomicrmw xchg ptr %0, i64 %1 monotonic, align 8
+  ret void
+}
+
+; CHECK-LABEL: minmax_i32
+define void @minmax_i32(ptr %0, i32 %1) {
+entry:
+  ; ALL: atom.min.s32
+  %2 = atomicrmw min ptr %0, i32 %1 monotonic, align 4
+  ; ALL: atom.max.s32
+  %3 = atomicrmw max ptr %0, i32 %1 monotonic, align 4
+  ; ALL: atom.min.u32
+  %4 = atomicrmw umin ptr %0, i32 %1 monotonic, align 4
+  ; ALL: atom.max.u32
+  %5 = atomicrmw umax ptr %0, i32 %1 monotonic, align 4
+  ret void
+}
+
+; CHECK-LABEL: minmax_i64
+define void @minmax_i64(ptr %0, i64 %1) {
+entry:
+  ; SM30: atom.cas.b64
+  ; SM60: atom.min.s64
+  %2 = atomicrmw min ptr %0, i64 %1 monotonic, align 8
+  ; SM30: atom.cas.b64
+  ; SM60: atom.max.s64
+  %3 = atomicrmw max ptr %0, i64 %1 monotonic, align 8
+  ; SM30: atom.cas.b64
+  ; SM60: atom.min.u64
+  %4 = atomicrmw umin ptr %0, i64 %1 monotonic, align 8
+  ; SM30: atom.cas.b64
+  ; SM60: atom.max.u64
+  %5 = atomicrmw umax ptr %0, i64 %1 monotonic, align 8
+  ret void
+}
+
+; TODO: We may also want to test i8 and i16, but the backend does not support
+; them yet. AtomicExpand can only expand to a cmpxchg of the same bit width, so
+; even after expansion the backend still cannot lower these operations (a
+; sketch of the expansion follows the patch). The tests are kept below,
+; commented out; enable them once atomic expand or the backend supports them.
+
+; define void @bitwise_i8(ptr %0, i8 %1) {
+; entry:
+;   %2 = atomicrmw and ptr %0, i8 %1 monotonic, align 1
+;   %3 = atomicrmw or ptr %0, i8 %1 monotonic, align 1
+;   %4 = atomicrmw xor ptr %0, i8 %1 monotonic, align 1
+;   %5 = atomicrmw xchg ptr %0, i8 %1 monotonic, align 1
+;   ret void
+; }
+
+; define void @minmax_i8(ptr %0, i8 %1) {
+; entry:
+;   %2 = atomicrmw min ptr %0, i8 %1 monotonic, align 1
+;   %3 = atomicrmw max ptr %0, i8 %1 monotonic, align 1
+;   %4 = atomicrmw umin ptr %0, i8 %1 monotonic, align 1
+;   %5 = atomicrmw umax ptr %0, i8 %1 monotonic, align 1
+;   ret void
+; }
+
+; define void @bitwise_i16(ptr %0, i16 %1) {
+; entry:
+;   %2 = atomicrmw and ptr %0, i16 %1 monotonic, align 2
+;   %3 = atomicrmw or ptr %0, i16 %1 monotonic, align 2
+;   %4 = atomicrmw xor ptr %0, i16 %1 monotonic, align 2
+;   %5 = atomicrmw xchg ptr %0, i16 %1 monotonic, align 2
+;   ret void
+; }
+
+; define void @minmax_i16(ptr %0, i16 %1) {
+; entry:
+;   %2 = atomicrmw min ptr %0, i16 %1 monotonic, align 2
+;   %3 = atomicrmw max ptr %0, i16 %1 monotonic, align 2
+;   %4 = atomicrmw umin ptr %0, i16 %1 monotonic, align 2
+;   %5 = atomicrmw umax ptr %0, i16 %1 monotonic, align 2
+;   ret void
+; }
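A closing note on the SM30 expectations (again, not part of the patch): the atom.cas.b64 lines in the test come from the same expansion, and for the floating-point case the compare-and-swap is performed on the 64-bit integer image of the value. The sketch below shows roughly what fadd_double becomes on sm_30 before instruction selection; names and the precise bitcast placement are illustrative, not the pass's literal output. As the TODO above notes, the same trick is unavailable for i8/i16, because the expansion needs a cmpxchg of the operation's own width, which the backend cannot lower either.

; Illustrative only: approximate shape of the expansion of
;   atomicrmw fadd ptr %p, double %v monotonic
; on a subtarget without atom.add.f64 (e.g. sm_30).
define void @fadd_double_expanded(ptr %p, double %v) {
entry:
  %init = load double, ptr %p, align 8
  br label %loop

loop:
  %cur = phi double [ %init, %entry ], [ %old.fp, %loop ]
  %sum = fadd double %cur, %v
  ; The compare-and-swap operates on the i64 bit pattern, which is why the
  ; SM30 checks expect atom.cas.b64.
  %cur.i = bitcast double %cur to i64
  %sum.i = bitcast double %sum to i64
  %pair = cmpxchg ptr %p, i64 %cur.i, i64 %sum.i monotonic monotonic, align 8
  %old.i = extractvalue { i64, i1 } %pair, 0
  %ok = extractvalue { i64, i1 } %pair, 1
  %old.fp = bitcast i64 %old.i to double
  br i1 %ok, label %done, label %loop

done:
  ret void
}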