Index: llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -20,9 +20,9 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
-#include "llvm/IR/IRBuilder.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/KnownBits.h"
@@ -1068,8 +1068,26 @@
   FMF.setFast();
   Builder.setFastMathFlags(FMF);
 
-  if (divHasSpecialOptimization(I, X, Y))
+  if (divHasSpecialOptimization(I, X, Y)) {
+    // Special case for wave ID expression. The divide result is known wave
+    // uniform, but the source numerator is not. Hack in a readfirstlane to
+    // inform codegen of this.
+    //
+    // FIXME: Ideally the codegen divergence analysis would recognize the same
+    // pattern.
+    if (DA->isUniform(&I) && !DA->isUniform(I.getOperand(0))) {
+      Function *Readfirstlane =
+          Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_readfirstlane);
+
+      // Clone the operation to simplify the value replacement.
+      Value *NewOp = Builder.CreateBinOp(I.getOpcode(), X, Y, I.getName());
+      cast<Instruction>(NewOp)->copyIRFlags(&I);
+
+      return Builder.CreateCall(Readfirstlane, {NewOp});
+    }
+
     return nullptr; // Keep it for later optimization.
+  }
 
   bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
   bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -25,6 +25,7 @@
 #include "llvm/Support/KnownBits.h"
 
 using namespace llvm;
+using namespace PatternMatch;
 
 #define DEBUG_TYPE "AMDGPUtti"
 
@@ -898,6 +899,26 @@
   return false;
 }
 
+/// Recognize a load from the offsets where workgroup sizes are stored in the
+/// dispatch packet.
+static bool isLoadFromGroupSize(const DataLayout &DL, Value *V,
+                                unsigned GroupIdx) {
+  Value *ZextSrc = nullptr;
+  if (!match(V, m_ZExt(m_Value(ZextSrc))) ||
+      !ZextSrc->getType()->isIntegerTy(16))
+    return false;
+
+  LoadInst *LI = dyn_cast<LoadInst>(ZextSrc);
+  if (!LI)
+    return false;
+
+  int64_t Offset = 0;
+  Value *PtrBase =
+      GetPointerBaseWithConstantOffset(LI->getPointerOperand(), Offset, DL);
+  return match(PtrBase, m_Intrinsic<Intrinsic::amdgcn_dispatch_ptr>()) &&
+         Offset == 4 + 2 * GroupIdx;
+}
+
 bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
     switch (Intrinsic->getIntrinsicID()) {
@@ -919,6 +940,51 @@
     return false;
   }
 
+  if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
+    switch (Inst->getOpcode()) {
+    case Instruction::SDiv:
+    case Instruction::SRem:
+    case Instruction::UDiv:
+    case Instruction::URem: {
+      // Special case for the pattern used for wave indexing calculations:
+      //   (blockIdx * blockDim + threadId) / wavesize
+      //   (blockIdx * blockDim + threadId) % wavesize
+
+      Instruction *Num;
+      if (match(Inst, m_BinOp(m_Instruction(Num),
+                              m_SpecificInt(ST->getWavefrontSize())))) {
+        const DataLayout &DL = Inst->getModule()->getDataLayout();
+        for (unsigned Idx = 0; Idx < 3; ++Idx) {
+          auto ItemIDIntrin =
+              Idx == 0
+                  ? m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>()
+                  : (Idx == 1 ? m_Intrinsic<Intrinsic::amdgcn_workitem_id_y>()
+                              : m_Intrinsic<Intrinsic::amdgcn_workitem_id_z>());
+
+          auto GroupIDIntrin =
+              Idx == 0
+                  ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
+                  : (Idx == 1
+                         ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
+                         : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
+
+          Value *MulLHS;
+          // Match the expression with the uniform-work-group-size optimization
+          // applied.
+          if (match(Num, m_Add(m_Mul(m_Value(MulLHS), GroupIDIntrin),
+                               ItemIDIntrin)) &&
+              isLoadFromGroupSize(DL, MulLHS, Idx))
+            return true;
+        }
+      }
+
+      break;
+    }
+    default:
+      break;
+    }
+  }
+
   const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
   if (!ExtValue)
     return false;
Index: llvm/test/Analysis/DivergenceAnalysis/AMDGPU/wave-id-computation.ll
===================================================================
--- /dev/null
+++ llvm/test/Analysis/DivergenceAnalysis/AMDGPU/wave-id-computation.ll
@@ -0,0 +1,387 @@
+; RUN: opt -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1031 -passes='amdgpu-lower-kernel-attributes,dce,print<divergence>' -disable-output %s 2>&1 | FileCheck -match-full-lines %s
+
+; Specially recognize workitem ID division by wavesize as uniform. Issue 54010.
+
+; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_wid_x':
+; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2
+; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13
+; CHECK-NEXT: %i15 = sdiv exact i32 %i14, 32
+; CHECK-NEXT: ret i32 %i15
+define hidden i32 @calculate_wid_x() #0 {
+bb:
+  %i = tail call i32 @llvm.amdgcn.workgroup.id.x()
+  %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+  %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4
+  %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)*
+  %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1
+  %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 12
+  %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)*
+  %i7 = load i32, i32 addrspace(4)* %i6, align 4
+  %i8 = zext i16 %i4 to i32
+  %i9 = mul i32 %i, %i8
+  %i10 = sub i32 %i7, %i9
+  %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8)
+  %i12 = mul i32 %i11, %i
+  %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2
+  %i14 = add i32 %i12, %i13
+  %i15 = sdiv exact i32 %i14, 32
+  ret i32 %i15
+}
+
+; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_lid_x':
+; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2
+; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13
+; CHECK-NEXT: %i15 = srem i32 %i14, 32
+; CHECK-NEXT: ret i32 %i15
+define hidden i32 @calculate_lid_x() #0 {
+bb:
+  %i = tail call i32 @llvm.amdgcn.workgroup.id.x()
+  %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+  %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4
+  %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)*
+  %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1
+  %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 12
+  %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)*
+  %i7 = load i32, i32 addrspace(4)* %i6, align 4
+  %i8 = zext i16 %i4 to i32
+  %i9 = mul i32 %i, %i8
+  %i10 = sub i32 %i7, %i9
+  %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8)
+  %i12 = mul i32 %i11, %i
+  %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2
+  %i14 = add i32 %i12, %i13
+  %i15 = srem i32 %i14, 32
+  ret i32 %i15
+}
+
+; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_wid_y':
+; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.y(), !range !2
+; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13
+; CHECK-NEXT: %i15 = sdiv i32 %i14, 32
+; CHECK-NEXT: ret i32 %i15
+define hidden i32 @calculate_wid_y() #0 {
+bb:
+  %i = tail call i32 @llvm.amdgcn.workgroup.id.y()
+  %i1 = tail call 
align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 6 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 2, !range !0, !invariant.load !1 + %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 16 + %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)* + %i7 = load i32, i32 addrspace(4)* %i6, align 8 + %i8 = zext i16 %i4 to i32 + %i9 = mul i32 %i, %i8 + %i10 = sub i32 %i7, %i9 + %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8) + %i12 = mul i32 %i11, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.y(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = sdiv i32 %i14, 32 + ret i32 %i15 +} + +; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_lid_y': +; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.y(), !range !2 +; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13 +; CHECK-NEXT: %i15 = srem i32 %i14, 32 +; CHECK-NEXT: ret i32 %i15 +define hidden i32 @calculate_lid_y() #0 { +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.y() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 6 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 2, !range !0, !invariant.load !1 + %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 16 + %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)* + %i7 = load i32, i32 addrspace(4)* %i6, align 8 + %i8 = zext i16 %i4 to i32 + %i9 = mul i32 %i, %i8 + %i10 = sub i32 %i7, %i9 + %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8) + %i12 = mul i32 %i11, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.y(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = srem i32 %i14, 32 + ret i32 %i15 +} + +; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_wid_z': +; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.z(), !range !2 +; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13 +; CHECK-NEXT: %i15 = sdiv i32 %i14, 32 +; CHECK-NEXT: ret i32 %i15 +define hidden i32 @calculate_wid_z() #0 { +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.z() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 8 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 20 + %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)* + %i7 = load i32, i32 addrspace(4)* %i6, align 4 + %i8 = zext i16 %i4 to i32 + %i9 = mul i32 %i, %i8 + %i10 = sub i32 %i7, %i9 + %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8) + %i12 = mul i32 %i11, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.z(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = sdiv i32 %i14, 32 + ret i32 %i15 +} + +; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_lid_z': +; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.z(), !range !2 +; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13 +; CHECK-NEXT: %i15 = srem i32 %i14, 32 +; CHECK-NEXT: ret i32 %i15 +define hidden i32 @calculate_lid_z() #0 { +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.z() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 8 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 
addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 20 + %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)* + %i7 = load i32, i32 addrspace(4)* %i6, align 4 + %i8 = zext i16 %i4 to i32 + %i9 = mul i32 %i, %i8 + %i10 = sub i32 %i7, %i9 + %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8) + %i12 = mul i32 %i11, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.z(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = srem i32 %i14, 32 + ret i32 %i15 +} + +; FIXME: Should be recognized as uniform +; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_wid_x_commute_mul': +; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 +; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13 +; CHECK-NEXT: DIVERGENT: %i15 = sdiv i32 %i14, 32 +; CHECK-NEXT: DIVERGENT: ret i32 %i15 +define hidden i32 @calculate_wid_x_commute_mul() #0 { +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 12 + %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)* + %i7 = load i32, i32 addrspace(4)* %i6, align 4 + %i8 = zext i16 %i4 to i32 + %i9 = mul i32 %i8, %i + %i10 = sub i32 %i7, %i9 + %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8) + %i12 = mul i32 %i11, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = sdiv i32 %i14, 32 + ret i32 %i15 +} + +; TODO: Could handle this case +; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_wid_x_nonuniform_workgroup_size': +; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 +; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13 +; CHECK-NEXT: DIVERGENT: %i15 = sdiv i32 %i14, 32 +; CHECK-NEXT: DIVERGENT: ret i32 %i15 +define hidden i32 @calculate_wid_x_nonuniform_workgroup_size() #3 { +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 12 + %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)* + %i7 = load i32, i32 addrspace(4)* %i6, align 4 + %i8 = zext i16 %i4 to i32 + %i9 = mul i32 %i, %i8 + %i10 = sub i32 %i7, %i9 + %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8) + %i12 = mul i32 %i11, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = sdiv i32 %i14, 32 + ret i32 %i15 +} + +; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_wid_x_wrong_offset': +; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 +; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13 +; CHECK-NEXT: DIVERGENT: %i15 = sdiv i32 %i14, 32 +; CHECK-NEXT: DIVERGENT: ret i32 %i15 +define hidden i32 @calculate_wid_x_wrong_offset() #0 { +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 
addrspace(4)* %i1, i64 6 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 12 + %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)* + %i7 = load i32, i32 addrspace(4)* %i6, align 4 + %i8 = zext i16 %i4 to i32 + %i9 = mul i32 %i, %i8 + %i10 = sub i32 %i7, %i9 + %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8) + %i12 = mul i32 %i11, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = sdiv i32 %i14, 32 + ret i32 %i15 +} + +; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_wid_x_udiv': +; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 +; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13 +; CHECK-NEXT: %i15 = udiv i32 %i14, 32 +; CHECK-NEXT: ret i32 %i15 +define hidden i32 @calculate_wid_x_udiv() #0 { +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 12 + %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)* + %i7 = load i32, i32 addrspace(4)* %i6, align 4 + %i8 = zext i16 %i4 to i32 + %i9 = mul i32 %i, %i8 + %i10 = sub i32 %i7, %i9 + %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8) + %i12 = mul i32 %i11, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = udiv i32 %i14, 32 + ret i32 %i15 +} + +; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_lid_x_urem': +; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 +; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13 +; CHECK-NEXT: %i15 = urem i32 %i14, 32 +; CHECK-NEXT: ret i32 %i15 +define hidden i32 @calculate_lid_x_urem() #0 { +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 12 + %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)* + %i7 = load i32, i32 addrspace(4)* %i6, align 4 + %i8 = zext i16 %i4 to i32 + %i9 = mul i32 %i, %i8 + %i10 = sub i32 %i7, %i9 + %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8) + %i12 = mul i32 %i11, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = urem i32 %i14, 32 + ret i32 %i15 +} + +; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_wid_x_wrong_wave_size': +; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 +; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13 +; CHECK-NEXT: DIVERGENT: %i15 = sdiv i32 %i14, 64 +; CHECK-NEXT: DIVERGENT: ret i32 %i15 +define hidden i32 @calculate_wid_x_wrong_wave_size() #0 { +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 
addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 12 + %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)* + %i7 = load i32, i32 addrspace(4)* %i6, align 4 + %i8 = zext i16 %i4 to i32 + %i9 = mul i32 %i, %i8 + %i10 = sub i32 %i7, %i9 + %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8) + %i12 = mul i32 %i11, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = sdiv i32 %i14, 64 + ret i32 %i15 +} + +; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_wid_x_wrong_group_intrinsic': +; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 +; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13 +; CHECK-NEXT: DIVERGENT: %i15 = sdiv i32 %i14, 32 +; CHECK-NEXT: DIVERGENT: ret i32 %i15 +define hidden i32 @calculate_wid_x_wrong_group_intrinsic() #0 { +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.y() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 12 + %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)* + %i7 = load i32, i32 addrspace(4)* %i6, align 4 + %i8 = zext i16 %i4 to i32 + %i9 = mul i32 %i, %i8 + %i10 = sub i32 %i7, %i9 + %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8) + %i12 = mul i32 %i11, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = sdiv i32 %i14, 32 + ret i32 %i15 +} + +; CHECK-LABEL: 'Divergence Analysis' for function 'calculate_wid_x_not_mul': +; CHECK: DIVERGENT: %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 +; CHECK-NEXT: DIVERGENT: %i14 = add i32 %i12, %i13 +; CHECK-NEXT: DIVERGENT: %i15 = sdiv i32 %i14, 32 +; CHECK-NEXT: DIVERGENT: ret i32 %i15 +define hidden i32 @calculate_wid_x_not_mul() #0 { +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i1, i64 12 + %i6 = bitcast i8 addrspace(4)* %i5 to i32 addrspace(4)* + %i7 = load i32, i32 addrspace(4)* %i6, align 4 + %i8 = zext i16 %i4 to i32 + %i9 = mul i32 %i, %i8 + %i10 = sub i32 %i7, %i9 + %i11 = tail call i32 @llvm.umin.i32(i32 %i10, i32 %i8) + %i12 = shl i32 %i11, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = sdiv i32 %i14, 32 + ret i32 %i15 +} + +declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1 +declare i32 @llvm.amdgcn.workgroup.id.x() #1 +declare i32 @llvm.amdgcn.workgroup.id.y() #1 +declare i32 @llvm.amdgcn.workgroup.id.z() #1 +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare i32 @llvm.amdgcn.workitem.id.y() #1 +declare i32 @llvm.amdgcn.workitem.id.z() #1 +declare i32 @llvm.umin.i32(i32, i32) #2 + +attributes #0 = { mustprogress nofree noinline nosync nounwind readnone willreturn "uniform-work-group-size"="true" } +attributes #1 = { nounwind readnone speculatable willreturn } +attributes #2 = { nocallback nofree nosync nounwind readnone speculatable willreturn } +attributes #3 = { 
mustprogress nofree noinline nosync nounwind readnone willreturn "uniform-work-group-size"="false" } + +!0 = !{i16 1, i16 1025} +!1 = !{} +!2 = !{i32 0, i32 1024} Index: llvm/test/CodeGen/AMDGPU/wave-id-computation.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/wave-id-computation.ll @@ -0,0 +1,246 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s + +; Specially recognize workitem ID division by wavesize as uniform. Issue 54010. + +; With the sdiv recognized as uniform, the load from %gep.in should be turned into an s_load_dword. +define amdgpu_kernel void @calculate_wid_x(i32 addrspace(1)* noalias %out.ptr, i32 addrspace(1)* noalias %in.ptr) #0 { +; CHECK-LABEL: calculate_wid_x: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_dword s9, s[4:5], 0x4 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_and_b32 s4, s9, 0xffff +; CHECK-NEXT: s_mul_i32 s4, s4, s8 +; CHECK-NEXT: v_add_u32_e32 v0, s4, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CHECK-NEXT: v_lshrrev_b32_e32 v1, 26, v1 +; CHECK-NEXT: v_add_u32_e32 v0, v0, v1 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 6, v0 +; CHECK-NEXT: v_readfirstlane_b32 s4, v0 +; CHECK-NEXT: s_ashr_i32 s5, s4, 31 +; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; CHECK-NEXT: s_add_u32 s2, s2, s4 +; CHECK-NEXT: s_addc_u32 s3, s3, s5 +; CHECK-NEXT: s_load_dword s2, s[2:3], 0x0 +; CHECK-NEXT: s_add_u32 s0, s0, s4 +; CHECK-NEXT: s_addc_u32 s1, s1, s5 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: global_store_dword v0, v1, s[0:1] +; CHECK-NEXT: s_endpgm +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i8 = zext i16 %i4 to i32 + %i12 = mul i32 %i8, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = sdiv i32 %i14, 64 + %gep.in = getelementptr i32, i32 addrspace(1)* %in.ptr, i32 %i15 + %load = load i32, i32 addrspace(1)* %gep.in + %gep.out = getelementptr i32, i32 addrspace(1)* %out.ptr, i32 %i15 + store i32 %load, i32 addrspace(1)* %gep.out + ret void +} + +define amdgpu_kernel void @calculate_wid_x_vgpr_user(i32 addrspace(1)* noalias %out.ptr) #0 { +; CHECK-LABEL: calculate_wid_x_vgpr_user: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_dword s2, s[4:5], 0x4 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_and_b32 s2, s2, 0xffff +; CHECK-NEXT: s_mul_i32 s2, s2, s8 +; CHECK-NEXT: v_add_u32_e32 v0, s2, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CHECK-NEXT: v_lshrrev_b32_e32 v1, 26, v1 +; CHECK-NEXT: v_add_u32_e32 v0, v0, v1 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 6, v0 +; CHECK-NEXT: v_readfirstlane_b32 s2, v0 +; CHECK-NEXT: s_ashr_i32 s3, s2, 31 +; CHECK-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; CHECK-NEXT: s_add_u32 s0, s0, s2 +; CHECK-NEXT: s_addc_u32 s1, s1, s3 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: global_store_dword v0, v0, s[0:1] +; CHECK-NEXT: s_endpgm +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i1 = tail 
call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i8 = zext i16 %i4 to i32 + %i12 = mul i32 %i8, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = sdiv i32 %i14, 64 + %gep.out = getelementptr i32, i32 addrspace(1)* %out.ptr, i32 %i15 + store i32 0, i32 addrspace(1)* %gep.out + ret void +} + +define hidden i32 @calculate_wid_x_commute() #0 { +; CHECK-LABEL: calculate_wid_x_commute: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x4 +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_and_b32 s4, s4, 0xffff +; CHECK-NEXT: s_mul_i32 s12, s12, s4 +; CHECK-NEXT: v_add_u32_e32 v0, s12, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CHECK-NEXT: v_lshrrev_b32_e32 v1, 26, v1 +; CHECK-NEXT: v_add_u32_e32 v0, v0, v1 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 6, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] +bb: + %i = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i8 = zext i16 %i4 to i32 + %i12 = mul i32 %i, %i8 + %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = sdiv i32 %i14, 64 + ret i32 %i15 +} + +@ptr_table = external hidden unnamed_addr addrspace(4) constant [0 x void(i32)*], align 8 + +; This does not require a waterfall loop if the sdiv is treated as uniform +define amdgpu_kernel void @calculate_wid_x_reqd_uniform_user(float addrspace(1)* noalias %out.ptr, i32 addrspace(1)* noalias %in.ptr) #0 { +; CHECK-LABEL: calculate_wid_x_reqd_uniform_user: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_mov_b32 s12, s14 +; CHECK-NEXT: s_load_dword s14, s[4:5], 0x4 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s17 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_and_b32 s14, s14, 0xffff +; CHECK-NEXT: s_mul_i32 s14, s14, s12 +; CHECK-NEXT: v_add_u32_e32 v3, s14, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; CHECK-NEXT: v_lshrrev_b32_e32 v4, 26, v4 +; CHECK-NEXT: v_add_u32_e32 v3, v3, v4 +; CHECK-NEXT: v_ashrrev_i32_e32 v3, 6, v3 +; CHECK-NEXT: v_readfirstlane_b32 s14, v3 +; CHECK-NEXT: s_ashr_i32 s15, s14, 31 +; CHECK-NEXT: s_lshl_b64 s[14:15], s[14:15], 3 +; CHECK-NEXT: s_getpc_b64 s[18:19] +; CHECK-NEXT: s_add_u32 s18, s18, ptr_table@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s19, s19, ptr_table@rel32@hi+12 +; CHECK-NEXT: s_add_u32 s14, s14, s18 +; CHECK-NEXT: s_addc_u32 s15, s15, s19 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 +; CHECK-NEXT: s_add_u32 s8, s8, 16 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_mov_b32 s14, s16 +; CHECK-NEXT: v_mov_b32_e32 v0, 0x7b +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] +; CHECK-NEXT: s_endpgm +bb: + 
%i = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i1 = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %i2 = getelementptr i8, i8 addrspace(4)* %i1, i64 4 + %i3 = bitcast i8 addrspace(4)* %i2 to i16 addrspace(4)* + %i4 = load i16, i16 addrspace(4)* %i3, align 4, !range !0, !invariant.load !1 + %i8 = zext i16 %i4 to i32 + %i12 = mul i32 %i8, %i + %i13 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 + %i14 = add i32 %i12, %i13 + %i15 = sdiv i32 %i14, 64 + %gep = getelementptr [0 x void(i32)*], [0 x void(i32)*] addrspace(4)* @ptr_table, i32 0, i32 %i15 + %fptr = load void(i32)*, void(i32)* addrspace(4)* %gep + call void %fptr(i32 123) + ret void +} + +define amdgpu_kernel void @compute_both(i32 addrspace(1)* noalias %out.ptr, i32 addrspace(1)* noalias %in.ptr0, i32 addrspace(1)* noalias %in.ptr1) #0 { +; CHECK-LABEL: compute_both: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s9, s[4:5], 0x4 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; CHECK-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x10 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_and_b32 s4, s9, 0xffff +; CHECK-NEXT: s_mul_i32 s8, s8, s4 +; CHECK-NEXT: v_add_u32_e32 v0, s8, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CHECK-NEXT: v_lshrrev_b32_e32 v1, 26, v1 +; CHECK-NEXT: v_add_u32_e32 v1, v0, v1 +; CHECK-NEXT: v_and_b32_e32 v2, 0xffffffc0, v1 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 6, v1 +; CHECK-NEXT: v_sub_u32_e32 v0, v0, v2 +; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; CHECK-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2] +; CHECK-NEXT: v_mov_b32_e32 v4, s1 +; CHECK-NEXT: v_add_co_u32_e32 v3, vcc, s0, v1 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CHECK-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; CHECK-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc +; CHECK-NEXT: v_mov_b32_e32 v2, s3 +; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; CHECK-NEXT: global_load_dword v2, v[3:4], off +; CHECK-NEXT: global_load_dword v5, v[0:1], off +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_add_u32_e32 v1, v2, v5 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_dword v0, v1, s[0:1] +; CHECK-NEXT: s_endpgm + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %gid.x = call i32 @llvm.amdgcn.workgroup.id.x() + %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4 + %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)* + %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4 + %group.size.x.ext = zext i16 %group.size.x to i32 + %mul = mul i32 %gid.x, %group.size.x.ext + %tid = add i32 %mul, %id.x + %wid = sdiv i32 %tid, 64 + %lid = srem i32 %tid, 64 + %gep0 = getelementptr i32, i32 addrspace(1)* %in.ptr0, i32 %wid + %gep1 = getelementptr i32, i32 addrspace(1)* %in.ptr1, i32 %lid + %load0 = load i32, i32 addrspace(1)* %gep0 + %load1 = load i32, i32 addrspace(1)* %gep1 + %add = add i32 %load0, %load1 + store i32 %add, i32 addrspace(1)* %out.ptr + ret void +} + +declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1 +declare i32 @llvm.amdgcn.workgroup.id.x() #1 +declare i32 @llvm.amdgcn.workgroup.id.y() #1 +declare i32 @llvm.amdgcn.workgroup.id.z() #1 +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare i32 @llvm.amdgcn.workitem.id.y() #1 +declare i32 @llvm.amdgcn.workitem.id.z() #1 +declare i32 
@llvm.umin.i32(i32, i32) #2 + +attributes #0 = { mustprogress nofree noinline nosync nounwind readnone willreturn "uniform-work-group-size"="true" } +attributes #1 = { nounwind readnone speculatable willreturn } +attributes #2 = { nocallback nofree nosync nounwind readnone speculatable willreturn } + +!0 = !{i16 1, i16 1025} +!1 = !{} +!2 = !{i32 0, i32 1024}
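
For reference, the source-level pattern these tests exercise matches the comment in isAlwaysUniform() above: (blockIdx * blockDim + threadId) / wavesize. Below is a minimal HIP-style sketch of such code; it is illustrative only and not part of the patch, and the kernel name and the use of warpSize are assumptions.

// Illustrative HIP kernel (not part of this patch; kernel name and warpSize
// usage are assumptions). The flat workitem id is
// blockIdx.x * blockDim.x + threadIdx.x; dividing it by the wave size gives
// the wave id, which is the same for every lane of a wave, while the
// remainder (the lane id) remains divergent.
#include <hip/hip_runtime.h>

__global__ void wave_id_example(int *__restrict__ out,
                                const int *__restrict__ in) {
  int flat_id = blockIdx.x * blockDim.x + threadIdx.x;
  int wave_id = flat_id / warpSize; // wave-uniform; eligible for scalar loads
  int lane_id = flat_id % warpSize; // divergent per lane
  out[flat_id] = in[wave_id] + lane_id;
}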