Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -14901,6 +14901,8 @@
     }
     LLVM_FALLTHROUGH;
   }
+  case AMDGPU::BI__builtin_amdgcn_readfirstlane:
+    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_readfirstlane);
   default:
     return nullptr;
   }
Index: clang/test/CodeGenOpenCL/builtins-amdgcn.cl
===================================================================
--- clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -291,8 +291,8 @@
   *out = __builtin_amdgcn_ds_bpermute(a, b);
 }
 
-// CHECK-LABEL: @test_readfirstlane
-// CHECK: call i32 @llvm.amdgcn.readfirstlane(i32 %a)
+// CHECK-LABEL: @test_readfirstlane(
+// CHECK: call i32 @llvm.amdgcn.readfirstlane.i32(i32 %a)
 void test_readfirstlane(global int* out, int a)
 {
   *out = __builtin_amdgcn_readfirstlane(a);
Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1409,8 +1409,7 @@
             [IntrNoMem, IntrConvergent, IntrWillReturn]>;
 
 def int_amdgcn_readfirstlane :
-  GCCBuiltin<"__builtin_amdgcn_readfirstlane">,
-  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
+  Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
             [IntrNoMem, IntrConvergent, IntrWillReturn]>;
 
 // The lane argument must be uniform across the currently active threads of the
Index: llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -47,11 +47,13 @@
   const GCNSubtarget *ST;
   bool IsPixelShader;
 
+  Function *Readfirstlane = nullptr;
+
   Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                    Value *const Identity) const;
   Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
   void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
-                      bool ValDivergent) const;
+                      bool ValDivergent);
 
 public:
   static char ID;
@@ -66,6 +68,16 @@
     AU.addRequired<TargetPassConfig>();
   }
 
+  Function *getReadfirstlane(Module *M) {
+    if (!Readfirstlane) {
+      Readfirstlane =
+          Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readfirstlane,
+                                    {Type::getInt32Ty(M->getContext())});
+    }
+
+    return Readfirstlane;
+  }
+
   void visitAtomicRMWInst(AtomicRMWInst &I);
   void visitIntrinsicInst(IntrinsicInst &I);
 };
@@ -99,6 +111,7 @@
     optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
   }
 
+  Readfirstlane = nullptr;
   ToReplace.clear();
 
   return Changed;
@@ -406,8 +419,7 @@
 
 void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
                                            AtomicRMWInst::BinOp Op,
-                                           unsigned ValIdx,
-                                           bool ValDivergent) const {
+                                           unsigned ValIdx, bool ValDivergent) {
   // Start building just before the instruction.
   IRBuilder<> B(&I);
 
@@ -588,23 +600,23 @@
   // lane) to all other lanes in the wavefront. We use an intrinsic for this,
   // but have to handle 64-bit broadcasts with two calls to this intrinsic.
   Value *BroadcastI = nullptr;
+  Module *M = B.GetInsertBlock()->getModule();
 
   if (TyBitWidth == 64) {
     Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
     Value *const ExtractHi =
         B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
-    CallInst *const ReadFirstLaneLo =
-        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
-    CallInst *const ReadFirstLaneHi =
-        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
+    Function *Readfirstlane = getReadfirstlane(M);
+
+    CallInst *const ReadFirstLaneLo = B.CreateCall(Readfirstlane, ExtractLo);
+    CallInst *const ReadFirstLaneHi = B.CreateCall(Readfirstlane, ExtractHi);
     Value *const PartialInsert = B.CreateInsertElement(
         UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
     Value *const Insert =
         B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
     BroadcastI = B.CreateBitCast(Insert, Ty);
   } else if (TyBitWidth == 32) {
-
-    BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
+    BroadcastI = B.CreateCall(getReadfirstlane(M), PHI);
   } else {
     llvm_unreachable("Unhandled atomic bit width");
   }
Index: llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -655,6 +655,17 @@
     if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
       return IC.replaceInstUsesWith(II, Src);
     }
+
+    // readfirstlane (bitcast x) -> bitcast (readfirstlane x)
+    Value *BitcastInput = nullptr;
+    if (match(Src,
+              PatternMatch::m_BitCast(PatternMatch::m_Value(BitcastInput)))) {
+      CallInst *NewCall =
+          IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane,
+                                     {BitcastInput->getType()}, BitcastInput);
+      Value *NewCast = IC.Builder.CreateBitCast(NewCall, II.getType());
+      return IC.replaceInstUsesWith(II, NewCast);
+    }
   } else {
     // readlane (readlane x, y), y -> readlane x, y
     if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4730,6 +4730,19 @@
       }
       return;
     }
+    case Intrinsic::amdgcn_readfirstlane: {
+      // Hack out illegal types.
+      EVT VT = N->getValueType(0);
+      if (VT.getSizeInBits() != 32)
+        return;
+
+      SDLoc DL(N);
+      SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, N->getOperand(1));
+      SDValue Readlane = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
+                                     N->getOperand(0), Cast);
+      Results.push_back(DAG.getNode(ISD::BITCAST, DL, VT, Readlane));
+      return;
+    }
     }
     break;
   }
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2222,10 +2222,18 @@
 // FIXME: Should also do this for readlane, but tablegen crashes on
 // the ignored src1.
 def : GCNPat<
-  (int_amdgcn_readfirstlane (i32 imm:$src)),
+  (i32 (int_amdgcn_readfirstlane (i32 imm:$src))),
   (S_MOV_B32 SReg_32:$src)
 >;
 
+// FIXME: Should have matcher for any size
+foreach vt = Reg32Types.types in {
+  def : GCNPat<
+    (vt (int_amdgcn_readfirstlane (vt VRegOrLds_32:$src))),
+    (V_READFIRSTLANE_B32 VRegOrLds_32:$src)
+  >;
+}
+
 multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
   def : GCNPat <
     (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.readfirstlane.mir
@@ -1,11 +1,11 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' %s -o - 2> %t | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel.*' %s -o - 2> %t | FileCheck -check-prefix=GCN %s
 # RUN: FileCheck -check-prefix=ERR %s < %t
 
-# ERR: remark: <unknown>:0:0: cannot select: %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0:sgpr(s32) (in function: readfirstlane_s)
+# ERR: remark: <unknown>:0:0: cannot select: %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0:sgpr(s32) (in function: readfirstlane_s32_s)
 
 ---
-name: readfirstlane_v
+name: readfirstlane_s32_v
 legalized: true
 regBankSelected: true
 tracksRegLiveness: true
@@ -13,7 +13,7 @@
 body: |
   bb.0:
     liveins: $vgpr0
-    ; GCN-LABEL: name: readfirstlane_v
+    ; GCN-LABEL: name: readfirstlane_s32_v
    ; GCN: liveins: $vgpr0
     ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GCN: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
@@ -24,7 +24,7 @@
 ...
 
 ---
-name: readfirstlane_v_imm
+name: readfirstlane_v_s32_imm
 legalized: true
 regBankSelected: true
 tracksRegLiveness: true
@@ -32,7 +32,7 @@
 
 body: |
   bb.0:
-    ; GCN-LABEL: name: readfirstlane_v_imm
+    ; GCN-LABEL: name: readfirstlane_v_s32_imm
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 123, implicit $exec
     ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_]]
     ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 [[COPY]]
@@ -42,9 +42,85 @@
     S_ENDPGM 0, implicit %1
 ...
 
+---
+name: readfirstlane_v2s16_v
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; GCN-LABEL: name: readfirstlane_v2s16_v
+    ; GCN: liveins: $vgpr0
+    ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GCN: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+    ; GCN: S_ENDPGM 0, implicit [[V_READFIRSTLANE_B32_]]
+    %0:vgpr(<2 x s16>) = COPY $vgpr0
+    %1:sgpr(<2 x s16>) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0
+    S_ENDPGM 0, implicit %1
+...
+
+---
+name: readfirstlane_p3_v
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; GCN-LABEL: name: readfirstlane_p3_v
+    ; GCN: liveins: $vgpr0
+    ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GCN: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+    ; GCN: S_ENDPGM 0, implicit [[V_READFIRSTLANE_B32_]]
+    %0:vgpr(p3) = COPY $vgpr0
+    %1:sgpr(p3) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0
+    S_ENDPGM 0, implicit %1
+...
+
+---
+name: readfirstlane_p5_v
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; GCN-LABEL: name: readfirstlane_p5_v
+    ; GCN: liveins: $vgpr0
+    ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GCN: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+    ; GCN: S_ENDPGM 0, implicit [[V_READFIRSTLANE_B32_]]
+    %0:vgpr(p5) = COPY $vgpr0
+    %1:sgpr(p5) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0
+    S_ENDPGM 0, implicit %1
+...
+
+---
+name: readfirstlane_p2_v
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; GCN-LABEL: name: readfirstlane_p2_v
+    ; GCN: liveins: $vgpr0
+    ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GCN: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+    ; GCN: S_ENDPGM 0, implicit [[V_READFIRSTLANE_B32_]]
+    %0:vgpr(p2) = COPY $vgpr0
+    %1:sgpr(p2) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0
+    S_ENDPGM 0, implicit %1
+...
+
 # Make sure this fails to select
 ---
-name: readfirstlane_s
+name: readfirstlane_s32_s
 legalized: true
 regBankSelected: true
 tracksRegLiveness: true
@@ -52,7 +128,7 @@
 body: |
   bb.0:
     liveins: $sgpr0
-    ; GCN-LABEL: name: readfirstlane_s
+    ; GCN-LABEL: name: readfirstlane_s32_s
     ; GCN: liveins: $sgpr0
     ; GCN: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
     ; GCN: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32)
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -1,11 +1,16 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
 
-declare i32 @llvm.amdgcn.readfirstlane(i32) #0
+declare i32 @llvm.amdgcn.readfirstlane.i32(i32) #0
+declare float @llvm.amdgcn.readfirstlane.f32(float) #0
+declare <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half>) #0
+declare <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16>) #0
+declare i8 addrspace(3)* @llvm.amdgcn.readfirstlane.p3i8(i8 addrspace(3)*) #0
 
-; CHECK-LABEL: {{^}}test_readfirstlane:
+; CHECK-LABEL: {{^}}test_readfirstlane_i32:
 ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
-define void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 {
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src)
+define void @test_readfirstlane_i32(i32 addrspace(1)* %out, i32 %src) #1 {
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %src)
   store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
   ret void
 }
@@ -15,7 +20,7 @@
 ; CHECK-NOT: [[SGPR_VAL]]
 ; CHECK: ; use [[SGPR_VAL]]
 define amdgpu_kernel void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 {
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32)
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 32)
   call void asm sideeffect "; use $0", "s"(i32 %readfirstlane)
   ret void
 }
@@ -25,7 +30,7 @@
 ; CHECK-NOT: [[VVAL]]
 ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
 define amdgpu_kernel void @test_readfirstlane_imm_fold(i32 addrspace(1)* %out) #1 {
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32)
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 32)
   store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
   ret void
 }
@@ -36,7 +41,7 @@
 ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
 define amdgpu_kernel void @test_readfirstlane_m0(i32 addrspace(1)* %out) #1 {
   %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %m0)
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %m0)
   store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
   ret void
 }
@@ -51,7 +56,7 @@
 ; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]]
 define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(i32 addrspace(1)* %out) #1 {
   %sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"()
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %sgpr)
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %sgpr)
   store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
   ret void
 }
@@ -62,10 +67,42 @@
 define amdgpu_kernel void @test_readfirstlane_fi(i32 addrspace(1)* %out) #1 {
   %alloca = alloca i32, addrspace(5)
   %int = ptrtoint i32 addrspace(5)* %alloca to i32
-  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %int)
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %int)
   call void asm sideeffect "; use $0", "s"(i32 %readfirstlane)
   ret void
 }
 
+; CHECK-LABEL: {{^}}test_readfirstlane_f32:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+define void @test_readfirstlane_f32(float addrspace(1)* %out, float %src) #1 {
+  %readfirstlane = call float @llvm.amdgcn.readfirstlane.f32(float %src)
+  store float %readfirstlane, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_v2f16:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+define void @test_readfirstlane_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %src) #1 {
+  %readfirstlane = call <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half> %src)
+  store <2 x half> %readfirstlane, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_v2i16:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+define void @test_readfirstlane_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %src) #1 {
+  %readfirstlane = call <2 x i16> @llvm.amdgcn.readfirstlane.v2i16(<2 x i16> %src)
+  store <2 x i16> %readfirstlane, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_readfirstlane_p3:
+; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
+define void @test_readfirstlane_p3(i8 addrspace(3)* addrspace(1)* %out, i8 addrspace(3)* %src) #1 {
+  %readfirstlane = call i8 addrspace(3)* @llvm.amdgcn.readfirstlane.p3i8(i8 addrspace(3)* %src)
+  store i8 addrspace(3)* %readfirstlane, i8 addrspace(3)* addrspace(1)* %out, align 4
+  ret void
+}
+
 attributes #0 = { nounwind readnone convergent }
 attributes #1 = { nounwind }
Index: llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
===================================================================
--- llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -2507,7 +2507,7 @@
 
 define amdgpu_kernel void @readfirstlane_constant(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_constant(
-; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[VAR:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
 ; CHECK-NEXT: store volatile i32 [[VAR]], i32* undef, align 4
 ; CHECK-NEXT: store volatile i32 0, i32* undef, align 4
 ; CHECK-NEXT: store volatile i32 123, i32* undef, align 4
@@ -2530,7 +2530,7 @@
 
 define i32 @readfirstlane_idempotent(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_idempotent(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
 ; CHECK-NEXT: ret i32 [[READ0]]
 ;
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
@@ -2541,7 +2541,7 @@
 
 define i32 @readfirstlane_readlane(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_readlane(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
 ; CHECK-NEXT: ret i32 [[READ0]]
 ;
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
@@ -2552,10 +2552,10 @@
 define i32 @readfirstlane_readfirstlane_different_block(i32 %arg) {
 ; CHECK-LABEL: @readfirstlane_readfirstlane_different_block(
 ; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
 ; CHECK-NEXT: br label [[BB1:%.*]]
 ; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]])
+; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[READ0]])
 ; CHECK-NEXT: ret i32 [[READ1]]
 ;
 bb0:
@@ -2573,7 +2573,7 @@
 ; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 0)
 ; CHECK-NEXT: br label [[BB1:%.*]]
 ; CHECK: bb1:
-; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]])
+; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[READ0]])
 ; CHECK-NEXT: ret i32 [[READ1]]
 ;
 bb0:
@@ -2585,6 +2585,41 @@
   ret i32 %read1
 }
 
+define i32 @readfirstlane_bitcast(float %arg) {
+; CHECK-LABEL: @readfirstlane_bitcast(
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[TMP1]] to i32
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+  %bitcast.arg = bitcast float %arg to i32
+  %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+  ret i32 %read
+}
+
+define float @bitcast_readfirstlane_bitcast(float %arg) {
+; CHECK-LABEL: @bitcast_readfirstlane_bitcast(
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG:%.*]])
+; CHECK-NEXT: ret float [[TMP1]]
+;
+  %bitcast.arg = bitcast float %arg to i32
+  %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+  %cast.read = bitcast i32 %read to float
+  ret float %cast.read
+}
+
+define i32 @readfirstlane_bitcast_multi_use(float %arg) {
+; CHECK-LABEL: @readfirstlane_bitcast_multi_use(
+; CHECK-NEXT: store float [[ARG:%.*]], float* undef, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.readfirstlane.f32(float [[ARG]])
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[TMP1]] to i32
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+  %bitcast.arg = bitcast float %arg to i32
+  store i32 %bitcast.arg, i32* undef
+  %read = call i32 @llvm.amdgcn.readfirstlane(i32 %bitcast.arg)
+  ret i32 %read
+}
+
 ; --------------------------------------------------------------------
 ; llvm.amdgcn.readlane
 ; --------------------------------------------------------------------
@@ -2637,7 +2672,7 @@
 
 define i32 @readlane_readfirstlane(i32 %arg) {
 ; CHECK-LABEL: @readlane_readfirstlane(
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
 ; CHECK-NEXT: ret i32 [[READ0]]
 ;
   %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg)
@@ -2667,7 +2702,7 @@
 
 define i32 @readlane_readfirstlane_different_block(i32 %arg) {
 ; CHECK-LABEL: @readlane_readfirstlane_different_block(
 ; CHECK-NEXT: bb0:
-; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]])
+; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[ARG:%.*]])
 ; CHECK-NEXT: br label [[BB1:%.*]]
 ; CHECK: bb1:
 ; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 0)
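
Note: the snippet below is an illustrative sketch only (the function and value names are invented for this note and are not part of the patch). It shows how IR calls the intrinsic once it is overloaded: the mangled suffix (.i32, .f32, .p3i8, and so on) is derived from the value type, matching the declarations exercised in the tests above.

declare float @llvm.amdgcn.readfirstlane.f32(float)

define amdgpu_kernel void @example_uniform_broadcast(float addrspace(1)* %out, float %v) {
  ; Broadcast the value of %v held by the first active lane to all lanes,
  ; without an explicit bitcast to i32 around the call.
  %uniform = call float @llvm.amdgcn.readfirstlane.f32(float %v)
  store float %uniform, float addrspace(1)* %out, align 4
  ret void
}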