Index: lib/Target/AMDGPU/SIFoldOperands.cpp =================================================================== --- lib/Target/AMDGPU/SIFoldOperands.cpp +++ lib/Target/AMDGPU/SIFoldOperands.cpp @@ -515,6 +515,41 @@ return; } + unsigned UseOpc = UseMI->getOpcode(); + if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 || + (UseOpc == AMDGPU::V_READLANE_B32 && + (int)UseOpIdx == + AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) { + // %vgpr = V_MOV_B32 imm + // %sgpr = V_READFIRSTLANE_B32 %vgpr + // => + // %sgpr = S_MOV_B32 imm + if (FoldingImm) { + if (!isEXECMaskConstantBetweenDefAndUses( + UseMI->getOperand(UseOpIdx).getReg(), *MRI)) + return; + + UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32)); + UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm()); + UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane) + return; + } + + if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) { + if (!isEXECMaskConstantBetweenDefAndUses( + UseMI->getOperand(UseOpIdx).getReg(), *MRI)) + return; + + // %vgpr = COPY %sgpr0 + // %sgpr1 = V_READFIRSTLANE_B32 %vgpr + // => + // %sgpr1 = COPY %sgpr0 + UseMI->setDesc(TII->get(AMDGPU::COPY)); + UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane) + return; + } + } + const MCInstrDesc &UseDesc = UseMI->getDesc(); // Don't fold into target independent nodes. Target independent opcodes Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -1696,6 +1696,13 @@ (S_SUB_I32 $src0, NegSubInlineConst32:$src1) >; +// Avoid pointlessly materializing a constant in VGPR. +// FIXME: Should also do this for readlane, but tablegen crashes on +// the ignored src1. 
+def : GCNPat<
+  (int_amdgcn_readfirstlane (i32 imm:$src)),
+  (S_MOV_B32 $src)
+>;
 
 multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
   def : GCNPat <
Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3782,20 +3782,27 @@
     if (Constant *C = dyn_cast<Constant>(II->getArgOperand(0)))
       return replaceInstUsesWith(*II, C);
 
+    // The rest of these may not be safe if the exec may not be the same between
+    // the def and use.
+    Value *Src = II->getArgOperand(0);
+    Instruction *SrcInst = dyn_cast<Instruction>(Src);
+    if (SrcInst && SrcInst->getParent() != II->getParent())
+      break;
+
     // readfirstlane (readfirstlane x) -> readfirstlane x
     // readlane (readfirstlane x), y -> readfirstlane x
-    if (match(II->getArgOperand(0), m_Intrinsic<Intrinsic::amdgcn_readfirstlane>()))
-      return replaceInstUsesWith(*II, II->getArgOperand(0));
+    if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readfirstlane>()))
+      return replaceInstUsesWith(*II, Src);
 
     if (IID == Intrinsic::amdgcn_readfirstlane) {
       // readfirstlane (readlane x, y) -> readlane x, y
-      if (match(II->getArgOperand(0), m_Intrinsic<Intrinsic::amdgcn_readlane>()))
-        return replaceInstUsesWith(*II, II->getArgOperand(0));
+      if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readlane>()))
+        return replaceInstUsesWith(*II, Src);
     } else {
       // readlane (readlane x, y), y -> readlane x, y
-      if (match(II->getArgOperand(0), m_Intrinsic<Intrinsic::amdgcn_readlane>(
+      if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readlane>(
                  m_Value(), m_Specific(II->getArgOperand(1)))))
-        return replaceInstUsesWith(*II, II->getArgOperand(0));
+        return replaceInstUsesWith(*II, Src);
     }
 
     break;
Index: test/CodeGen/AMDGPU/fold-readlane.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/fold-readlane.mir
@@ -0,0 +1,205 @@
+# RUN: llc -march=amdgcn -run-pass si-fold-operands -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: name: fold-imm-readfirstlane{{$}}
+# GCN: %1:sreg_32_xm0 = S_MOV_B32 123
+---
+name: fold-imm-readfirstlane
+tracksRegLiveness: true +body: | + bb.0: + %0:vgpr_32 = V_MOV_B32_e32 123, implicit $exec + %1:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec +... + +# GCN-LABEL: name: fold-imm-readfirstlane-readfirstlane{{$}} +# GCN: %0:vgpr_32 = V_MOV_B32_e32 123, implicit $exec +# GCN: %1:sreg_32_xm0 = S_MOV_B32 123 +# GCN: %2:vgpr_32 = V_MOV_B32_e32 123, implicit $exec +# GCN: %3:sreg_32_xm0 = S_MOV_B32 123 + +--- +name: fold-imm-readfirstlane-readfirstlane +tracksRegLiveness: true +body: | + bb.0: + %0:vgpr_32 = V_MOV_B32_e32 123, implicit $exec + %1:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec + %2:vgpr_32 = COPY %1 + %3:sreg_32_xm0 = V_READFIRSTLANE_B32 %2, implicit $exec + +... + +# GCN-LABEL: name: no-fold-imm-readfirstlane-physreg{{$}} +# GCN: $vgpr0 = V_MOV_B32_e32 123, implicit $exec +# GCN-NEXT: V_READFIRSTLANE_B32 $vgpr0, implicit $exec + +--- +name: no-fold-imm-readfirstlane-physreg +tracksRegLiveness: true +body: | + bb.0: + $vgpr0 = V_MOV_B32_e32 123, implicit $exec + %0:sreg_32_xm0 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec +... + +# TODO: This could be folded, if the search for exec modifications was +# smarter. + +# GCN-LABEL: name: fold-imm-readfirstlane-cross-block{{$}} +# GCN: V_MOV_B32 +# GCN: V_READFIRSTLANE_B32 +--- +name: fold-imm-readfirstlane-cross-block +tracksRegLiveness: true +body: | + bb.0: + %0:vgpr_32 = V_MOV_B32_e32 123, implicit $exec + + bb.1: + %1:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec +... + +# TODO: This could be folded, if the search for exec modifications was +# smarter. + +# GCN-LABEL: name: fold-copy-readfirstlane-cross-block{{$}} +# GCN: V_MOV_B32 +# GCN: V_READFIRSTLANE_B32 +--- +name: fold-copy-readfirstlane-cross-block +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr12 + %0:sreg_32_xm0 = COPY $sgpr12 + %1:vgpr_32 = V_MOV_B32_e32 %0, implicit $exec + + bb.1: + %2:sreg_32_xm0 = V_READFIRSTLANE_B32 %1, implicit $exec +... 
+ +# GCN-LABEL: name: fold-copy-readfirstlane-cross-block-exec-def{{$}} +# GCN: V_MOV_B32 +# GCN: $exec = S_MOV_B64_term +# GCN: V_READFIRSTLANE_B32 +--- +name: fold-copy-readfirstlane-cross-block-exec-def +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr10_sgpr11, $sgpr12 + %0:sreg_32_xm0 = COPY $sgpr12 + %1:vgpr_32 = V_MOV_B32_e32 %0, implicit $exec + $exec = S_MOV_B64_term $sgpr10_sgpr11 + + bb.1: + %2:sreg_32_xm0 = V_READFIRSTLANE_B32 %1, implicit $exec +... + +# GCN-LABEL: name: fold-copy-readfirstlane-same-block-exec-def{{$}} +# GCN: COPY +# GCN-NEXT: %1:vgpr_32 = COPY %0 +# GCN-NEXT: $exec = S_MOV_B64 +# GCN-NEXT: V_READFIRSTLANE_B32 +--- +name: fold-copy-readfirstlane-same-block-exec-def +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr10_sgpr11, $sgpr12 + %0:sreg_32_xm0 = COPY $sgpr12 + %1:vgpr_32 = COPY %0, implicit $exec + $exec = S_MOV_B64 $sgpr10_sgpr11 + %2:sreg_32_xm0 = V_READFIRSTLANE_B32 %1, implicit $exec + +... + +# GCN-LABEL: name: fold-imm-readfirstlane-cross-block-exec-def{{$}} +# GCN: V_MOV_B32 +# GCN: $exec = S_MOV_B64 +# GCN: V_READFIRSTLANE_B32 + +--- +name: fold-imm-readfirstlane-cross-block-exec-def +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr10_sgpr11, $sgpr12_sgpr13 + %0:vgpr_32 = V_MOV_B32_e32 123, implicit $exec + $exec = S_MOV_B64_term $sgpr10_sgpr11 + + bb.1: + %1:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec +... + +# GCN-LABEL: name: fold-imm-readfirstlane-same-block-exec-def{{$}} +# GCN: V_MOV_B32 +# GCN-NEXT: $exec = S_MOV_B64 +# GCN-NEXT: V_READFIRSTLANE_B32 +--- +name: fold-imm-readfirstlane-same-block-exec-def +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr10_sgpr11 + %0:vgpr_32 = V_MOV_B32_e32 123, implicit $exec + $exec = S_MOV_B64 $sgpr10_sgpr11 + %1:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec + +... 
+ +# GCN-LABEL: name: fold-sgpr-copy-readfirstlane-same-block-exec-def{{$}} +# GCN: COPY +# GCN-NEXT: $exec = S_MOV_B64 +# GCN-NEXT: V_READFIRSTLANE_B32 +--- +name: fold-sgpr-copy-readfirstlane-same-block-exec-def +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr10_sgpr11, $sgpr12 + %0:vgpr_32 = COPY $sgpr12 + $exec = S_MOV_B64 $sgpr10_sgpr11 + %1:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec +... + +# GCN-LABEL: name: fold-imm-readfirstlane-user{{$}} +# GCN: %3:sreg_32_xm0 = S_MOV_B32 123 +--- +name: fold-imm-readfirstlane-user +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + %0:vgpr_32 = V_MOV_B32_e32 123, implicit $exec + %1:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec + %2:sreg_32_xm0 = COPY %1 + %3:sreg_32_xm0 = COPY %2 + S_ENDPGM 0, implicit %3 +... + +# GCN-LABEL: name: fold-imm-readlane{{$}} +# GCN: %1:sreg_32_xm0 = S_MOV_B32 123 +--- +name: fold-imm-readlane +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + %0:vgpr_32 = V_MOV_B32_e32 123, implicit $exec + %1:sreg_32_xm0 = V_READLANE_B32 %0, 0, implicit $exec +... + +# GCN-LABEL: name: fold-imm-readlane-src1{{$}} +# GCN: %0:vgpr_32 = COPY $vgpr0 +# GCN: V_READLANE_B32 %0, 12, implicit $exec +--- +name: fold-imm-readlane-src1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:sreg_32_xm0 = S_MOV_B32 12 + %2:sreg_32_xm0 = V_READLANE_B32 %0, %1, implicit $exec +... 
Index: test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -1,19 +1,30 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s declare i32 @llvm.amdgcn.readfirstlane(i32) #0 ; CHECK-LABEL: {{^}}test_readfirstlane: -; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v{{[0-9]+}} -define amdgpu_kernel void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 { +; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2 +define void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 { %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src) store i32 %readfirstlane, i32 addrspace(1)* %out, align 4 ret void } ; CHECK-LABEL: {{^}}test_readfirstlane_imm: -; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32 -; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, [[VVAL]] +; CHECK: s_mov_b32 [[SGPR_VAL:s[0-9]]], 32 +; CHECK-NOT: [[SGPR_VAL]] +; CHECK: ; use [[SGPR_VAL]] define amdgpu_kernel void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 { + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32) + call void asm sideeffect "; use $0", "s"(i32 %readfirstlane) + ret void +} + +; CHECK-LABEL: {{^}}test_readfirstlane_imm_fold: +; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32 +; CHECK-NOT: [[VVAL]] +; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]] +define amdgpu_kernel void @test_readfirstlane_imm_fold(i32 addrspace(1)* %out) #1 { %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32) store i32 %readfirstlane, i32 addrspace(1)* %out, align 4 ret void @@ -24,7 +35,7 @@ ; CHECK: s_mov_b32 m0, -1 ; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], [[COPY_M0]] -; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, [[VVAL]] +; CHECK: flat_store_dword 
v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]] define amdgpu_kernel void @test_readfirstlane_m0(i32 addrspace(1)* %out) #1 { %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"() %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %m0) @@ -32,5 +43,32 @@ ret void } +; CHECK-LABEL: {{^}}test_readfirstlane_copy_from_sgpr: +; CHECK: ;;#ASMSTART +; CHECK-NEXT: s_mov_b32 [[SGPR:s[0-9]+]] +; CHECK: ;;#ASMEND +; CHECK-NOT: [[SGPR]] +; CHECK-NOT: readfirstlane +; CHECK: v_mov_b32_e32 [[VCOPY:v[0-9]+]], [[SGPR]] +; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]] +define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(i32 addrspace(1)* %out) #1 { + %sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"() + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %sgpr) + store i32 %readfirstlane, i32 addrspace(1)* %out, align 4 + ret void +} + +; Make sure this doesn't crash. +; CHECK-LABEL: {{^}}test_readfirstlane_fi: +; CHECK: v_mov_b32_e32 [[FIVAL:v[0-9]]], 4 +; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, [[FIVAL]] +define amdgpu_kernel void @test_readfirstlane_fi(i32 addrspace(1)* %out) #1 { + %alloca = alloca i32, addrspace(5) + %int = ptrtoint i32 addrspace(5)* %alloca to i32 + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %int) + call void asm sideeffect "; use $0", "s"(i32 %readfirstlane) + ret void +} + attributes #0 = { nounwind readnone convergent } attributes #1 = { nounwind } Index: test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -1,18 +1,26 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s declare i32 @llvm.amdgcn.readlane(i32, i32) #0 -; CHECK-LABEL: {{^}}test_readlane_sreg: -; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 
s{{[0-9]+}} -define amdgpu_kernel void @test_readlane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 { +; CHECK-LABEL: {{^}}test_readlane_sreg_sreg: +; CHECK-NOT: v_readlane_b32 +define amdgpu_kernel void @test_readlane_sreg_sreg(i32 %src0, i32 %src1) #1 { %readlane = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1) - store i32 %readlane, i32 addrspace(1)* %out, align 4 + call void asm sideeffect "; use $0", "s"(i32 %readlane) + ret void +} + +; CHECK-LABEL: {{^}}test_readlane_vreg_sreg: +; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} +define amdgpu_kernel void @test_readlane_vreg_sreg(i32 %src0, i32 %src1) #1 { + %vgpr = call i32 asm sideeffect "; def $0", "=v"() + %readlane = call i32 @llvm.amdgcn.readlane(i32 %vgpr, i32 %src1) + call void asm sideeffect "; use $0", "s"(i32 %readlane) ret void } ; CHECK-LABEL: {{^}}test_readlane_imm_sreg: -; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32 -; CHECK: v_readlane_b32 s{{[0-9]+}}, [[VVAL]], s{{[0-9]+}} +; CHECK-NOT: v_readlane_b32 define amdgpu_kernel void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 { %readlane = call i32 @llvm.amdgcn.readlane(i32 32, i32 %src1) store i32 %readlane, i32 addrspace(1)* %out, align 4 @@ -38,7 +46,7 @@ ; CHECK: s_mov_b32 m0, -1 ; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], [[COPY_M0]] -; CHECK: v_readlane_b32 s{{[0-9]+}}, [[VVAL]], s{{[0-9]+}} +; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]] define amdgpu_kernel void @test_readlane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 { %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"() %readlane = call i32 @llvm.amdgcn.readlane(i32 %m0, i32 %src1) @@ -46,14 +54,30 @@ ret void } -; CHECK-LABEL: {{^}}test_readlane_imm: +; CHECK-LABEL: {{^}}test_readlane_vgpr_imm: ; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 32 -define amdgpu_kernel void @test_readlane_imm(i32 addrspace(1)* %out, i32 %src0) #1 { - %readlane = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 32) #0 
+define amdgpu_kernel void @test_readlane_vgpr_imm(i32 addrspace(1)* %out) #1 { + %vgpr = call i32 asm sideeffect "; def $0", "=v"() + %readlane = call i32 @llvm.amdgcn.readlane(i32 %vgpr, i32 32) #0 store i32 %readlane, i32 addrspace(1)* %out, align 4 ret void } +; CHECK-LABEL: {{^}}test_readlane_copy_from_sgpr: +; CHECK: ;;#ASMSTART +; CHECK-NEXT: s_mov_b32 [[SGPR:s[0-9]+]] +; CHECK: ;;#ASMEND +; CHECK-NOT: [[SGPR]] +; CHECK-NOT: readlane +; CHECK: v_mov_b32_e32 [[VCOPY:v[0-9]+]], [[SGPR]] +; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]] +define amdgpu_kernel void @test_readlane_copy_from_sgpr(i32 addrspace(1)* %out) #1 { + %sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"() + %readfirstlane = call i32 @llvm.amdgcn.readlane(i32 %sgpr, i32 7) + store i32 %readfirstlane, i32 addrspace(1)* %out, align 4 + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #2 attributes #0 = { nounwind readnone convergent } Index: test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll =================================================================== --- test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll +++ test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll @@ -2483,6 +2483,42 @@ ret i32 %read1 } +define i32 @readfirstlane_readfirstlane_different_block(i32 %arg) { +; CHECK-LABEL: @readfirstlane_readfirstlane_different_block( +; CHECK-NEXT: bb0: +; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]]) +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]]) +; CHECK-NEXT: ret i32 [[READ1]] +; +bb0: + %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg) + br label %bb1 + +bb1: + %read1 = call i32 @llvm.amdgcn.readfirstlane(i32 %read0) + ret i32 %read1 +} + +define i32 @readfirstlane_readlane_different_block(i32 %arg) { +; CHECK-LABEL: @readfirstlane_readlane_different_block( +; CHECK-NEXT: bb0: +; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 
[[ARG:%.*]], i32 0) +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[READ0]]) +; CHECK-NEXT: ret i32 [[READ1]] +; +bb0: + %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 0) + br label %bb1 + +bb1: + %read1 = call i32 @llvm.amdgcn.readfirstlane(i32 %read0) + ret i32 %read1 +} + ; -------------------------------------------------------------------- ; llvm.amdgcn.readlane ; -------------------------------------------------------------------- @@ -2543,6 +2579,43 @@ ret i32 %read1 } +define i32 @readlane_idempotent_different_block(i32 %arg, i32 %lane) { +; CHECK-LABEL: @readlane_idempotent_different_block( +; CHECK-NEXT: bb0: +; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[ARG:%.*]], i32 [[LANE:%.*]]) +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 [[LANE]]) +; CHECK-NEXT: ret i32 [[READ1]] +; +bb0: + %read0 = call i32 @llvm.amdgcn.readlane(i32 %arg, i32 %lane) + br label %bb1 + +bb1: + %read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 %lane) + ret i32 %read1 +} + + +define i32 @readlane_readfirstlane_different_block(i32 %arg) { +; CHECK-LABEL: @readlane_readfirstlane_different_block( +; CHECK-NEXT: bb0: +; CHECK-NEXT: [[READ0:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[ARG:%.*]]) +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[READ1:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[READ0]], i32 0) +; CHECK-NEXT: ret i32 [[READ1]] +; +bb0: + %read0 = call i32 @llvm.amdgcn.readfirstlane(i32 %arg) + br label %bb1 + +bb1: + %read1 = call i32 @llvm.amdgcn.readlane(i32 %read0, i32 0) + ret i32 %read1 +} + ; -------------------------------------------------------------------- ; llvm.amdgcn.update.dpp.i32 ; --------------------------------------------------------------------