Index: lib/Target/AMDGPU/GCNDPPCombine.cpp =================================================================== --- lib/Target/AMDGPU/GCNDPPCombine.cpp +++ lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -331,7 +331,7 @@ auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst); assert(DstOpnd && DstOpnd->isReg()); auto DPPMovReg = DstOpnd->getReg(); - if (execMayBeModifiedBeforeUse(*MRI, DPPMovReg, MovMI)) { + if (execMayBeModifiedBeforeUse(*MRI, DPPMovReg)) { LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same" " for all uses\n"); return false; Index: lib/Target/AMDGPU/SIFoldOperands.cpp =================================================================== --- lib/Target/AMDGPU/SIFoldOperands.cpp +++ lib/Target/AMDGPU/SIFoldOperands.cpp @@ -436,9 +436,12 @@ unsigned RegSeqDstReg = UseMI->getOperand(0).getReg(); unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm(); + MachineRegisterInfo::use_iterator Next; for (MachineRegisterInfo::use_iterator RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end(); - RSUse != RSE; ++RSUse) { + RSUse != RSE; RSUse = Next) { + Next = RSUse; + ++Next; MachineInstr *RSUseMI = RSUse->getParent(); if (RSUse->getSubReg() != RegSeqDstSubReg) @@ -518,11 +521,13 @@ if (FoldingImm) { if (execMayBeModifiedBeforeUse(*MRI, UseMI->getOperand(UseOpIdx).getReg(), - *OpToFold.getParent(), UseMI)) return; UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32)); + + // FIXME: ChangeToImmediate should clear subreg + UseMI->getOperand(1).setSubReg(0); UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm()); UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane) return; @@ -531,7 +536,6 @@ if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) { if (execMayBeModifiedBeforeUse(*MRI, UseMI->getOperand(UseOpIdx).getReg(), - *OpToFold.getParent(), UseMI)) return; Index: lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.h +++ 
lib/Target/AMDGPU/SIInstrInfo.h @@ -983,7 +983,6 @@ /// blocks. bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, unsigned VReg, - const MachineInstr &DefMI, const MachineInstr *UseMI = nullptr); namespace AMDGPU { Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6076,13 +6076,12 @@ bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, unsigned VReg, - const MachineInstr &DefMI, const MachineInstr *UseMI) { assert(MRI.isSSA() && "Must be run on SSA"); - assert(DefMI.definesRegister(VReg) && "wrong def instruction"); + const MachineInstr *DefMI = MRI.getVRegDef(VReg); auto *TRI = MRI.getTargetRegisterInfo(); - auto *DefBB = DefMI.getParent(); + auto *DefBB = DefMI->getParent(); if (UseMI) { // Don't bother searching between blocks, although it is possible this block @@ -6107,7 +6106,7 @@ // Stop scan at the use if known. auto E = UseMI ? 
UseMI->getIterator() : DefBB->end(); - for (auto I = std::next(DefMI.getIterator()); I != E; ++I) { + for (auto I = std::next(DefMI->getIterator()); I != E; ++I) { if (I->isDebugInstr()) continue; Index: test/CodeGen/AMDGPU/constant-address-space-32bit.ll =================================================================== --- test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -279,12 +279,24 @@ ret float %r2 } +; CHECK-LABEL: {{^}}vgpr_arg_src: +; CHECK: v_readfirstlane_b32 s[[READLANE:[0-9]+]], v0 +; CHECK: s_mov_b32 s[[ZERO:[0-9]+]] +; CHECK: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[READLANE]]:[[ZERO]]{{\]}} +define amdgpu_vs float @vgpr_arg_src(<4 x i32> addrspace(6)* %arg) { +main_body: + %tmp9 = load <4 x i32>, <4 x i32> addrspace(6)* %arg + %tmp10 = call nsz float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %tmp9, i32 undef, i32 0, i32 0, i32 0) #1 + ret float %tmp10 +} + ; Function Attrs: nounwind readnone speculatable declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #6 ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #7 +declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32) #7 !0 = !{} Index: test/CodeGen/AMDGPU/fold-readlane.mir =================================================================== --- test/CodeGen/AMDGPU/fold-readlane.mir +++ test/CodeGen/AMDGPU/fold-readlane.mir @@ -248,3 +248,126 @@ %1:sreg_32_xm0 = S_MOV_B32 12 %2:sreg_32_xm0 = V_READLANE_B32 %0, %1, implicit $exec ... 
+ +# Constant feeds sub1 only (sub0 is a copy of $vgpr0), so only the sub1 read folds +# GCN-LABEL: name: fold-imm-readfirstlane-regsequence0{{$}} + +# GCN: %0:vgpr_32 = COPY $vgpr0 +# GCN-NEXT: %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1 +# GCN-NEXT: %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0, implicit $exec +# GCN-NEXT: %4:sgpr_32 = S_MOV_B32 0 +--- +name: fold-imm-readfirstlane-regsequence0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %2:vreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, killed %1:vgpr_32, %subreg.sub1 + %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec + %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec +... + +# Constant feeds sub0 only (sub1 is a copy of $vgpr0), so only the sub0 read folds +# GCN-LABEL: name: fold-imm-readfirstlane-regsequence1{{$}} +# GCN: %0:vgpr_32 = COPY $vgpr0 +# GCN-NEXT: %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, killed %0, %subreg.sub1 +# GCN-NEXT: %3:sgpr_32 = S_MOV_B32 0 +# GCN-NEXT: %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec + +--- +name: fold-imm-readfirstlane-regsequence1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %2:vreg_64 = REG_SEQUENCE %1:vgpr_32, %subreg.sub0, killed %0:vgpr_32, %subreg.sub1 + %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec + %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec +... 
+ +# Different constant regs for each subreg +# GCN-LABEL: name: fold-imm-readfirstlane-regsequence2{{$}} +# GCN: %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# GCN-NEXT: %1:vgpr_32 = V_MOV_B32_e32 1, implicit $exec +# GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1 +# GCN-NEXT: %3:sgpr_32 = S_MOV_B32 0 +# GCN-NEXT: %4:sgpr_32 = S_MOV_B32 1 +--- +name: fold-imm-readfirstlane-regsequence2 +tracksRegLiveness: true +body: | + bb.0: + %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + %2:vreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, killed %1:vgpr_32, %subreg.sub1 + %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec + %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec +... + +# Same constant reg for each subreg, so there are multiple constant uses +# GCN-LABEL: name: fold-imm-readfirstlane-regsequence3{{$}} +# GCN: %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# GCN-NEXT: %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1 +# GCN-NEXT: %3:sgpr_32 = S_MOV_B32 0 +# GCN-NEXT: %4:sgpr_32 = S_MOV_B32 0 +--- +name: fold-imm-readfirstlane-regsequence3 +tracksRegLiveness: true +body: | + bb.0: + %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %2:vreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, killed %1:vgpr_32, %subreg.sub1 + %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec + %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec +... 
+ +# FIXME: This should fold +# GCN-LABEL: name: fold-copy-readfirstlane-regsequence0{{$}} +# GCN: %0:vgpr_32 = COPY $sgpr10 +# GCN-NEXT: %1:vgpr_32 = COPY $sgpr11 +# GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1 +# GCN-NEXT: %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0, implicit $exec +# GCN-NEXT: %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec +--- +name: fold-copy-readfirstlane-regsequence0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr10, $sgpr11 + %0:vgpr_32 = COPY $sgpr10 + %1:vgpr_32 = COPY $sgpr11 + %2:vreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, killed %1:vgpr_32, %subreg.sub1 + %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec + %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec +... + +# GCN-LABEL: name: fold-copy-readfirstlane-regsequence1{{$}} +# GCN: %0:sreg_32_xm0 = COPY $sgpr10 +# GCN-NEXT: %1:sreg_32_xm0 = COPY $sgpr11 +# GCN-NEXT: %2:vgpr_32 = COPY %0 +# GCN-NEXT: %3:vgpr_32 = COPY %1 +# GCN-NEXT: %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, killed %3, %subreg.sub1 +# GCN-NEXT: %5:sgpr_32 = V_READFIRSTLANE_B32 %4.sub0, implicit $exec +# GCN-NEXT: %6:sgpr_32 = V_READFIRSTLANE_B32 %4.sub1, implicit $exec +--- +name: fold-copy-readfirstlane-regsequence1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr10, $sgpr11 + %0:sreg_32_xm0 = COPY $sgpr10 + %1:sreg_32_xm0 = COPY $sgpr11 + %2:vgpr_32 = COPY %0 + %3:vgpr_32 = COPY %1 + %4:vreg_64 = REG_SEQUENCE %2:vgpr_32, %subreg.sub0, killed %3:vgpr_32, %subreg.sub1 + %5:sgpr_32 = V_READFIRSTLANE_B32 %4.sub0:vreg_64, implicit $exec + %6:sgpr_32 = V_READFIRSTLANE_B32 %4.sub1:vreg_64, implicit $exec +...