Index: lib/Target/AMDGPU/SIFoldOperands.cpp =================================================================== --- lib/Target/AMDGPU/SIFoldOperands.cpp +++ lib/Target/AMDGPU/SIFoldOperands.cpp @@ -436,9 +436,11 @@ unsigned RegSeqDstReg = UseMI->getOperand(0).getReg(); unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm(); + MachineRegisterInfo::use_iterator Next; for (MachineRegisterInfo::use_iterator RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end(); - RSUse != RSE; ++RSUse) { + RSUse != RSE; RSUse = Next) { + Next = std::next(RSUse); MachineInstr *RSUseMI = RSUse->getParent(); if (RSUse->getSubReg() != RegSeqDstSubReg) @@ -523,6 +525,9 @@ return; UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32)); + + // FIXME: ChangeToImmediate should clear subreg + UseMI->getOperand(1).setSubReg(0); UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm()); UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane) return; Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6079,7 +6079,6 @@ const MachineInstr &DefMI, const MachineInstr *UseMI) { assert(MRI.isSSA() && "Must be run on SSA"); - assert(DefMI.definesRegister(VReg) && "wrong def instruction"); auto *TRI = MRI.getTargetRegisterInfo(); auto *DefBB = DefMI.getParent(); Index: test/CodeGen/AMDGPU/constant-address-space-32bit.ll =================================================================== --- test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -279,12 +279,24 @@ ret float %r2 } +; CHECK-LABEL: {{^}}vgpr_arg_src: +; CHECK: v_readfirstlane_b32 s[[READLANE:[0-9]+]], v0 +; CHECK: s_mov_b32 s[[ZERO:[0-9]+]] +; CHECK: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[READLANE]]:[[ZERO]]{{\]}} +define amdgpu_vs float @vgpr_arg_src(<4 x i32> addrspace(6)* %arg) { +main_body: + %tmp9 = load <4 x i32>, <4 x 
i32> addrspace(6)* %arg + %tmp10 = call nsz float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %tmp9, i32 undef, i32 0, i32 0, i32 0) #1 + ret float %tmp10 +} + ; Function Attrs: nounwind readnone speculatable declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #6 ; Function Attrs: nounwind readonly declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #7 +declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32) #7 !0 = !{} Index: test/CodeGen/AMDGPU/fold-readlane.mir =================================================================== --- test/CodeGen/AMDGPU/fold-readlane.mir +++ test/CodeGen/AMDGPU/fold-readlane.mir @@ -248,3 +248,126 @@ %1:sreg_32_xm0 = S_MOV_B32 12 %2:sreg_32_xm0 = V_READLANE_B32 %0, %1, implicit $exec ... + +# Constant for subreg1; subreg0 is a non-constant COPY of $vgpr0 +# GCN-LABEL: name: fold-imm-readfirstlane-regsequence0{{$}} + +# GCN: %0:vgpr_32 = COPY $vgpr0 +# GCN-NEXT: %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1 +# GCN-NEXT: %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0, implicit $exec +# GCN-NEXT: %4:sgpr_32 = S_MOV_B32 0 +--- +name: fold-imm-readfirstlane-regsequence0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %2:vreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, killed %1:vgpr_32, %subreg.sub1 + %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec + %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec +... 
+ +# Constant for subreg0; subreg1 is a non-constant COPY of $vgpr0 +# GCN-LABEL: name: fold-imm-readfirstlane-regsequence1{{$}} +# GCN: %0:vgpr_32 = COPY $vgpr0 +# GCN-NEXT: %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, killed %0, %subreg.sub1 +# GCN-NEXT: %3:sgpr_32 = S_MOV_B32 0 +# GCN-NEXT: %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec + +--- +name: fold-imm-readfirstlane-regsequence1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %2:vreg_64 = REG_SEQUENCE %1:vgpr_32, %subreg.sub0, killed %0:vgpr_32, %subreg.sub1 + %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec + %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec +... + +# Different constant regs for each subreg +# GCN-LABEL: name: fold-imm-readfirstlane-regsequence2{{$}} +# GCN: %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# GCN-NEXT: %1:vgpr_32 = V_MOV_B32_e32 1, implicit $exec +# GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1 +# GCN-NEXT: %3:sgpr_32 = S_MOV_B32 0 +# GCN-NEXT: %4:sgpr_32 = S_MOV_B32 1 +--- +name: fold-imm-readfirstlane-regsequence2 +tracksRegLiveness: true +body: | + bb.0: + %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + %2:vreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, killed %1:vgpr_32, %subreg.sub1 + %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec + %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec +... 
+ +# Same constant value for each subreg (held in separate registers), so there are multiple constant uses +# GCN-LABEL: name: fold-imm-readfirstlane-regsequence3{{$}} +# GCN: %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# GCN-NEXT: %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec +# GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1 +# GCN-NEXT: %3:sgpr_32 = S_MOV_B32 0 +# GCN-NEXT: %4:sgpr_32 = S_MOV_B32 0 +--- +name: fold-imm-readfirstlane-regsequence3 +tracksRegLiveness: true +body: | + bb.0: + %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %2:vreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, killed %1:vgpr_32, %subreg.sub1 + %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec + %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec +... + +# FIXME: This should fold; the REG_SEQUENCE inputs are plain copies of $sgpr10/$sgpr11, but both readfirstlanes are currently left in place +# GCN-LABEL: name: fold-copy-readfirstlane-regsequence0{{$}} +# GCN: %0:vgpr_32 = COPY $sgpr10 +# GCN-NEXT: %1:vgpr_32 = COPY $sgpr11 +# GCN-NEXT: %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, killed %1, %subreg.sub1 +# GCN-NEXT: %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0, implicit $exec +# GCN-NEXT: %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec +--- +name: fold-copy-readfirstlane-regsequence0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr10, $sgpr11 + %0:vgpr_32 = COPY $sgpr10 + %1:vgpr_32 = COPY $sgpr11 + %2:vreg_64 = REG_SEQUENCE %0:vgpr_32, %subreg.sub0, killed %1:vgpr_32, %subreg.sub1 + %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64, implicit $exec + %4:sgpr_32 = V_READFIRSTLANE_B32 %2.sub1:vreg_64, implicit $exec +... 
+ +# GCN-LABEL: name: fold-copy-readfirstlane-regsequence1{{$}} +# GCN: %0:sreg_32_xm0 = COPY $sgpr10 +# GCN-NEXT: %1:sreg_32_xm0 = COPY $sgpr11 +# GCN-NEXT: %2:vgpr_32 = COPY %0 +# GCN-NEXT: %3:vgpr_32 = COPY %1 +# GCN-NEXT: %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, killed %3, %subreg.sub1 +# GCN-NEXT: %5:sgpr_32 = V_READFIRSTLANE_B32 %4.sub0, implicit $exec +# GCN-NEXT: %6:sgpr_32 = V_READFIRSTLANE_B32 %4.sub1, implicit $exec +--- +name: fold-copy-readfirstlane-regsequence1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr10, $sgpr11 + %0:sreg_32_xm0 = COPY $sgpr10 + %1:sreg_32_xm0 = COPY $sgpr11 + %2:vgpr_32 = COPY %0 + %3:vgpr_32 = COPY %1 + %4:vreg_64 = REG_SEQUENCE %2:vgpr_32, %subreg.sub0, killed %3:vgpr_32, %subreg.sub1 + %5:sgpr_32 = V_READFIRSTLANE_B32 %4.sub0:vreg_64, implicit $exec + %6:sgpr_32 = V_READFIRSTLANE_B32 %4.sub1:vreg_64, implicit $exec +...