Index: lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -94,10 +94,12 @@
                          MachineBasicBlock &LoopBB,
                          MachineBasicBlock &RemainderBB,
                          unsigned SaveReg,
-                         unsigned IdxReg);
+                         const MachineOperand &IdxReg);
   void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL,
-                              MachineInstr *MovRel, unsigned IdxReg, int Offset);
+                              MachineInstr *MovRel,
+                              const MachineOperand &IdxReg,
+                              int Offset);
   bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
   void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset);
@@ -386,7 +388,7 @@
                                        MachineBasicBlock &LoopBB,
                                        MachineBasicBlock &RemainderBB,
                                        unsigned SaveReg,
-                                       unsigned IdxReg) {
+                                       const MachineOperand &IdxReg) {
   LivePhysRegs RemainderLiveRegs(TRI);
 
   RemainderLiveRegs.addLiveOuts(MBB);
@@ -399,29 +401,38 @@
     RemainderLiveRegs.addReg(SaveReg);
 
   if (const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)) {
-    RemainderLiveRegs.addReg(Val->getReg());
-    LoopBB.addLiveIn(Val->getReg());
+    if (!Val->isUndef()) {
+      RemainderLiveRegs.addReg(Val->getReg());
+      LoopBB.addLiveIn(Val->getReg());
+    }
+  }
+
+  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  for (unsigned Reg : RemainderLiveRegs) {
+    if (MRI.isAllocatable(Reg))
+      RemainderBB.addLiveIn(Reg);
   }
 
-  for (unsigned Reg : RemainderLiveRegs)
-    RemainderBB.addLiveIn(Reg);
-
-  unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
-  LoopBB.addLiveIn(SrcReg);
-  LoopBB.addLiveIn(IdxReg);
+  const MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src);
+  if (!Src->isUndef())
+    LoopBB.addLiveIn(Src->getReg());
+
+  if (!IdxReg.isUndef())
+    LoopBB.addLiveIn(IdxReg.getReg());
+
   LoopBB.sortUniqueLiveIns();
 }
 
 void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
                                                 DebugLoc DL,
                                                 MachineInstr *MovRel,
-                                                unsigned IdxReg,
+                                                const MachineOperand &IdxReg,
                                                 int Offset) {
   MachineBasicBlock::iterator I = LoopBB.begin();
 
   // Read the next variant into VCC (lower 32 bits) <- also loop target
   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO)
-    .addReg(IdxReg);
+    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
 
   // Move index from VCC into M0
   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
@@ -430,7 +441,7 @@
   // Compare the just read M0 value to all possible Idx values
   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
     .addReg(AMDGPU::M0)
-    .addReg(IdxReg);
+    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
 
   // Update EXEC, save the original EXEC value to VCC
   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
@@ -461,16 +472,16 @@
   DebugLoc DL = MI.getDebugLoc();
   MachineBasicBlock::iterator I(&MI);
 
-  unsigned Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx)->getReg();
+  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
 
-  if (AMDGPU::SReg_32RegClass.contains(Idx)) {
+  if (AMDGPU::SReg_32RegClass.contains(Idx->getReg())) {
     if (Offset) {
       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
-        .addReg(Idx)
+        .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()))
         .addImm(Offset);
     } else {
       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
-        .addReg(Idx);
+        .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()));
     }
 
     MBB.insert(I, MovRel);
@@ -485,7 +496,7 @@
 
   // Reading from a VGPR requires looping over all workitems in the wavefront.
   assert(AMDGPU::SReg_64RegClass.contains(Save) &&
-         AMDGPU::VGPR_32RegClass.contains(Idx));
+         AMDGPU::VGPR_32RegClass.contains(Idx->getReg()));
 
   // Save the EXEC mask
   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save)
@@ -505,13 +516,13 @@
   LoopBB->addSuccessor(RemainderBB);
 
   if (TRI->trackLivenessAfterRegAlloc(MF))
-    splitBlockLiveIns(MBB, MI, *LoopBB, *RemainderBB, Save, Idx);
+    splitBlockLiveIns(MBB, MI, *LoopBB, *RemainderBB, Save, *Idx);
 
   // Move the rest of the block into a new block.
   RemainderBB->transferSuccessors(&MBB);
   RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
 
-  emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, Idx, Offset);
+  emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, *Idx, Offset);
 
   MachineBasicBlock::iterator First = RemainderBB->begin();
   BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
@@ -554,16 +565,16 @@
   DebugLoc DL = MI.getDebugLoc();
 
   unsigned Dst = MI.getOperand(0).getReg();
-  unsigned Vec = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
+  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
   int Off = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
   unsigned Reg;
 
-  computeIndirectRegAndOffset(Vec, Reg, Off);
+  computeIndirectRegAndOffset(SrcVec->getReg(), Reg, Off);
 
   MachineInstr *MovRel =
     BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
-    .addReg(Reg)
-    .addReg(Vec, RegState::Implicit);
+    .addReg(Reg, getUndefRegState(SrcVec->isUndef()))
+    .addReg(SrcVec->getReg(), RegState::Implicit);
 
   return loadM0(MI, MovRel, Off);
 }
@@ -575,7 +586,7 @@
 
   unsigned Dst = MI.getOperand(0).getReg();
   int Off = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
-  unsigned Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)->getReg();
+  MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
   unsigned Reg;
 
   computeIndirectRegAndOffset(Dst, Reg, Off);
@@ -583,7 +594,7 @@
   MachineInstr *MovRel =
     BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
     .addReg(Reg, RegState::Define)
-    .addReg(Val)
+    .addReg(Val->getReg(), getUndefRegState(Val->isUndef()))
     .addReg(Dst, RegState::Implicit);
 
   return loadM0(MI, MovRel, Off);
Index: test/CodeGen/AMDGPU/indirect-addressing-si.ll
===================================================================
--- test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -94,6 +94,27 @@
   ret void
 }
 
+; CHECK-LABEL: {{^}}extract_undef_offset_sgpr:
+define void @extract_undef_offset_sgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+entry:
+  %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+  %value = extractelement <4 x i32> %ld, i32 undef
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}insert_undef_offset_sgpr_vector_src:
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK-NEXT: v_movreld_b32
+define void @insert_undef_offset_sgpr_vector_src(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+entry:
+  %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+  %value = insertelement <4 x i32> %ld, i32 5, i32 undef
+  store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
 ; CHECK-LABEL: {{^}}insert_w_offset:
 ; CHECK: s_mov_b32 m0
 ; CHECK-NEXT: v_movreld_b32_e32
Index: test/CodeGen/AMDGPU/indirect-addressing-undef.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/indirect-addressing-undef.mir
@@ -0,0 +1,327 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-lower-control-flow -o /dev/null %s 2>&1 | FileCheck %s
+# An undef operand that is specifically a VGPR is tricky to produce from IR,
+# so these tests are written directly in MIR.
+
+# CHECK-LABEL: name: extract_undef_offset_vgpr{{$}}
+# CHECK: bb.1:
+# CHECK: successors: %bb.1(0x40000000 / 0x80000000 = 50.00%), %bb.2(0x40000000 / 0x80000000 = 50.00%)
+# CHECK: liveins: %vgpr0_vgpr1_vgpr2_vgpr3{{$}}
+
+# CHECK: V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec
+# CHECK: %vgpr0 = V_MOVRELS_B32_e32 %vgpr0, implicit %m0, implicit %exec, implicit %vgpr0_vgpr1_vgpr2_vgpr3
+# CHECK: S_CBRANCH_EXECNZ %bb.1, implicit %exec
+
+# CHECK: bb.2:
+# CHECK: liveins: %sgpr6_sgpr7, %sgpr4_sgpr5_sgpr6_sgpr7, %sgpr4, %sgpr5, %sgpr6, %sgpr7, %sgpr4_sgpr5, %vgpr0_vgpr1_vgpr2_vgpr3, %vgpr0, %vgpr1, %vgpr2, %vgpr3, %vgpr0_vgpr1, %vgpr2_vgpr3, %vgpr0_vgpr1_vgpr2, %vgpr1_vgpr2, %vgpr1_vgpr2_vgpr3, %sgpr0_sgpr1, %sgpr0, %sgpr1{{$}}
+
+--- |
+  target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+
+  define void @extract_undef_offset_vgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  entry:
+    %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+    %value = extractelement <4 x i32> %ld, i32 undef
+    store i32 %value, i32 addrspace(1)* %out
+    ret void
+  }
+
+  define void @extract_undef_neg_offset_vgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  entry:
+    %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+    %value = extractelement <4 x i32> %ld, i32 undef
+    store i32 %value, i32 addrspace(1)* %out
+    ret void
+  }
+
+  define void @insert_undef_offset_vgpr(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  entry:
+    %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+    %value = insertelement <4 x i32> %ld, i32 5, i32 undef
+    store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+    ret void
+  }
+
+  define void @insert_undef_neg_offset_vgpr(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  entry:
+    %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+    %value = insertelement <4 x i32> %ld, i32 5, i32 undef
+    store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+    ret void
+  }
+
+  define void @insert_undef_value_offset_vgpr(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+  entry:
+    %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+    %value = insertelement <4 x i32> %ld, i32 undef, i32 %idx
+    store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+    ret void
+  }
+
+...
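+
+# Each function below feeds an undef VGPR (undef %vgpr10) to the
+# SI_INDIRECT_SRC_V4/SI_INDIRECT_DST_V4 pseudos, either as the index or as
+# the value operand. The pass must still expand them into the
+# V_READFIRSTLANE_B32 waterfall loop, but must not add the undef register to
+# any block's live-in list and must forward the undef flag onto the
+# instructions it emits.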
+---
+name: extract_undef_offset_vgpr
+alignment: 0
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+tracksSubRegLiveness: true
+liveins:
+  - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0.entry:
+    liveins: %sgpr0_sgpr1
+
+    %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 11
+    %sgpr7 = S_MOV_B32 61440
+    %sgpr6 = S_MOV_B32 -1
+    S_WAITCNT 127
+    %vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec
+    %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 9
+    S_WAITCNT 3952
+    %vgpr0, dead %sgpr0_sgpr1 = SI_INDIRECT_SRC_V4 killed %vgpr0_vgpr1_vgpr2_vgpr3, undef %vgpr10, 0, implicit-def dead %exec, implicit-def dead %vcc, implicit-def dead %m0, implicit %exec
+    S_WAITCNT 127
+    BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec
+    S_ENDPGM
+
+...
+
+# CHECK-LABEL: name: extract_undef_neg_offset_vgpr{{$}}
+# CHECK: bb.1:
+# CHECK: successors: %bb.1(0x40000000 / 0x80000000 = 50.00%), %bb.2(0x40000000 / 0x80000000 = 50.00%)
+# CHECK: liveins: %vgpr0_vgpr1_vgpr2_vgpr3{{$}}
+
+# CHECK: %vcc_lo = V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec
+# CHECK: %m0 = S_MOV_B32 %vcc_lo
+# CHECK: %m0 = S_ADD_I32 %m0, -7, implicit-def %scc
+# CHECK: %vgpr0 = V_MOVRELS_B32_e32 %vgpr0, implicit %m0, implicit %exec, implicit %vgpr0_vgpr1_vgpr2_vgpr3
+# CHECK: S_CBRANCH_EXECNZ %bb.1, implicit %exec
+
+# CHECK: bb.2:
+# CHECK: liveins: %sgpr6_sgpr7, %sgpr4_sgpr5_sgpr6_sgpr7, %sgpr4, %sgpr5, %sgpr6, %sgpr7, %sgpr4_sgpr5, %vgpr0_vgpr1_vgpr2_vgpr3, %vgpr0, %vgpr1, %vgpr2, %vgpr3, %vgpr0_vgpr1, %vgpr2_vgpr3, %vgpr0_vgpr1_vgpr2, %vgpr1_vgpr2, %vgpr1_vgpr2_vgpr3, %sgpr0_sgpr1, %sgpr0, %sgpr1
+
+---
+name: extract_undef_neg_offset_vgpr
+alignment: 0
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+tracksSubRegLiveness: true
+liveins:
+  - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0.entry:
+    liveins: %sgpr0_sgpr1
+
+    %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 11
+    %sgpr7 = S_MOV_B32 61440
+    %sgpr6 = S_MOV_B32 -1
+    S_WAITCNT 127
+    %vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec
+    %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 9
+    S_WAITCNT 3952
+    %vgpr0, dead %sgpr0_sgpr1 = SI_INDIRECT_SRC_V4 killed %vgpr0_vgpr1_vgpr2_vgpr3, undef %vgpr10, -7, implicit-def dead %exec, implicit-def dead %vcc, implicit-def dead %m0, implicit %exec
+    S_WAITCNT 127
+    BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec
+    S_ENDPGM
+
+...
+
+# CHECK-LABEL: name: insert_undef_offset_vgpr{{$}}
+# CHECK: bb.1:
+# CHECK: successors: %bb.1(0x40000000 / 0x80000000 = 50.00%), %bb.2(0x40000000 / 0x80000000 = 50.00%)
+# CHECK: liveins: %vgpr4, %vgpr0_vgpr1_vgpr2_vgpr3{{$}}
+
+# CHECK: %vcc_lo = V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec
+# CHECK: %m0 = S_MOV_B32 %vcc_lo
+# CHECK: %vgpr0 = V_MOVRELD_B32_e32 %vgpr4, implicit %m0, implicit %exec, implicit %vgpr0_vgpr1_vgpr2_vgpr3
+# CHECK: S_CBRANCH_EXECNZ %bb.1, implicit %exec
+
+# CHECK: bb.2:
+# CHECK: liveins: %sgpr6_sgpr7, %sgpr7, %sgpr4_sgpr5, %sgpr5, %sgpr4_sgpr5_sgpr6_sgpr7, %sgpr6, %sgpr4, %vgpr0_vgpr1_vgpr2_vgpr3, %vgpr0, %vgpr1, %vgpr2, %vgpr3, %vgpr0_vgpr1, %vgpr2_vgpr3, %vgpr0_vgpr1_vgpr2, %vgpr1_vgpr2, %vgpr1_vgpr2_vgpr3, %vgpr4, %sgpr0_sgpr1, %sgpr0, %sgpr1
+
+---
+name: insert_undef_offset_vgpr
+alignment: 0
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+tracksSubRegLiveness: true
+liveins:
+  - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0.entry:
+    liveins: %sgpr0_sgpr1
+
+    %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 11 :: (non-temporal invariant load 8 from `i64 addrspace(2)* undef`)
+    %sgpr7 = S_MOV_B32 61440
+    %sgpr6 = S_MOV_B32 -1
+    %vgpr4 = V_MOV_B32_e32 5, implicit %exec
+    S_WAITCNT 127
+    %vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (load 16 from %ir.in)
+    %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 9 :: (non-temporal invariant load 8 from `i64 addrspace(2)* undef`)
+    S_WAITCNT 3952
+    %vgpr0_vgpr1_vgpr2_vgpr3, dead %sgpr0_sgpr1 = SI_INDIRECT_DST_V4 %vgpr0_vgpr1_vgpr2_vgpr3, undef %vgpr10, 0, killed %vgpr4, implicit-def dead %exec, implicit-def dead %vcc, implicit-def dead %m0, implicit %exec
+    S_WAITCNT 127
+    BUFFER_STORE_DWORDX4_OFFSET killed %vgpr0_vgpr1_vgpr2_vgpr3, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (store 16 into %ir.out)
+    S_ENDPGM
+
+...
+
+# CHECK-LABEL: name: insert_undef_neg_offset_vgpr{{$}}
+# CHECK: bb.1:
+# CHECK: successors: %bb.1(0x40000000 / 0x80000000 = 50.00%), %bb.2(0x40000000 / 0x80000000 = 50.00%)
+# CHECK: liveins: %vgpr4, %vgpr0_vgpr1_vgpr2_vgpr3{{$}}
+
+# CHECK: %vcc_lo = V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec
+# CHECK: %m0 = S_MOV_B32 %vcc_lo
+# CHECK: %m0 = S_ADD_I32 %m0, -7, implicit-def %scc
+# CHECK: %vgpr0 = V_MOVRELD_B32_e32 %vgpr4, implicit %m0, implicit %exec, implicit %vgpr0_vgpr1_vgpr2_vgpr3
+# CHECK: S_CBRANCH_EXECNZ %bb.1, implicit %exec
+
+# CHECK: bb.2:
+# CHECK: liveins: %sgpr6_sgpr7, %sgpr7, %sgpr4_sgpr5, %sgpr5, %sgpr4_sgpr5_sgpr6_sgpr7, %sgpr6, %sgpr4, %vgpr0_vgpr1_vgpr2_vgpr3, %vgpr0, %vgpr1, %vgpr2, %vgpr3, %vgpr0_vgpr1, %vgpr2_vgpr3, %vgpr0_vgpr1_vgpr2, %vgpr1_vgpr2, %vgpr1_vgpr2_vgpr3, %vgpr4, %sgpr0_sgpr1, %sgpr0, %sgpr1{{$}}
+
+---
+name: insert_undef_neg_offset_vgpr
+alignment: 0
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+tracksSubRegLiveness: true
+liveins:
+  - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0.entry:
+    liveins: %sgpr0_sgpr1
+
+    %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 11 :: (non-temporal invariant load 8 from `i64 addrspace(2)* undef`)
+    %sgpr7 = S_MOV_B32 61440
+    %sgpr6 = S_MOV_B32 -1
+    %vgpr4 = V_MOV_B32_e32 5, implicit %exec
+    S_WAITCNT 127
+    %vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (load 16 from %ir.in)
+    %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 9 :: (non-temporal invariant load 8 from `i64 addrspace(2)* undef`)
+    S_WAITCNT 3952
+    %vgpr0_vgpr1_vgpr2_vgpr3, dead %sgpr0_sgpr1 = SI_INDIRECT_DST_V4 %vgpr0_vgpr1_vgpr2_vgpr3, undef %vgpr10, -7, killed %vgpr4, implicit-def dead %exec, implicit-def dead %vcc, implicit-def dead %m0, implicit %exec
+    S_WAITCNT 127
+    BUFFER_STORE_DWORDX4_OFFSET killed %vgpr0_vgpr1_vgpr2_vgpr3, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (store 16 into %ir.out)
+    S_ENDPGM
+
+...
+
+# CHECK-LABEL: name: insert_undef_value_offset_vgpr{{$}}
+# CHECK: bb.1:
+# CHECK: successors: %bb.1(0x40000000 / 0x80000000 = 50.00%), %bb.2(0x40000000 / 0x80000000 = 50.00%)
+# CHECK: liveins: %vgpr4, %vgpr0_vgpr1_vgpr2_vgpr3{{$}}
+
+# CHECK: %vcc_lo = V_READFIRSTLANE_B32 %vgpr4, implicit %exec
+# CHECK: %m0 = S_MOV_B32 %vcc_lo
+# CHECK: %vgpr0 = V_MOVRELD_B32_e32 undef %vgpr10, implicit %m0, implicit %exec, implicit %vgpr0_vgpr1_vgpr2_vgpr3
+# CHECK: S_CBRANCH_EXECNZ %bb.1, implicit %exec
+
+# CHECK: bb.2:
+# CHECK: liveins: %sgpr6_sgpr7, %sgpr7, %sgpr4_sgpr5, %sgpr5, %sgpr4_sgpr5_sgpr6_sgpr7, %sgpr6, %sgpr4, %vgpr0_vgpr1_vgpr2_vgpr3, %vgpr0, %vgpr1, %vgpr2, %vgpr3, %vgpr0_vgpr1, %vgpr2_vgpr3, %vgpr0_vgpr1_vgpr2, %vgpr1_vgpr2, %vgpr1_vgpr2_vgpr3, %vgpr4, %sgpr0_sgpr1, %sgpr0, %sgpr1{{$}}
+
+---
+name: insert_undef_value_offset_vgpr
+alignment: 0
+exposesReturnsTwice: false
+hasInlineAsm: false
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+tracksSubRegLiveness: true
+liveins:
+  - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0.entry:
+    liveins: %sgpr0_sgpr1
+
+    %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 11 :: (non-temporal invariant load 8 from `i64 addrspace(2)* undef`)
+    %sgpr7 = S_MOV_B32 61440
+    %sgpr6 = S_MOV_B32 -1
+    %vgpr4 = V_MOV_B32_e32 2, implicit %exec
+    S_WAITCNT 127
+    %vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (load 16 from %ir.in)
+    %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 9 :: (non-temporal invariant load 8 from `i64 addrspace(2)* undef`)
+    S_WAITCNT 3952
+    %vgpr0_vgpr1_vgpr2_vgpr3, dead %sgpr0_sgpr1 = SI_INDIRECT_DST_V4 %vgpr0_vgpr1_vgpr2_vgpr3, killed %vgpr4, 0, undef %vgpr10, implicit-def dead %exec, implicit-def dead %vcc, implicit-def dead %m0, implicit %exec
+    S_WAITCNT 127
+    BUFFER_STORE_DWORDX4_OFFSET killed %vgpr0_vgpr1_vgpr2_vgpr3, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (store 16 into %ir.out)
+    S_ENDPGM
+
+...
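
Note: the fix is one pattern applied at every site that copies a named operand onto a newly built instruction: keep the whole MachineOperand rather than the bare register so the undef flag survives, re-apply the flag with getUndefRegState(), and keep undef registers out of block live-in lists. A minimal sketch of that pattern follows. It is illustrative only: emitSetM0FromIdx is a hypothetical helper, not part of the patch, while getNamedOperand, getUndefRegState, and BuildMI are the LLVM APIs the diff itself uses.

// Hypothetical helper showing the undef-propagation pattern used above.
static void emitSetM0FromIdx(const SIInstrInfo *TII, MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I, DebugLoc DL,
                             MachineInstr &MI) {
  // Keep the operand, not just its register number; the undef bit is lost
  // the moment only getReg() is copied out.
  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);

  // getUndefRegState() converts the operand's undef bit back into a
  // RegState flag on the new use; without it the machine verifier reports
  // a read of a register with no reaching definition.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()));

  // An undef register carries no live value, so it must likewise be kept
  // out of any live-in list when blocks are split:
  //   if (!Idx->isUndef())
  //     LoopBB.addLiveIn(Idx->getReg());
}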