Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4706,10 +4706,10 @@
 }
 
 // Emit the actual waterfall loop, executing the wrapped instruction for each
-// unique value of \p Rsrc across all lanes. In the best case we execute 1
-// iteration, in the worst case we execute 64 (once per lane).
+// unique value of \p Rsrc (4 dwords) across all lanes. In the best case we
+// execute 1 iteration, in the worst case we execute 64 (once per lane).
 static void
-emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
+emitLoadSRsrcFromVGPR128Loop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
                           MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
                           const DebugLoc &DL, MachineOperand &Rsrc) {
   MachineFunction &MF = *OrigBB.getParent();
@@ -4790,6 +4790,131 @@
   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
 }
 
+// Emit the actual waterfall loop, executing the wrapped instruction for each
+// unique value of \p Rsrc (8 dwords) across all lanes.
+static void
+emitLoadSRsrcFromVGPR256Loop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
+                             MachineBasicBlock &OrigBB,
+                             MachineBasicBlock &LoopBB, const DebugLoc &DL,
+                             MachineOperand &Rsrc) {
+  MachineFunction &MF = *OrigBB.getParent();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+  unsigned SaveExecOpc =
+      ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+  unsigned XorTermOpc =
+      ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
+  unsigned AndOpc =
+      ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+
+  MachineBasicBlock::iterator I = LoopBB.begin();
+
+  Register VRsrc = Rsrc.getReg();
+  unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());
+
+  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
+  Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
+  Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
+  Register CondReg2 = MRI.createVirtualRegister(BoolXExecRC);
+  Register CondReg3 = MRI.createVirtualRegister(BoolXExecRC);
+  Register AndCond0 = MRI.createVirtualRegister(BoolXExecRC);
+  Register AndCond1 = MRI.createVirtualRegister(BoolXExecRC);
+  Register AndCond = MRI.createVirtualRegister(BoolXExecRC);
+  Register SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  Register SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  Register SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  Register SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  Register SRsrcSub4 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  Register SRsrcSub5 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  Register SRsrcSub6 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  Register SRsrcSub7 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  Register SRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_256RegClass);
+
+  // Beginning of the loop, read the next Rsrc variant.
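+  // NOTE: v_readfirstlane_b32 only moves a single dword, so the eight dwords
+  // of the descriptor are read back one at a time and reassembled below.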
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0)
+      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub0);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub1)
+      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub1);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub2)
+      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub2);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub3)
+      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub3);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub4)
+      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub4);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub5)
+      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub5);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub6)
+      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub6);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub7)
+      .addReg(VRsrc, VRsrcUndef, AMDGPU::sub7);
+
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc)
+      .addReg(SRsrcSub0)
+      .addImm(AMDGPU::sub0)
+      .addReg(SRsrcSub1)
+      .addImm(AMDGPU::sub1)
+      .addReg(SRsrcSub2)
+      .addImm(AMDGPU::sub2)
+      .addReg(SRsrcSub3)
+      .addImm(AMDGPU::sub3)
+      .addReg(SRsrcSub4)
+      .addImm(AMDGPU::sub4)
+      .addReg(SRsrcSub5)
+      .addImm(AMDGPU::sub5)
+      .addReg(SRsrcSub6)
+      .addImm(AMDGPU::sub6)
+      .addReg(SRsrcSub7)
+      .addImm(AMDGPU::sub7);
+
+  // Update Rsrc operand to use the SGPR Rsrc.
+  Rsrc.setReg(SRsrc);
+  Rsrc.setIsKill(true);
+
+  // Identify all lanes with identical Rsrc operands in their VGPRs.
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg0)
+      .addReg(SRsrc, 0, AMDGPU::sub0_sub1)
+      .addReg(VRsrc, 0, AMDGPU::sub0_sub1);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1)
+      .addReg(SRsrc, 0, AMDGPU::sub2_sub3)
+      .addReg(VRsrc, 0, AMDGPU::sub2_sub3);
+  BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndCond0)
+      .addReg(CondReg0)
+      .addReg(CondReg1);
+
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg2)
+      .addReg(SRsrc, 0, AMDGPU::sub4_sub5)
+      .addReg(VRsrc, 0, AMDGPU::sub4_sub5);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg3)
+      .addReg(SRsrc, 0, AMDGPU::sub6_sub7)
+      .addReg(VRsrc, 0, AMDGPU::sub6_sub7);
+  BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndCond1)
+      .addReg(CondReg2)
+      .addReg(CondReg3);
+
+  BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndCond)
+      .addReg(AndCond0)
+      .addReg(AndCond1);
+
+  MRI.setSimpleHint(SaveExec, AndCond);
+
+  // Update EXEC to matching lanes, saving original to SaveExec.
+  BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
+      .addReg(AndCond, RegState::Kill);
+
+  // The original instruction is here; we insert the terminators after it.
+  I = LoopBB.end();
+
+  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
+  BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec)
+      .addReg(Exec)
+      .addReg(SaveExec);
+  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
+}
+
 // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
 // with SGPRs by iterating over all unique values across all lanes.
 static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
@@ -4853,7 +4976,16 @@
     }
   }
 
-  emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc);
+  // Emit waterfall loop based on Rsrc size.
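+  // Only 128-bit (sampler / buffer resource) and 256-bit (image resource)
+  // descriptors are handled; any other size falls through to unreachable.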
+  const TargetRegisterClass *OpRC = MRI.getRegClass(Rsrc.getReg());
+  if (OpRC == &AMDGPU::VReg_128RegClass)
+    emitLoadSRsrcFromVGPR128Loop(TII, MRI, MBB, *LoopBB, DL, Rsrc);
+  else if (OpRC == &AMDGPU::VReg_256RegClass)
+    emitLoadSRsrcFromVGPR256Loop(TII, MRI, MBB, *LoopBB, DL, Rsrc);
+  else // Not implemented yet.
+    llvm_unreachable("waterfall loop not implemented for Rsrc size");
 
   // Restore the EXEC mask
   MachineBasicBlock::iterator First = RemainderBB->begin();
@@ -5042,16 +5172,13 @@
       (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
        (isMUBUF(MI) || isMTBUF(MI)))) {
     MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
-    if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
-      unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
-      SRsrc->setReg(SGPR);
-    }
+    if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
+      loadSRsrcFromVGPR(*this, MI, *SRsrc, MDT);
 
     MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
-    if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
-      unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
-      SSamp->setReg(SGPR);
-    }
+    if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
+      loadSRsrcFromVGPR(*this, MI, *SSamp, MDT);
+    return;
   }
Index: llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll
@@ -0,0 +1,62 @@
+; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+
+declare <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)
+
+; GCN-LABEL: {{^}}water_loop_rsrc:
+
+; GCN: [[RSRC_LOOP:[a-zA-Z0-9_]+]]: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT: v_readfirstlane_b32 s[[SREG0:[0-9]+]], v[[VREG0:[0-9]+]]
+; GCN-NEXT: v_readfirstlane_b32 s[[SREG1:[0-9]+]], v[[VREG1:[0-9]+]]
+; GCN-NEXT: v_readfirstlane_b32 s[[SREG2:[0-9]+]], v[[VREG2:[0-9]+]]
+; GCN-NEXT: v_readfirstlane_b32 s[[SREG3:[0-9]+]], v[[VREG3:[0-9]+]]
+; GCN-NEXT: v_readfirstlane_b32 s[[SREG4:[0-9]+]], v[[VREG4:[0-9]+]]
+; GCN-NEXT: v_readfirstlane_b32 s[[SREG5:[0-9]+]], v[[VREG5:[0-9]+]]
+; GCN-NEXT: v_readfirstlane_b32 s[[SREG6:[0-9]+]], v[[VREG6:[0-9]+]]
+; GCN-NEXT: v_readfirstlane_b32 s[[SREG7:[0-9]+]], v[[VREG7:[0-9]+]]
+
+; GCN-NEXT: v_cmp_eq_u64_e32 [[CMP0:vcc]], s{{\[}}[[SREG0]]:[[SREG1]]{{\]}}, v{{\[}}[[VREG0]]:[[VREG1]]{{\]}}
+; GCN-NEXT: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SREG2]]:[[SREG3]]{{\]}}, v{{\[}}[[VREG2]]:[[VREG3]]{{\]}}
+; GCN-NEXT: v_cmp_eq_u64_e64 [[CMP2:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SREG4]]:[[SREG5]]{{\]}}, v{{\[}}[[VREG4]]:[[VREG5]]{{\]}}
+; GCN-NEXT: v_cmp_eq_u64_e64 [[CMP3:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SREG6]]:[[SREG7]]{{\]}}, v{{\[}}[[VREG6]]:[[VREG7]]{{\]}}
+
+; GCN-NEXT: s_and_b64 [[AND0:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]]
+; GCN-NEXT: s_and_b64 [[AND1:s\[[0-9]+:[0-9]+\]]], [[CMP2]], [[CMP3]]
+; GCN-NEXT: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[AND0]], [[AND1]]
+; GCN-NEXT: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]]
+
+; GCN-NEXT: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, s{{\[}}[[SREG0]]:[[SREG7]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1
+; GCN-NEXT: s_xor_b64 exec, exec, [[SAVE]]
+; GCN-NEXT: s_cbranch_execnz [[RSRC_LOOP]]
+define amdgpu_ps <4 x float> @water_loop_rsrc(<8 x i32> %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
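+; The sampler descriptor is only 128 bits, so this loop needs just four
+; v_readfirstlane_b32 instructions and two 64-bit compares per iteration.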
+
+; GCN-LABEL: {{^}}water_loop_samp:
+
+; GCN: [[SAMP_LOOP:[a-zA-Z0-9_]+]]: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT: v_readfirstlane_b32 s[[SREG0:[0-9]+]], v[[VREG0:[0-9]+]]
+; GCN-NEXT: v_readfirstlane_b32 s[[SREG1:[0-9]+]], v[[VREG1:[0-9]+]]
+; GCN-NEXT: v_readfirstlane_b32 s[[SREG2:[0-9]+]], v[[VREG2:[0-9]+]]
+; GCN-NEXT: v_readfirstlane_b32 s[[SREG3:[0-9]+]], v[[VREG3:[0-9]+]]
+
+; GCN-NEXT: v_cmp_eq_u64_e32 [[CMP0:vcc]], s{{\[}}[[SREG0]]:[[SREG1]]{{\]}}, v{{\[}}[[VREG0]]:[[VREG1]]{{\]}}
+; GCN-NEXT: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SREG2]]:[[SREG3]]{{\]}}, v{{\[}}[[VREG2]]:[[VREG3]]{{\]}}
+; GCN-NEXT: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]]
+; GCN-NEXT: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]]
+; GCN-NEXT: s_nop 0
+
+; GCN-NEXT: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, s{{\[}}[[SREG0]]:[[SREG3]]{{\]}} dmask:0x1
+; GCN-NEXT: s_xor_b64 exec, exec, [[SAVE]]
+; GCN-NEXT: s_cbranch_execnz [[SAMP_LOOP]]
+define amdgpu_ps <4 x float> @water_loop_samp(<8 x i32> inreg %rsrc, <4 x i32> %samp, float %s, float %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+  ret <4 x float> %v
+}
Index: llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -14,10 +14,7 @@
 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
 
-; GFX9: v_mov_b32_e32 v37, v11
-; GFX9-NEXT: v_mov_b32_e32 v38, v10
-; GFX9-NEXT: v_mov_b32_e32 v49, v9
-; GFX9-NEXT: v_writelane_b32 v44, s30, 0
+; GFX9: v_writelane_b32 v44, s30, 0
 ; GFX9-NEXT: v_mov_b32_e32 v36, v16
 ; GFX9-NEXT: v_mov_b32_e32 v35, v15
 ; GFX9-NEXT: v_mov_b32_e32 v34, v14
@@ -27,7 +24,7 @@
 ; GFX9: ;;#ASMSTART
 ; GFX9-NEXT: ;;#ASMEND
 
-; GFX9: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[12:15] dmask:0x1
+; GFX9: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[4:7] dmask:0x1
 ; GFX9-NEXT: s_getpc_b64 s[4:5]
 ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4
@@ -58,11 +55,12 @@
 ; GFX10: ;;#ASMSTART
 ; GFX10-NEXT: ;;#ASMEND
 
-; GFX10: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[12:15] dmask:0x1
+; GFX10: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_getpc_b64 s[4:5]
 ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4
+; GFX10-NEXT: ; implicit-def: $vcc_hi
 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
@@ -79,7 +77,7 @@
   call void asm sideeffect "", "~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"() #0
   call void asm sideeffect "", "~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23}"() #0
   call void asm sideeffect "", "~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() #0
-  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
+  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
   call void @extern_func()
   ret <4 x float> %v
 }
@@ -104,16 +102,16 @@
 ; GFX9-NEXT: v_mov_b32_e32 v41, v13
 ; GFX9-NEXT: v_mov_b32_e32 v40, v12
 
-; GFX9: s_getpc_b64 s[4:5]
+; GFX9: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[4:7] dmask:0x1
+; GFX9-NEXT: s_getpc_b64 s[4:5]
 ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4
 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[44:47] dmask:0x1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[44:47] dmask:0x1
+; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[4:7] dmask:0x1
 
 ; GFX9: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@@ -132,13 +130,14 @@
 ; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
 
-; GFX10: s_getpc_b64 s[4:5]
+
+; GFX10: image_gather4_c_b_cl v[0:3], v[12:19], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_getpc_b64 s[4:5]
 ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4
-; GFX10-NEXT: v_mov_b32_e32 v40, v16
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:19], s[36:43], s[44:47] dmask:0x1
 ; GFX10-NEXT: v_mov_b32_e32 v41, v15
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX10-NEXT: v_mov_b32_e32 v42, v14
 ; GFX10-NEXT: v_mov_b32_e32 v43, v13
 ; GFX10-NEXT: v_mov_b32_e32 v44, v12
@@ -147,7 +146,7 @@
 ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[36:43], s[44:47] dmask:0x1
+; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
 
 ; GFX10: buffer_load_dword v44, off, s[0:3], s33
 ; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4
@@ -157,10 +156,10 @@
 ; GFX10: buffer_load_dword v45, off, s[0:3], s32 offset:20
 ; GFX10: s_setpc_b64 s[4:5]
 main_body:
-  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
+  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
   store <4 x float> %v, <4 x float> addrspace(1)* undef
   call void @extern_func()
-  %v1 = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
+  %v1 = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
   ret <4 x float> %v1
 }