Index: lib/Target/R600/SIInstrInfo.h
===================================================================
--- lib/Target/R600/SIInstrInfo.h
+++ lib/Target/R600/SIInstrInfo.h
@@ -62,6 +62,10 @@
     return RI;
   }
 
+  bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+                               int64_t &Offset1,
+                               int64_t &Offset2) const override;
+
   bool getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
                             unsigned &Offset,
                             const TargetRegisterInfo *TRI) const final;
Index: lib/Target/R600/SIInstrInfo.cpp
===================================================================
--- lib/Target/R600/SIInstrInfo.cpp
+++ lib/Target/R600/SIInstrInfo.cpp
@@ -32,6 +32,99 @@
 // TargetInstrInfo callbacks
 //===----------------------------------------------------------------------===//
 
+static unsigned getNumOperandsNoGlue(SDNode *Node) {
+  unsigned N = Node->getNumOperands();
+  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
+    --N;
+  return N;
+}
+
+static SDValue findChainOperand(SDNode *Load) {
+  SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
+  assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
+  return LastOp;
+}
+
+bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
+                                          int64_t &Offset0,
+                                          int64_t &Offset1) const {
+  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
+    return false;
+
+  unsigned Opc0 = Load0->getMachineOpcode();
+  unsigned Opc1 = Load1->getMachineOpcode();
+  if (isDS(Opc0) && isDS(Opc1)) {
+    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
+
+    // TODO: Also shouldn't see read2st
+    assert(Opc0 != AMDGPU::DS_READ2_B32 &&
+           Opc0 != AMDGPU::DS_READ2_B64 &&
+           Opc1 != AMDGPU::DS_READ2_B32 &&
+           Opc1 != AMDGPU::DS_READ2_B64);
+
+    // Check base reg.
+    if (Load0->getOperand(1) != Load1->getOperand(1))
+      return false;
+
+    // Check chain.
+    if (findChainOperand(Load0) != findChainOperand(Load1))
+      return false;
+
+    Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
+    Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
+    return true;
+  }
+
+  if (isSMRD(Opc0) && isSMRD(Opc1)) {
+    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
+
+    // Check base reg.
+    if (Load0->getOperand(0) != Load1->getOperand(0))
+      return false;
+
+    // Check chain.
+    if (findChainOperand(Load0) != findChainOperand(Load1))
+      return false;
+
+    Offset0 = cast<ConstantSDNode>(Load0->getOperand(1))->getZExtValue();
+    Offset1 = cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue();
+    return true;
+  }
+
+  // MUBUF and MTBUF can access the same addresses.
+  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
+    // Skip if an SGPR offset is applied. I don't think we currently emit
+    // any of the variants that use this.
+    int SoffsetIdx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::soffset);
+    if (SoffsetIdx != -1)
+      return false;
+
+    // getNamedOperandIdx returns the index for the MachineInstr's operands,
+    // which includes the result as the first operand. We are indexing into the
+    // MachineSDNode's operands, so we need to skip the result operand to get
+    // the real index.
+    --SoffsetIdx;
+
+    // Check chain.
+    if (findChainOperand(Load0) != findChainOperand(Load1))
+      return false;
+
+    // MUBUF and MTBUF have vaddr at different indices.
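+    // Look each index up by name for the specific opcode, then apply the
+    // same result-operand adjustment described above for soffset.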
+    int VaddrIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::vaddr) - 1;
+    int VaddrIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::vaddr) - 1;
+    if (Load0->getOperand(VaddrIdx0) != Load1->getOperand(VaddrIdx1))
+      return false;
+
+    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset) - 1;
+    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset) - 1;
+    Offset0 = cast<ConstantSDNode>(Load0->getOperand(OffIdx0))->getZExtValue();
+    Offset1 = cast<ConstantSDNode>(Load1->getOperand(OffIdx1))->getZExtValue();
+    return true;
+  }
+
+  return false;
+}
+
 bool SIInstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt,
                                        unsigned &BaseReg, unsigned &Offset,
                                        const TargetRegisterInfo *TRI) const {
Index: test/CodeGen/R600/address-space.ll
===================================================================
--- test/CodeGen/R600/address-space.ll
+++ test/CodeGen/R600/address-space.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
 
 ; Test that codegenprepare understands address space sizes
 
@@ -10,8 +10,8 @@
 ; CHECK-LABEL: @do_as_ptr_calcs:
 ; CHECK: S_LOAD_DWORD [[SREG1:s[0-9]+]],
 ; CHECK: V_MOV_B32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
-; CHECK: DS_READ_B32 v{{[0-9]+}}, [[VREG1]], 0x14
-; CHECK: DS_READ_B32 v{{[0-9]+}}, v{{[0-9]+}}, 0xc
+; CHECK-DAG: DS_READ_B32 v{{[0-9]+}}, [[VREG1]], 0xc
+; CHECK-DAG: DS_READ_B32 v{{[0-9]+}}, v{{[0-9]+}}, 0x14
 define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
 entry:
   %x = getelementptr inbounds %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
Index: test/CodeGen/R600/atomic_cmp_swap_local.ll
===================================================================
--- test/CodeGen/R600/atomic_cmp_swap_local.ll
+++ test/CodeGen/R600/atomic_cmp_swap_local.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: @lds_atomic_cmpxchg_ret_i32_offset:
-; SI: S_LOAD_DWORD [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
 ; SI: S_LOAD_DWORD [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: S_LOAD_DWORD [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
 ; SI-DAG: V_MOV_B32_e32 [[VCMP:v[0-9]+]], 7
 ; SI-DAG: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[PTR]]
 ; SI-DAG: V_MOV_B32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
@@ -17,8 +17,8 @@
 }
 
 ; FUNC-LABEL: @lds_atomic_cmpxchg_ret_i64_offset:
-; SI: S_LOAD_DWORDX2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
 ; SI: S_LOAD_DWORD [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: S_LOAD_DWORDX2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
 ; SI: S_MOV_B64 s{{\[}}[[LOSCMP:[0-9]+]]:[[HISCMP:[0-9]+]]{{\]}}, 7
 ; SI-DAG: V_MOV_B32_e32 v[[LOVCMP:[0-9]+]], s[[LOSCMP]]
 ; SI-DAG: V_MOV_B32_e32 v[[HIVCMP:[0-9]+]], s[[HISCMP]]
Index: test/CodeGen/R600/ctpop64.ll
===================================================================
--- test/CodeGen/R600/ctpop64.ll
+++ test/CodeGen/R600/ctpop64.ll
@@ -7,7 +7,7 @@
 declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone
 
 ; FUNC-LABEL: @s_ctpop_i64:
-; SI: S_LOAD_DWORDX2 [[SVAL:s\[[0-9]+:[0-9]+\]]],
+; SI: S_LOAD_DWORDX2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI: S_BCNT1_I32_B64 [[SRESULT:s[0-9]+]], [[SVAL]]
 ; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
 ; SI: BUFFER_STORE_DWORD [[VRESULT]],
Index: test/CodeGen/R600/cvt_f32_ubyte.ll
===================================================================
--- test/CodeGen/R600/cvt_f32_ubyte.ll
+++ test/CodeGen/R600/cvt_f32_ubyte.ll
@@ -68,13 +68,13 @@
 
 ; SI-LABEL: @load_v4i8_to_v4f32_2_uses:
 ; SI: BUFFER_LOAD_UBYTE
-; SI: V_CVT_F32_UBYTE0_e32
 ; SI: BUFFER_LOAD_UBYTE
-; SI: V_CVT_F32_UBYTE0_e32
 ; SI: BUFFER_LOAD_UBYTE
-; SI: V_CVT_F32_UBYTE0_e32
 ; SI: BUFFER_LOAD_UBYTE
 ; SI: V_CVT_F32_UBYTE0_e32
+; SI: V_CVT_F32_UBYTE0_e32
+; SI: V_CVT_F32_UBYTE0_e32
+; SI: V_CVT_F32_UBYTE0_e32
 
 ; XXX - replace with this when v4i8 loads aren't scalarized anymore.
 ; XSI: BUFFER_LOAD_DWORD
Index: test/CodeGen/R600/extract_vector_elt_i16.ll
===================================================================
--- test/CodeGen/R600/extract_vector_elt_i16.ll
+++ test/CodeGen/R600/extract_vector_elt_i16.ll
@@ -2,9 +2,9 @@
 
 ; FUNC-LABEL: @extract_vector_elt_v2i16
 ; SI: BUFFER_LOAD_USHORT
-; SI: BUFFER_STORE_SHORT
 ; SI: BUFFER_LOAD_USHORT
 ; SI: BUFFER_STORE_SHORT
+; SI: BUFFER_STORE_SHORT
 define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> %foo) nounwind {
   %p0 = extractelement <2 x i16> %foo, i32 0
   %p1 = extractelement <2 x i16> %foo, i32 1
@@ -16,9 +16,9 @@
 
 ; FUNC-LABEL: @extract_vector_elt_v4i16
 ; SI: BUFFER_LOAD_USHORT
-; SI: BUFFER_STORE_SHORT
 ; SI: BUFFER_LOAD_USHORT
 ; SI: BUFFER_STORE_SHORT
+; SI: BUFFER_STORE_SHORT
 define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) nounwind {
   %p0 = extractelement <4 x i16> %foo, i32 0
   %p1 = extractelement <4 x i16> %foo, i32 2
Index: test/CodeGen/R600/fcopysign.f32.ll
===================================================================
--- test/CodeGen/R600/fcopysign.f32.ll
+++ test/CodeGen/R600/fcopysign.f32.ll
@@ -8,9 +8,9 @@
 
 ; Try to identify arg based on higher address.
 ; FUNC-LABEL: @test_copysign_f32:
+; SI: S_LOAD_DWORD [[SMAG:s[0-9]+]], {{.*}} 0xb
 ; SI: S_LOAD_DWORD [[SSIGN:s[0-9]+]], {{.*}} 0xc
 ; SI: V_MOV_B32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]]
-; SI-DAG: S_LOAD_DWORD [[SMAG:s[0-9]+]], {{.*}} 0xb
 ; SI-DAG: V_MOV_B32_e32 [[VMAG:v[0-9]+]], [[SMAG]]
 ; SI-DAG: S_MOV_B32 [[SCONST:s[0-9]+]], 0x7fffffff
 ; SI: V_BFI_B32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]]
Index: test/CodeGen/R600/fcopysign.f64.ll
===================================================================
--- test/CodeGen/R600/fcopysign.f64.ll
+++ test/CodeGen/R600/fcopysign.f64.ll
@@ -5,9 +5,9 @@
 declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind readnone
 
 ; FUNC-LABEL: @test_copysign_f64:
-; SI-DAG: S_LOAD_DWORDX2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; SI: V_MOV_B32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]]
 ; SI-DAG: S_LOAD_DWORDX2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: S_LOAD_DWORDX2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: V_MOV_B32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]]
 ; SI-DAG: V_MOV_B32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
 ; SI-DAG: S_MOV_B32 [[SCONST:s[0-9]+]], 0x7fffffff
 ; SI: V_BFI_B32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]]
Index: test/CodeGen/R600/llvm.memcpy.ll
===================================================================
--- test/CodeGen/R600/llvm.memcpy.ll
+++ test/CodeGen/R600/llvm.memcpy.ll
@@ -40,37 +40,37 @@
 
 ; SI: DS_WRITE_B8
 ; SI: DS_READ_U8
-; SI: DS_WRITE_B8
 ; SI: DS_READ_U8
-; SI: DS_WRITE_B8
 ; SI: DS_READ_U8
-; SI: DS_WRITE_B8
 ; SI: DS_READ_U8
-; SI: DS_WRITE_B8
 ; SI: DS_READ_U8
-; SI: DS_WRITE_B8
 ; SI: DS_READ_U8
-; SI: DS_WRITE_B8
 ; SI: DS_READ_U8
-; SI: DS_WRITE_B8
 ; SI: DS_READ_U8
-; SI: DS_WRITE_B8
-
 ; SI: DS_READ_U8
-; SI: DS_WRITE_B8
 ; SI: DS_READ_U8
-; SI: DS_WRITE_B8
 ; SI: DS_READ_U8
-; SI: DS_WRITE_B8
 ; SI: DS_READ_U8
-; SI: DS_WRITE_B8
 ; SI: DS_READ_U8
-; SI: DS_WRITE_B8
 ; SI: DS_READ_U8
-; SI: DS_WRITE_B8
 ; SI: DS_READ_U8
-; SI: DS_WRITE_B8
 ; SI: DS_READ_U8
+
+; SI: DS_WRITE_B8
+; SI: DS_WRITE_B8
+; SI: DS_WRITE_B8
+; SI: DS_WRITE_B8
+; SI: DS_WRITE_B8
+; SI: DS_WRITE_B8
+; SI: DS_WRITE_B8
+; SI: DS_WRITE_B8
+; SI: DS_WRITE_B8
+; SI: DS_WRITE_B8
+; SI: DS_WRITE_B8
+; SI: DS_WRITE_B8
+; SI: DS_WRITE_B8
+; SI: DS_WRITE_B8
+; SI: DS_WRITE_B8
 ; SI: DS_WRITE_B8
 
 ; SI: S_ENDPGM
@@ -100,20 +100,21 @@
 
 ; SI: DS_WRITE_B16
 ; SI: DS_READ_U16
-; SI: DS_WRITE_B16
 ; SI: DS_READ_U16
-; SI: DS_WRITE_B16
 ; SI: DS_READ_U16
-; SI: DS_WRITE_B16
 ; SI: DS_READ_U16
-; SI: DS_WRITE_B16
 ; SI: DS_READ_U16
-; SI: DS_WRITE_B16
 ; SI: DS_READ_U16
-; SI: DS_WRITE_B16
 ; SI: DS_READ_U16
-; SI: DS_WRITE_B16
 ; SI: DS_READ_U16
+
+; SI: DS_WRITE_B16
+; SI: DS_WRITE_B16
+; SI: DS_WRITE_B16
+; SI: DS_WRITE_B16
+; SI: DS_WRITE_B16
+; SI: DS_WRITE_B16
+; SI: DS_WRITE_B16
 ; SI: DS_WRITE_B16
 
 ; SI: S_ENDPGM
Index: test/CodeGen/R600/mubuf.ll
===================================================================
--- test/CodeGen/R600/mubuf.ll
+++ test/CodeGen/R600/mubuf.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=r600 -mcpu=SI -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
 
 ;;;==========================================================================;;;
 ;;; MUBUF LOAD TESTS
@@ -28,7 +28,7 @@
 
 ; MUBUF load with an immediate byte offset that doesn't fit into 12-bits
 ; CHECK-LABEL: @mubuf_load2
-; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0x0 ; encoding: [0x00,0x80
+; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, s[{{[0-9]+:[0-9]+}}] + v[{{[0-9]:[0-9]}}] + 0x0 ; encoding: [0x00,0x80
 define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %0 = getelementptr i32 addrspace(1)* %in, i64 1024
Index: test/CodeGen/R600/rotl.i64.ll
===================================================================
--- test/CodeGen/R600/rotl.i64.ll
+++ test/CodeGen/R600/rotl.i64.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: @s_rotl_i64:
-; SI: S_LSHL_B64
 ; SI: S_SUB_I32
 ; SI: S_LSHR_B64
+; SI: S_LSHL_B64
 ; SI: S_OR_B64
 define void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
 entry:
Index: test/CodeGen/R600/schedule-global-loads.ll
===================================================================
--- /dev/null
+++ test/CodeGen/R600/schedule-global-loads.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; FIXME: This currently doesn't do a great job of clustering the
+; loads, which end up with extra moves between them. Right now, it
+; seems the only thing areLoadsFromSameBasePtr is accomplishing is
+; ordering the loads so that the lower address loads come first.
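+; (The checks below therefore verify only the relative order of the two
+; loads and that the stores use the loaded values, not tight clustering.)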
+
+; FUNC-LABEL: @cluster_global_arg_loads
+; SI: BUFFER_LOAD_DWORD [[REG0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}} + v{{\[[0-9]+:[0-9]+\]}} + 0x0
+; SI: BUFFER_LOAD_DWORD [[REG1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}} + v{{\[[0-9]+:[0-9]+\]}} + 0x4
+; SI: BUFFER_STORE_DWORD [[REG0]]
+; SI: BUFFER_STORE_DWORD [[REG1]]
+define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 {
+  %load0 = load i32 addrspace(1)* %ptr, align 4
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 1
+  %load1 = load i32 addrspace(1)* %gep, align 4
+  store i32 %load0, i32 addrspace(1)* %out0, align 4
+  store i32 %load1, i32 addrspace(1)* %out1, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
Index: test/CodeGen/R600/schedule-kernel-arg-loads.ll
===================================================================
--- /dev/null
+++ test/CodeGen/R600/schedule-kernel-arg-loads.ll
@@ -0,0 +1,12 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+
+; FUNC-LABEL: @cluster_arg_loads
+; SI: S_LOAD_DWORDX2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9
+; SI-NEXT: S_LOAD_DWORDX2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-NEXT: S_LOAD_DWORD s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-NEXT: S_LOAD_DWORD s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe
+define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
+  store i32 %x, i32 addrspace(1)* %out0, align 4
+  store i32 %y, i32 addrspace(1)* %out1, align 4
+  ret void
+}
Index: test/CodeGen/R600/trunc.ll
===================================================================
--- test/CodeGen/R600/trunc.ll
+++ test/CodeGen/R600/trunc.ll
@@ -30,7 +30,7 @@
 }
 
 ; SI-LABEL: @trunc_shl_i64:
-; SI: S_LOAD_DWORDX2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}},
+; SI: S_LOAD_DWORDX2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
 ; SI: S_ADD_I32 s[[LO_ADD:[0-9]+]], s[[LO_SREG]],
 ; SI: S_LSHL_B64 s{{\[}}[[LO_SREG2:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_ADD]]:{{[0-9]+\]}}, 2
 ; SI: V_MOV_B32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
Index: test/CodeGen/R600/wait.ll
===================================================================
--- test/CodeGen/R600/wait.ll
+++ test/CodeGen/R600/wait.ll
@@ -1,37 +1,45 @@
-; RUN: llc < %s -march=r600 -mcpu=SI --verify-machineinstrs | FileCheck %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
 
-;CHECK-LABEL: @main
-;CHECK: S_WAITCNT lgkmcnt(0)
-;CHECK: S_WAITCNT vmcnt(0)
-;CHECK: S_WAITCNT expcnt(0) lgkmcnt(0)
-
-define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, i32 inreg, i32, i32, i32, i32) #0 {
+; CHECK-LABEL: @main
+; CHECK: S_LOAD_DWORDX4
+; CHECK: S_LOAD_DWORDX4
+; CHECK: S_WAITCNT lgkmcnt(0)
+; CHECK: S_WAITCNT vmcnt(0)
+; CHECK: S_WAITCNT expcnt(0) lgkmcnt(0)
+define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 {
 main_body:
-  %10 = getelementptr <16 x i8> addrspace(2)* %3, i32 0
-  %11 = load <16 x i8> addrspace(2)* %10, !tbaa !0
-  %12 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %11, i32 0, i32 %6)
-  %13 = extractelement <4 x float> %12, i32 0
-  %14 = extractelement <4 x float> %12, i32 1
-  %15 = extractelement <4 x float> %12, i32 2
-  %16 = extractelement <4 x float> %12, i32 3
-  %17 = getelementptr <16 x i8> addrspace(2)* %3, i32 1
-  %18 = load <16 x i8> addrspace(2)* %17, !tbaa !0
-  %19 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %18, i32 0, i32 %6)
-  %20 = extractelement <4 x float> %19, i32 0
-  %21 = extractelement <4 x float> %19, i32 1
-  %22 = extractelement <4 x float> %19, i32 2
-  %23 = extractelement <4 x float> %19, i32 3
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %20, float %21, float %22, float %23)
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %13, float %14, float %15, float %16)
+  %tmp = getelementptr <16 x i8> addrspace(2)* %arg3, i32 0
+  %tmp10 = load <16 x i8> addrspace(2)* %tmp, !tbaa !0
+  %tmp11 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp10, i32 0, i32 %arg6)
+  %tmp12 = extractelement <4 x float> %tmp11, i32 0
+  %tmp13 = extractelement <4 x float> %tmp11, i32 1
+  call void @llvm.AMDGPU.barrier.global() #1
+  %tmp14 = extractelement <4 x float> %tmp11, i32 2
+; %tmp15 = extractelement <4 x float> %tmp11, i32 3
+  %tmp15 = load float addrspace(2)* %constptr, align 4 ; Force waiting for expcnt and lgkmcnt
+  %tmp16 = getelementptr <16 x i8> addrspace(2)* %arg3, i32 1
+  %tmp17 = load <16 x i8> addrspace(2)* %tmp16, !tbaa !0
+  %tmp18 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp17, i32 0, i32 %arg6)
+  %tmp19 = extractelement <4 x float> %tmp18, i32 0
+  %tmp20 = extractelement <4 x float> %tmp18, i32 1
+  %tmp21 = extractelement <4 x float> %tmp18, i32 2
+  %tmp22 = extractelement <4 x float> %tmp18, i32 3
+  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %tmp19, float %tmp20, float %tmp21, float %tmp22)
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp12, float %tmp13, float %tmp14, float %tmp15)
   ret void
 }
 
+; Function Attrs: noduplicate nounwind
+declare void @llvm.AMDGPU.barrier.global() #1
+
 ; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1
+declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #2
 
 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
 
 attributes #0 = { "ShaderType"="1" }
-attributes #1 = { nounwind readnone }
+attributes #1 = { noduplicate nounwind }
+attributes #2 = { nounwind readnone }
 
-!0 = metadata !{metadata !"const", null, i32 1}
+!0 = metadata !{metadata !1, metadata !1, i64 0, i32 1}
+!1 = metadata !{metadata !"const", null}
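
Note for reviewers: the DS path in areLoadsFromSameBasePtr is only exercised
indirectly by the updated lds tests above. A local-memory analogue of
@cluster_global_arg_loads would look like the sketch below (a possible
follow-up test, not part of this patch; the exact DS_READ_B32 operand
printing may differ, so the checks are kept loose):

; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; Two loads from the same LDS base pointer at byte offsets 0 and 4; the DS
; path should let the scheduler order the lower-address read first.
; SI-LABEL: @cluster_local_arg_loads
; SI: DS_READ_B32
; SI: DS_READ_B32
define void @cluster_local_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(3)* %ptr) nounwind {
  %load0 = load i32 addrspace(3)* %ptr, align 4
  %gep = getelementptr i32 addrspace(3)* %ptr, i32 1
  %load1 = load i32 addrspace(3)* %gep, align 4
  store i32 %load0, i32 addrspace(1)* %out0, align 4
  store i32 %load1, i32 addrspace(1)* %out1, align 4
  ret void
}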