Index: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -48,6 +48,12 @@
 namespace {
 
+static cl::opt<bool> WidenLoads(
+  "amdgpu-codegenprepare-widen-constant-loads",
+  cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
+  cl::ReallyHidden,
+  cl::init(true));
+
 class AMDGPUCodeGenPrepare : public FunctionPass,
                              public InstVisitor<AMDGPUCodeGenPrepare, bool> {
   const SISubtarget *ST = nullptr;
@@ -472,6 +478,9 @@
 }
 
 bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
+  if (!WidenLoads)
+    return false;
+
   if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
        I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
       canWidenScalarExtLoad(I)) {
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -46,6 +46,8 @@
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const;
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1081,6 +1081,36 @@
   PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
   MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
+
+  // Try to avoid using an extload by loading earlier than the argument address,
+  // and extracting the relevant bits. The load should hopefully be merged with
+  // the previous argument.
+  if (Align < 4) {
+    assert(MemVT.getStoreSize() < 4);
+    int64_t AlignDownOffset = alignDown(Offset, 4);
+    int64_t OffsetDiff = Offset - AlignDownOffset;
+
+    EVT IntVT = MemVT.changeTypeToInteger();
+
+    // TODO: If we passed in the base kernel offset we could have a better
+    // alignment than 4, but we don't really need it.
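+    //
+    // For example, a 2-byte argument at Offset = 38 gives AlignDownOffset = 36
+    // and OffsetDiff = 2: the dword loaded at offset 36 holds the argument in
+    // bits [31:16], so an SRL by OffsetDiff * 8 = 16 bits followed by a
+    // truncate recovers the value.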
+    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
+    SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
+                               MachineMemOperand::MODereferenceable |
+                               MachineMemOperand::MOInvariant);
+
+    SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
+    SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
+
+    SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
+    ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
+    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
+
+    return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
+  }
+
   SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
   SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
                              MachineMemOperand::MODereferenceable |
@@ -5283,6 +5313,96 @@
 }
 }
 
+static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
+                                 ISD::LoadExtType ExtType, SDValue Op,
+                                 const SDLoc &SL, EVT VT) {
+  if (VT.bitsLT(Op.getValueType()))
+    return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
+
+  switch (ExtType) {
+  case ISD::SEXTLOAD:
+    return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
+  case ISD::ZEXTLOAD:
+    return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
+  case ISD::EXTLOAD:
+    return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
+  case ISD::NON_EXTLOAD:
+    return Op;
+  }
+
+  llvm_unreachable("invalid ext type");
+}
+
+SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  if (Ld->getAlignment() < 4 || Ld->isDivergent())
+    return SDValue();
+
+  // FIXME: Constant loads should all be marked invariant.
+  unsigned AS = Ld->getAddressSpace();
+  if (AS != AMDGPUASI.CONSTANT_ADDRESS &&
+      AS != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
+      (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
+    return SDValue();
+
+  // Don't do this early, since it may interfere with adjacent load merging for
+  // illegal types. We can avoid losing alignment information for exotic types
+  // pre-legalize.
+  EVT MemVT = Ld->getMemoryVT();
+  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
+      MemVT.getSizeInBits() >= 32)
+    return SDValue();
+
+  SDLoc SL(Ld);
+
+  assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
+         "unexpected vector extload");
+
+  // TODO: Drop only high part of range.
+  SDValue Ptr = Ld->getBasePtr();
+  SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
+                                MVT::i32, SL, Ld->getChain(), Ptr,
+                                Ld->getOffset(),
+                                Ld->getPointerInfo(), MVT::i32,
+                                Ld->getAlignment(),
+                                Ld->getMemOperand()->getFlags(),
+                                Ld->getAAInfo(),
+                                nullptr); // Drop ranges
+
+  EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
+  if (MemVT.isFloatingPoint()) {
+    assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
+           "unexpected fp extload");
+    TruncVT = MemVT.changeTypeToInteger();
+  }
+
+  SDValue Cvt = NewLoad;
+  if (Ld->getExtensionType() == ISD::SEXTLOAD) {
+    Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
+                      DAG.getValueType(TruncVT));
+  } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
+             Ld->getExtensionType() == ISD::NON_EXTLOAD) {
+    Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
+  } else {
+    assert(Ld->getExtensionType() == ISD::EXTLOAD);
+  }
+
+  EVT VT = Ld->getValueType(0);
+  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+
+  DCI.AddToWorklist(Cvt.getNode());
+
+  // We may need to handle exotic cases, such as i16->i64 extloads, so insert
+  // the appropriate extension from the 32-bit load.
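+  // (For instance, a sextload of i16 used as an i64 becomes the widened
+  // 32-bit load, a sign_extend_inreg to i16, and then a sign_extend to i64;
+  // a non-extending i16 load instead truncates the 32-bit result back down.)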
+  Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
+  DCI.AddToWorklist(Cvt.getNode());
+
+  // Handle conversion back to floating point if necessary.
+  Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
+
+  return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
+}
+
 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
   LoadSDNode *Load = cast<LoadSDNode>(Op);
@@ -7183,7 +7303,11 @@
       return performMinMaxCombine(N, DCI);
     break;
   }
-  case ISD::LOAD:
+  case ISD::LOAD: {
+    if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
+      return Widened;
+    LLVM_FALLTHROUGH;
+  }
   case ISD::STORE:
   case ISD::ATOMIC_LOAD:
   case ISD::ATOMIC_STORE:
Index: test/CodeGen/AMDGPU/and.ll
===================================================================
--- test/CodeGen/AMDGPU/and.ll
+++ test/CodeGen/AMDGPU/and.ll
@@ -178,7 +178,12 @@
 ; FIXME: Should use SGPRs
 ; FUNC-LABEL: {{^}}s_and_i1:
-; SI: v_and_b32
+; SI: s_load_dword [[LOAD:s[0-9]+]]
+; SI: s_lshr_b32 [[B_SHIFT:s[0-9]+]], [[LOAD]], 8
+; SI: s_and_b32 [[AND:s[0-9]+]], [[LOAD]], [[B_SHIFT]]
+; SI: s_and_b32 [[AND_TRUNC:s[0-9]+]], [[AND]], 1{{$}}
+; SI: v_mov_b32_e32 [[V_AND_TRUNC:v[0-9]+]], [[AND_TRUNC]]
+; SI: buffer_store_byte [[V_AND_TRUNC]]
 define amdgpu_kernel void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
   %and = and i1 %a, %b
   store i1 %and, i1 addrspace(1)* %out
Index: test/CodeGen/AMDGPU/ashr.v2i16.ll
===================================================================
--- test/CodeGen/AMDGPU/ashr.v2i16.ll
+++ test/CodeGen/AMDGPU/ashr.v2i16.ll
@@ -2,14 +2,16 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
 
+; FIXME: Should be same on CI/VI
 ; GCN-LABEL: {{^}}s_ashr_v2i16:
 ; GFX9: s_load_dword [[LHS:s[0-9]+]]
 ; GFX9: s_load_dword [[RHS:s[0-9]+]]
 ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
 ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
 
-; VI: s_load_dword [[LHS:s[0-9]+]]
-; VI: s_load_dword [[RHS:s[0-9]+]]
+; CIVI: s_load_dword [[LHS:s[0-9]+]]
+; CIVI: s_load_dword [[RHS:s[0-9]+]]
+
 ; VI: s_ashr_i32
 ; VI: s_ashr_i32
 ; VI: s_sext_i32_i16
@@ -20,11 +22,14 @@
 ; VI: s_and_b32
 ; VI: s_or_b32
 
-; CI-DAG: v_ashrrev_i32_e32
-; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
-; CI-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
-; CI: v_or_b32_e32
+; CI: s_ashr_i32
+; CI: s_and_b32
+; CI: s_lshr_b32
+; CI: s_sext_i32_i16
+; CI: s_ashr_i32
+; CI: s_ashr_i32
+; CI: s_lshl_b32
+; CI: s_and_b32
 define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
   %result = ashr <2 x i16> %lhs, %rhs
   store <2 x i16> %result, <2 x i16> addrspace(1)* %out
Index: test/CodeGen/AMDGPU/basic-branch.ll
===================================================================
--- test/CodeGen/AMDGPU/basic-branch.ll
+++ test/CodeGen/AMDGPU/basic-branch.ll
@@ -1,7 +1,7 @@
-; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
-; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s
-; RUN: llc
-march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s +; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s +; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNOPT -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNOPT -check-prefix=GCN %s ; GCN-LABEL: {{^}}test_branch: ; GCNNOOPT: v_writelane_b32 @@ -28,10 +28,11 @@ } ; GCN-LABEL: {{^}}test_brcc_i1: -; GCN: buffer_load_ubyte -; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, -; GCN: v_cmp_eq_u32_e32 vcc, -; GCN: s_cbranch_vccnz [[END:BB[0-9]+_[0-9]+]] +; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCNNOOPT: s_and_b32 s{{[0-9]+}}, 1, [[VAL]] +; GCNOPT: s_and_b32 s{{[0-9]+}}, [[VAL]], 1 +; GCN: s_cmp_eq_u32 +; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]] ; GCN: buffer_store_dword Index: test/CodeGen/AMDGPU/cf-loop-on-constant.ll =================================================================== --- test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -105,8 +105,9 @@ ; GCN: s_cbranch_vccnz [[LOOPBB]] ; GCN-NEXT: ; %bb.2 ; GCN-NEXT: s_endpgm -define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind { +define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n) nounwind { entry: + %cond = load volatile i1, i1 addrspace(3)* null br label %for.body for.exit: Index: test/CodeGen/AMDGPU/extract_vector_elt-f16.ll =================================================================== --- test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -58,9 +58,8 @@ } ; GCN-LABEL: {{^}}extract_vector_elt_v3f16: -; GCN: buffer_load_ushort -; GCN: buffer_store_short -; GCN: buffer_store_short +; GCN: s_load_dword s +; GCN: s_load_dword s define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo) #0 { %p0 = extractelement <3 x half> %foo, i32 0 %p1 = extractelement <3 x half> %foo, i32 2 @@ -70,10 +69,11 @@ ret void } +; FIXME: Why sometimes vector shift? 
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3f16: -; SICIVI: buffer_load_ushort -; SICIVI: buffer_load_ushort -; SICIVI: buffer_load_ushort +; GCN: s_load_dword s +; GCN: s_load_dword s +; GCN: s_load_dword s ; GFX9-DAG: global_load_short_d16_hi v ; GFX9-DAG: global_load_short_d16 v @@ -81,7 +81,7 @@ ; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4 ; GFX89: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, v -; SI: v_lshr_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; SI: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; GCN: {{buffer|global}}_store_short define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 { Index: test/CodeGen/AMDGPU/extract_vector_elt-i16.ll =================================================================== --- test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SICIVI %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SICIVI,GFX89 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI,GFX89 %s ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s ; GCN-LABEL: {{^}}extract_vector_elt_v2i16: @@ -58,7 +58,8 @@ } ; GCN-LABEL: {{^}}extract_vector_elt_v3i16: -; GCN: buffer_load_ushort +; GCN: s_load_dword s +; GCN: s_load_dword s ; GCN: buffer_store_short ; GCN: buffer_store_short define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 { @@ -71,10 +72,10 @@ } ; GCN-LABEL: {{^}}extract_vector_elt_v4i16: -; SICI: buffer_load_ushort -; SICI: buffer_load_ushort -; SICI: buffer_store_short -; SICI: buffer_store_short +; SI: s_load_dword s +; SI: s_load_dword s +; SI: buffer_store_short +; SI: buffer_store_short ; VI: s_load_dword s ; VI: s_load_dword s @@ -97,24 +98,19 @@ } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16: -; SICI: buffer_load_ushort -; SICI: buffer_load_ushort -; SICI: buffer_load_ushort +; GCN: s_load_dword s +; GCN: s_load_dword s +; GCN: s_load_dword s +; GCN-NOT: {{buffer|flat|global}} -; SICI: buffer_store_short -; SICI: buffer_store_short -; SICI: buffer_store_short - -; SICI: buffer_load_ushort -; SICI: buffer_store_short - -; GFX9-DAG: global_load_short_d16_hi v -; GFX9-DAG: global_load_short_d16 v +; FIXME: Unnecessary repacking +; GFX9: s_pack_ll_b32_b16 +; GFX9: s_pack_lh_b32_b16 ; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4 -; GFX89: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, v -; SI: v_lshr_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} + +; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s ; GCN: {{buffer|global}}_store_short define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 { Index: test/CodeGen/AMDGPU/extract_vector_elt-i8.ll =================================================================== --- 
test/CodeGen/AMDGPU/extract_vector_elt-i8.ll +++ test/CodeGen/AMDGPU/extract_vector_elt-i8.ll @@ -2,8 +2,9 @@ ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; GCN-LABEL: {{^}}extract_vector_elt_v1i8: -; GCN: buffer_load_ubyte -; GCN: buffer_store_byte +; GCN: s_load_dword [[LOAD:s[0-9]+]] +; GCN: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]] +; GCN: buffer_store_byte [[V_LOAD]] define amdgpu_kernel void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 { %p0 = extractelement <1 x i8> %foo, i32 0 store i8 %p0, i8 addrspace(1)* %out @@ -11,8 +12,10 @@ } ; GCN-LABEL: {{^}}extract_vector_elt_v2i8: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte +; GCN: s_load_dword s +; GCN-NOT: {{flat|buffer|global}} +; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8 +; GCN-NOT: {{flat|buffer|global}} ; GCN: buffer_store_byte ; GCN: buffer_store_byte define amdgpu_kernel void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 { @@ -25,8 +28,10 @@ } ; GCN-LABEL: {{^}}extract_vector_elt_v3i8: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte +; GCN: s_load_dword s +; GCN-NOT: {{flat|buffer|global}} +; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; GCN-NOT: {{flat|buffer|global}} ; GCN: buffer_store_byte ; GCN: buffer_store_byte define amdgpu_kernel void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 { @@ -39,8 +44,10 @@ } ; GCN-LABEL: {{^}}extract_vector_elt_v4i8: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte +; GCN: s_load_dword s +; GCN-NOT: {{flat|buffer|global}} +; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; GCN-NOT: {{flat|buffer|global}} ; GCN: buffer_store_byte ; GCN: buffer_store_byte define amdgpu_kernel void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 { @@ -53,8 +60,10 @@ } ; GCN-LABEL: {{^}}extract_vector_elt_v8i8: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte +; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCN-NOT: {{flat|buffer|global}} +; GCN: s_lshr_b32 s{{[0-9]+}}, [[VAL]], 16 +; GCN-NOT: {{flat|buffer|global}} ; GCN: buffer_store_byte ; GCN: buffer_store_byte define amdgpu_kernel void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 { @@ -67,10 +76,13 @@ } ; GCN-LABEL: {{^}}extract_vector_elt_v16i8: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_store_byte -; GCN: buffer_store_byte +; GCN: s_load_dword [[LOAD0:s[0-9]+]] +; GCN-NOT: {{flat|buffer|global}} +; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[LOAD0]], 16 +; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[LOAD0]] +; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]] +; GCN: buffer_store_byte [[V_ELT2]] +; GCN: buffer_store_byte [[V_LOAD0]] define amdgpu_kernel void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0 { %p0 = extractelement <16 x i8> %foo, i32 0 %p1 = extractelement <16 x i8> %foo, i32 2 @@ -81,10 +93,13 @@ } ; GCN-LABEL: {{^}}extract_vector_elt_v32i8: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_store_byte -; GCN: buffer_store_byte +; GCN: s_load_dword [[LOAD0:s[0-9]+]] +; GCN-NOT: {{flat|buffer|global}} +; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[LOAD0]], 16 +; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[LOAD0]] +; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]] +; GCN: buffer_store_byte [[V_ELT2]] +; GCN: buffer_store_byte [[V_LOAD0]] define amdgpu_kernel void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 { %p0 = extractelement <32 x 
i8> %foo, i32 0
   %p1 = extractelement <32 x i8> %foo, i32 2
@@ -95,10 +110,13 @@
 }
 
 ; GCN-LABEL: {{^}}extract_vector_elt_v64i8:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
+; GCN: s_load_dword [[LOAD0:s[0-9]+]]
+; GCN-NOT: {{flat|buffer|global}}
+; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[LOAD0]], 16
+; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[LOAD0]]
+; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
+; GCN: buffer_store_byte [[V_ELT2]]
+; GCN: buffer_store_byte [[V_LOAD0]]
 define amdgpu_kernel void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0 {
   %p0 = extractelement <64 x i8> %foo, i32 0
   %p1 = extractelement <64 x i8> %foo, i32 2
@@ -110,12 +128,19 @@
 
 ; FIXME: SI generates much worse code that's a pain to match
 
-; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v2i8:
-; VI-DAG: buffer_load_ushort [[LOAD:v[0-9]+]],
-; VI-DAG: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
+; FIXME: 16-bit and 32-bit shift not combined after legalize due to
+; isTypeDesirableForOp in SimplifyDemandedBits
 
-; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
-; VI: v_lshrrev_b16_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[LOAD]]
+; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v2i8:
+; VI: s_load_dword [[LOAD:s[0-9]+]], s[0:1], 0x2c
+; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
+; VI-NOT: {{flat|buffer|global}}
+; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[LOAD]], 8
+; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[ELT1]]
+; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
+; VI: v_or_b32_e32 [[BUILD_VEC:v[0-9]+]], [[ELT0]], [[ELT2]]
+; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
+; VI: v_lshrrev_b16_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[BUILD_VEC]]
 ; VI: buffer_store_byte [[EXTRACT]]
 define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo, i32 %idx) #0 {
   %elt = extractelement <2 x i8> %foo, i32 %idx
@@ -124,14 +149,14 @@
 }
 
 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i8:
-; VI-DAG: buffer_load_ubyte [[LOAD2:v[0-9]+]],
-; VI-DAG: buffer_load_ushort [[LOAD01:v[0-9]+]],
-; VI-DAG: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
-
+; VI: s_load_dword [[LOAD:s[0-9]+]], s[0:1], 0x2c
+; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
+; VI-NOT: {{flat|buffer|global}}
+; VI: s_lshr_b32 [[ELT12:s[0-9]+]], [[LOAD]], 8
+; VI: v_lshlrev_b16_e64 [[ELT1:v[0-9]+]], 8, [[ELT12]]
+; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
+; VI: v_or_b32_e32 [[VEC3:v[0-9]+]], [[ELT0]], [[ELT1]]
 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
-
-; VI: v_lshlrev_b32_e32 [[ELT2:v[0-9]+]], 16, [[LOAD2]]
-; VI: v_or_b32_e32 [[VEC3:v[0-9]+]], [[LOAD01]], [[ELT2]]
 ; VI: v_lshrrev_b32_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[VEC3]]
 ; VI: buffer_store_byte [[EXTRACT]]
 define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 {
@@ -142,30 +167,33 @@
 }
 
 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i8:
-; VI-DAG: s_load_dword [[VEC3:s[0-9]+]], s[0:1], 0x2c
-; VI-DAG: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
+; VI: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34
+; VI: s_load_dword [[VEC4:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
-; VI: s_lshr_b32 [[EXTRACT:s[0-9]+]], [[VEC3]], [[SCALED_IDX]]
+; VI: s_lshr_b32 [[EXTRACT:s[0-9]+]], [[VEC4]], [[SCALED_IDX]]
+
 ; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], [[EXTRACT]]
 ; VI: buffer_store_byte [[V_EXTRACT]]
-define amdgpu_kernel void
@dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo, i32 %idx) #0 { - %p0 = extractelement <4 x i8> %foo, i32 %idx +define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(4)* %vec.ptr, i32 %idx) #0 { + %vec = load <4 x i8>, <4 x i8> addrspace(4)* %vec.ptr + %p0 = extractelement <4 x i8> %vec, i32 %idx %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 store i8 %p0, i8 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v8i8: -; VI-DAG: s_load_dwordx2 [[VEC3:s\[[0-9]+:[0-9]+\]]], s[0:1], 0x2c -; VI-DAG: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34 +; VI: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34 +; VI: s_load_dwordx2 [[VEC8:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 -; VI: s_lshr_b64 s{{\[}}[[EXTRACT_LO:[0-9]+]]:{{[0-9]+\]}}, [[VEC3]], [[SCALED_IDX]] +; VI: s_lshr_b64 s{{\[}}[[EXTRACT_LO:[0-9]+]]:{{[0-9]+\]}}, [[VEC8]], [[SCALED_IDX]] ; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], s[[EXTRACT_LO]] ; VI: buffer_store_byte [[V_EXTRACT]] -define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo, i32 %idx) #0 { - %p0 = extractelement <8 x i8> %foo, i32 %idx +define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> addrspace(4)* %vec.ptr, i32 %idx) #0 { + %vec = load <8 x i8>, <8 x i8> addrspace(4)* %vec.ptr + %p0 = extractelement <8 x i8> %vec, i32 %idx %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 store i8 %p0, i8 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/fabs.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fabs.f16.ll +++ test/CodeGen/AMDGPU/fabs.f16.ll @@ -1,16 +1,21 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s ; DAGCombiner will transform: ; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFFFFFF)) ; unless isFabsFree returns true ; GCN-LABEL: {{^}}s_fabs_free_f16: -; GCN: {{flat|global}}_load_ushort [[VAL:v[0-9]+]], -; GCN: v_and_b32_e32 [[RESULT:v[0-9]+]], 0x7fff, [[VAL]] -; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GCN: s_load_dword [[VAL:s[0-9]+]] + +; CI: s_and_b32 [[RESULT:s[0-9]+]], [[VAL]], 0x7fff +; CI: v_mov_b32_e32 [[V_RESULT:v[0-9]+]], [[RESULT]] +; CI: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]] +; GFX89: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff +; GFX89: v_and_b32_e32 [[V_RESULT:v[0-9]+]], [[VAL]], [[MASK]] +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]] define amdgpu_kernel void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) { %bc= bitcast i16 %in to half %fabs = call half 
@llvm.fabs.f16(half %bc) @@ -19,9 +24,15 @@ } ; GCN-LABEL: {{^}}s_fabs_f16: -; CI: flat_load_ushort [[VAL:v[0-9]+]], -; CI: v_and_b32_e32 [[RESULT:v[0-9]+]], 0x7fff, [[VAL]] -; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GCN: s_load_dword [[VAL:s[0-9]+]] + +; CI: s_and_b32 [[RESULT:s[0-9]+]], [[VAL]], 0x7fff +; CI: v_mov_b32_e32 [[V_RESULT:v[0-9]+]], [[RESULT]] +; CI: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]] + +; GFX89: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff +; GFX89: v_and_b32_e32 [[V_RESULT:v[0-9]+]], [[VAL]], [[MASK]] +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]] define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) { %fabs = call half @llvm.fabs.f16(half %in) store half %fabs, half addrspace(1)* %out @@ -43,7 +54,6 @@ ; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]] ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]] - ; GCN: {{flat|global}}_store_dwordx2 define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in) @@ -52,18 +62,18 @@ } ; GCN-LABEL: {{^}}fabs_fold_f16: -; GCN: {{flat|global}}_load_ushort [[IN0:v[0-9]+]] -; GCN: {{flat|global}}_load_ushort [[IN1:v[0-9]+]] +; GCN: s_load_dword [[IN0:s[0-9]+]] +; GCN: s_lshr_b32 [[IN1:s[0-9]+]], [[IN0]], 16 -; CI-DAG: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[IN0]] -; CI-DAG: v_cvt_f32_f16_e64 [[ABS_CVT1:v[0-9]+]], |[[IN1]]| -; CI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[ABS_CVT1]], [[CVT0]] -; CI: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]] +; CI-DAG: v_cvt_f32_f16_e64 [[CVT0:v[0-9]+]], |[[IN0]]| +; CI-DAG: v_cvt_f32_f16_e32 [[ABS_CVT1:v[0-9]+]], [[IN1]] +; CI-DAG: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[CVT0]], [[ABS_CVT1]] +; CI-DAG: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]] ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVTRESULT]] -; VI-NOT: and -; VI: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN1]]|, [[IN0]] -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX89: v_mov_b32_e32 [[V_IN1:v[0-9]+]], [[IN1]] +; GFX89: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN0]]|, [[V_IN1]] +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) { %fabs = call half @llvm.fabs.f16(half %in0) %fmul = fmul half %fabs, %in1 Index: test/CodeGen/AMDGPU/fneg-fabs.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -3,12 +3,12 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX89 -check-prefix=GFX9 -check-prefix=GCN %s ; GCN-LABEL: {{^}}fneg_fabs_fadd_f16: -; CI: v_cvt_f32_f16_e32 -; CI: v_cvt_f32_f16_e64 [[CVT_ABS_X:v[0-9]+]], |v{{[0-9]+}}| +; CI-DAG: v_cvt_f32_f16_e32 +; CI-DAG: v_cvt_f32_f16_e64 [[CVT_ABS_X:v[0-9]+]], |s{{[0-9]+}}| ; CI: v_sub_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_ABS_X]] ; GFX89-NOT: _and -; GFX89: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}| +; GFX89: v_sub_f16_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}| define amdgpu_kernel void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x, half %y) { %fabs = call half @llvm.fabs.f16(half %x) %fsub = fsub half -0.0, %fabs @@ -19,12 +19,12 @@ ; GCN-LABEL: {{^}}fneg_fabs_fmul_f16: ; CI-DAG: v_cvt_f32_f16_e32 -; CI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG_ABS_X:v[0-9]+]], -|{{v[0-9]+}}| +; CI-DAG: 
v_cvt_f32_f16_e64 [[CVT_NEG_ABS_X:v[0-9]+]], -|{{s[0-9]+}}| ; CI: v_mul_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, [[CVT_NEG_ABS_X]] ; CI: v_cvt_f16_f32_e32 ; GFX89-NOT: _and -; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], {{v[0-9]+}}, -|{{v[0-9]+}}| +; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], {{s[0-9]+}}, -|{{v[0-9]+}}| ; GFX89-NOT: [[MUL]] ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] define amdgpu_kernel void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) { @@ -40,7 +40,7 @@ ; unless isFabsFree returns true ; GCN-LABEL: {{^}}fneg_fabs_free_f16: -; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}} +; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x8000 define amdgpu_kernel void @fneg_fabs_free_f16(half addrspace(1)* %out, i16 %in) { %bc = bitcast i16 %in to half %fabs = call half @llvm.fabs.f16(half %bc) @@ -50,7 +50,7 @@ } ; GCN-LABEL: {{^}}fneg_fabs_f16: -; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}} +; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x8000 define amdgpu_kernel void @fneg_fabs_f16(half addrspace(1)* %out, half %in) { %fabs = call half @llvm.fabs.f16(half %in) %fsub = fsub half -0.0, %fabs Index: test/CodeGen/AMDGPU/fneg.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fneg.f16.ll +++ test/CodeGen/AMDGPU/fneg.f16.ll @@ -28,13 +28,17 @@ ret void } -; GCN-LABEL: {{^}}fneg_free_f16: -; GCN: {{flat|global}}_load_ushort [[NEG_VALUE:v[0-9]+]], +; GCN-LABEL: {{^}}s_fneg_free_f16: +; GCN: s_load_dword [[NEG_VALUE:s[0-9]+]], -; XCI: s_xor_b32 [[XOR:s[0-9]+]], [[NEG_VALUE]], 0x8000{{$}} -; CI: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[NEG_VALUE]] -; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]] -define amdgpu_kernel void @fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 { +; CI: s_xor_b32 [[XOR:s[0-9]+]], [[NEG_VALUE]], 0x8000{{$}} +; CI: v_mov_b32_e32 [[V_XOR:v[0-9]+]], [[XOR]] +; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_XOR]] + +; GFX89: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x8000 +; GFX89: v_xor_b32_e32 [[XOR:v[0-9]+]], [[NEG_VALUE]], [[MASK]] +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]] +define amdgpu_kernel void @s_fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 { %bc = bitcast i16 %in to half %fsub = fsub half -0.0, %bc store half %fsub, half addrspace(1)* %out Index: test/CodeGen/AMDGPU/half.ll =================================================================== --- test/CodeGen/AMDGPU/half.ll +++ test/CodeGen/AMDGPU/half.ll @@ -4,9 +4,9 @@ ; half args should be promoted to float for SI and lower. 
 ; GCN-LABEL: {{^}}load_f16_arg:
-; GCN: flat_load_ushort [[ARG:v[0-9]+]]
-; GCN-NOT: [[ARG]]
-; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ARG]]
+; GCN: s_load_dword [[ARG:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[V_ARG:v[0-9]+]], [[ARG]]
+; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_ARG]]
 define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
   store half %arg, half addrspace(1)* %out
   ret void
@@ -22,8 +22,9 @@
 }
 
 ; GCN-LABEL: {{^}}load_v3f16_arg:
-; GCN: flat_load_ushort
 ; GCN: s_load_dword s
+; GCN: s_load_dword s
+; GCN-NOT: {{buffer|flat|global}}_load_
 ; GCN-NOT: _load
 
 ; GCN-DAG: _store_dword
@@ -76,10 +77,9 @@
 }
 
 ; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg:
-; GCN: flat_load_ushort
-; GCN: flat_load_ushort
-; GCN: flat_load_ushort
-; GCN-NOT: {{buffer|flat|global}}_load
+; GCN: s_load_dword s
+; GCN: s_load_dword s
+; GCN-NOT: _load
 ; GCN: v_cvt_f32_f16_e32
 ; GCN: v_cvt_f32_f16_e32
 ; GCN: v_cvt_f32_f16_e32
@@ -101,20 +101,10 @@
 }
 
 ; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
-; SI: flat_load_ushort
-; SI: flat_load_ushort
-; SI: flat_load_ushort
-; SI: flat_load_ushort
-; SI: flat_load_ushort
-; SI: flat_load_ushort
-; SI: flat_load_ushort
-; SI: flat_load_ushort
-
-
-; VI: s_load_dword s
-; VI: s_load_dword s
-; VI: s_load_dword s
-; VI: s_load_dword s
+; GCN: s_load_dword s
+; GCN: s_load_dword s
+; GCN: s_load_dword s
+; GCN: s_load_dword s
 
 ; GCN: v_cvt_f32_f16_e32
 ; GCN: v_cvt_f32_f16_e32
@@ -134,7 +124,7 @@
 }
 
 ; GCN-LABEL: {{^}}extload_f16_to_f64_arg:
-; GCN: flat_load_ushort [[ARG:v[0-9]+]]
+; GCN: s_load_dword [[ARG:s[0-9]+]]
 ; GCN: v_cvt_f32_f16_e32 v[[ARG_F32:[0-9]+]], [[ARG]]
 ; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], v[[ARG_F32]]
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -145,11 +135,8 @@
 }
 
 ; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg:
-; SI-DAG: flat_load_ushort v
-; SI-DAG: flat_load_ushort v
-
-; VI-DAG: s_load_dword s
-; VI: s_lshr_b32
+; GCN: s_load_dword
+; GCN: s_lshr_b32
 
 ; GCN-DAG: v_cvt_f32_f16_e32
 ; GCN-DAG: v_cvt_f32_f16_e32
@@ -163,9 +150,10 @@
 }
 
 ; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg:
-; GCN-DAG: flat_load_ushort v
-; GCN-DAG: flat_load_ushort v
-; GCN-DAG: flat_load_ushort v
+; GCN: s_load_dword
+; GCN: s_load_dword
+; GCN: s_lshr_b32
+
 ; GCN-DAG: v_cvt_f32_f16_e32
 ; GCN-DAG: v_cvt_f32_f16_e32
 ; GCN-DAG: v_cvt_f32_f16_e32
@@ -180,13 +168,8 @@
 }
 
 ; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg:
-; SI: flat_load_ushort v
-; SI: flat_load_ushort v
-; SI: flat_load_ushort v
-; SI: flat_load_ushort v
-
-; VI: s_load_dword s
-; VI: s_load_dword s
+; GCN: s_load_dword s
+; GCN: s_load_dword s
 
 ; GCN-DAG: v_cvt_f32_f16_e32
 ; GCN-DAG: v_cvt_f32_f16_e32
@@ -204,23 +187,11 @@
 }
 
 ; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg:
-; SI: flat_load_ushort v
-; SI: flat_load_ushort v
-; SI: flat_load_ushort v
-; SI: flat_load_ushort v
-
-; SI: flat_load_ushort v
-; SI: flat_load_ushort v
-; SI: flat_load_ushort v
-; SI: flat_load_ushort v
-
-
-; VI: s_load_dword s
-; VI: s_load_dword s
-; VI: s_load_dword s
-; VI: s_load_dword s
-
-
+; GCN: s_load_dword s
+; GCN-NEXT: s_load_dword s
+; GCN-NEXT: s_load_dword s
+; GCN-NEXT: s_load_dword s
+; GCN-NOT: _load_
 
 ; GCN-DAG: v_cvt_f32_f16_e32
 ; GCN-DAG: v_cvt_f32_f16_e32
@@ -237,10 +208,10 @@
 ; GCN-DAG: v_cvt_f64_f32_e32
 ; GCN-DAG: v_cvt_f64_f32_e32
-; GCN-DAG: v_cvt_f64_f32_e32
-; GCN-DAG: v_cvt_f64_f32_e32
-; GCN-DAG: v_cvt_f64_f32_e32
-; GCN-DAG: v_cvt_f64_f32_e32
+; GCN: v_cvt_f64_f32_e32
+; GCN: v_cvt_f64_f32_e32
+; GCN: v_cvt_f64_f32_e32
+; GCN: v_cvt_f64_f32_e32
 
 ; GCN:
s_endpgm define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 { Index: test/CodeGen/AMDGPU/imm16.ll =================================================================== --- test/CodeGen/AMDGPU/imm16.ll +++ test/CodeGen/AMDGPU/imm16.ll @@ -124,8 +124,8 @@ } ; GCN-LABEL: {{^}}add_inline_imm_0.0_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0, [[VAL]]{{$}} +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 0{{$}} ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0.0 @@ -134,8 +134,8 @@ } ; GCN-LABEL: {{^}}add_inline_imm_0.5_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]{{$}} +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 0.5{{$}} ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0.5 @@ -144,8 +144,8 @@ } ; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], -0.5, [[VAL]]{{$}} +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], -0.5{{$}} ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, -0.5 @@ -154,8 +154,8 @@ } ; GCN-LABEL: {{^}}add_inline_imm_1.0_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1.0, [[VAL]]{{$}} +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 1.0{{$}} ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 1.0 @@ -164,8 +164,8 @@ } ; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], -1.0, [[VAL]]{{$}} +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], -1.0{{$}} ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, -1.0 @@ -174,8 +174,8 @@ } ; GCN-LABEL: {{^}}add_inline_imm_2.0_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2.0, [[VAL]]{{$}} +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 2.0{{$}} ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 2.0 @@ -184,8 +184,8 @@ } ; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], -2.0, [[VAL]]{{$}} +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], -2.0{{$}} ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, -2.0 @@ -194,8 +194,8 @@ } ; GCN-LABEL: {{^}}add_inline_imm_4.0_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], 4.0, [[VAL]]{{$}} +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 4.0{{$}} ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 4.0 @@ -204,8 +204,8 @@ } ; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_f16: 
-; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], -4.0, [[VAL]]{{$}} +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], -4.0{{$}} ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, -4.0 @@ -236,8 +236,8 @@ } ; GCN-LABEL: {{^}}add_inline_imm_1_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1, [[VAL]]{{$}} +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 1{{$}} ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xH0001 @@ -246,8 +246,8 @@ } ; GCN-LABEL: {{^}}add_inline_imm_2_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2, [[VAL]]{{$}} +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 2{{$}} ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xH0002 @@ -256,8 +256,8 @@ } ; GCN-LABEL: {{^}}add_inline_imm_16_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], 16, [[VAL]]{{$}} +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 16{{$}} ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xH0010 @@ -268,9 +268,9 @@ ; GCN-LABEL: {{^}}add_inline_imm_neg_1_f16: ; VI: v_add_u32_e32 [[REG:v[0-9]+]], vcc, -1 ; VI: buffer_store_short [[REG]] -define amdgpu_kernel void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, half %x) { - %xbc = bitcast half %x to i16 - %y = add i16 %xbc, -1 +define amdgpu_kernel void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, i16 addrspace(1)* %in) { + %x = load i16, i16 addrspace(1)* %in + %y = add i16 %x, -1 %ybc = bitcast i16 %y to half store half %ybc, half addrspace(1)* %out ret void @@ -279,9 +279,9 @@ ; GCN-LABEL: {{^}}add_inline_imm_neg_2_f16: ; VI: v_add_u32_e32 [[REG:v[0-9]+]], vcc, 0xfffe ; VI: buffer_store_short [[REG]] -define amdgpu_kernel void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, half %x) { - %xbc = bitcast half %x to i16 - %y = add i16 %xbc, -2 +define amdgpu_kernel void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, i16 addrspace(1)* %in) { + %x = load i16, i16 addrspace(1)* %in + %y = add i16 %x, -2 %ybc = bitcast i16 %y to half store half %ybc, half addrspace(1)* %out ret void @@ -290,17 +290,17 @@ ; GCN-LABEL: {{^}}add_inline_imm_neg_16_f16: ; VI: v_add_u32_e32 [[REG:v[0-9]+]], vcc, 0xfff0 ; VI: buffer_store_short [[REG]] -define amdgpu_kernel void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, half %x) { - %xbc = bitcast half %x to i16 - %y = add i16 %xbc, -16 +define amdgpu_kernel void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, i16 addrspace(1)* %in) { + %x = load i16, i16 addrspace(1)* %in + %y = add i16 %x, -16 %ybc = bitcast i16 %y to half store half %ybc, half addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_63_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], 63, [[VAL]] +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 63 ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xH003F @@ -309,8 +309,8 @@ } ; GCN-LABEL: 
{{^}}add_inline_imm_64_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], 64, [[VAL]] +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 64 ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_64_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xH0040 Index: test/CodeGen/AMDGPU/insert_vector_elt.ll =================================================================== --- test/CodeGen/AMDGPU/insert_vector_elt.ll +++ test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -202,14 +202,21 @@ } ; GCN-LABEL: {{^}}dynamic_insertelement_v2i8: -; VI: buffer_load_ushort [[LOAD:v[0-9]]] -; VI: s_load_dword [[IDX:s[0-9]]] +; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; VI-NOT: _load +; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[LOAD]], 8 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 -; VI: v_lshlrev_b16_e64 [[SHL:v[0-9]+]], [[SCALED_IDX]], -1 -; VI: v_xor_b32_e32 [[NOT:v[0-9]+]], -1, [[SHL]] -; VI: v_and_b32_e32 [[AND0:v[0-9]+]], 5, [[SHL]] -; VI: v_and_b32_e32 [[AND1:v[0-9]+]], [[NOT]], [[LOAD]] -; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[AND0]], [[AND1]] +; VI: v_lshlrev_b16_e64 [[ELT1_SHIFT:v[0-9]+]], 8, [[ELT1]] +; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}} +; VI: v_lshlrev_b16_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], -1 + +; VI: v_xor_b32_e32 [[NOT:v[0-9]+]], -1, [[MASK]] +; VI: v_or_b32_e32 [[BUILD_VECTOR:v[0-9]+]], [[ELT0]], [[ELT1_SHIFT]] + +; VI: v_and_b32_e32 [[AND1:v[0-9]+]], [[NOT]], [[BUILD_VECTOR]] +; VI-DAG: v_and_b32_e32 [[INSERT:v[0-9]+]], 5, [[MASK]] +; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[INSERT]], [[BUILD_VECTOR]] ; VI: buffer_store_short [[OR]] define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind { %vecins = insertelement <2 x i8> %a, i8 5, i32 %b @@ -217,17 +224,32 @@ ret void } +; FIXME: post legalize i16 and i32 shifts aren't merged because of +; isTypeDesirableForOp in SimplifyDemandedBits + ; GCN-LABEL: {{^}}dynamic_insertelement_v3i8: -; VI: buffer_load_ubyte -; VI: buffer_load_ushort -; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 3 -; VI: s_lshl_b32 s{{[0-9]+}}, 0xffff, -; VI: s_not_b32 -; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; VI: v_or_b32_e32 -; VI: v_and_b32 -; VI: v_bfi_b32 -; VI: v_lshrrev_b32 +; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; VI-NOT: _load + +; VI: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[LOAD]], 8 +; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[VEC_HI]] +; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}} +; VI: v_or_b32_e32 [[BUILD_VEC:v[0-9]+]], [[VEC_HI]], [[ELT2]] +; VI: s_and_b32 [[ELT2:s[0-9]+]], [[LOAD]], 0xff0000{{$}} + +; VI: s_mov_b32 [[MASK16:s[0-9]+]], 0xffff{{$}} +; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 +; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], [[MASK16]], [[SCALED_IDX]] + +; VI: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]] +; VI: v_or_b32_sdwa [[SDWA:v[0-9]+]], [[BUILD_VEC]], [[V_ELT2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI: s_not_b32 [[NOT_SHIFT_MASK:s[0-9]+]], [[SHIFTED_MASK]] +; VI: v_and_b32_e32 [[AND_NOT_MASK:v[0-9]+]], [[NOT_SHIFT_MASK]], [[SDWA]] +; VI: v_lshrrev_b32_e32 [[HI2:v[0-9]+]], 16, [[AND_NOT_MASK]] +; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SCALED_IDX]], 5, [[SDWA]] +; VI: buffer_store_short [[BFI]] +; VI: buffer_store_byte [[HI2]] define amdgpu_kernel 
void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind { %vecins = insertelement <3 x i8> %a, i8 5, i32 %b store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4 @@ -235,21 +257,37 @@ } ; GCN-LABEL: {{^}}dynamic_insertelement_v4i8: -; VI: s_load_dword [[VEC:s[0-9]+]] -; VI: s_load_dword [[IDX:s[0-9]]] +; VI: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; VI-NOT: _load + +; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 8 +; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[ELT1]] +; VI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xff{{$}} + + +; VI: s_lshr_b32 [[ELT3:s[0-9]+]], [[VEC]], 24 +; VI: s_lshr_b32 [[ELT2:s[0-9]+]], [[VEC]], 16 +; VI: v_lshlrev_b16_e64 v{{[0-9]+}}, 8, [[ELT3]] +; VI: v_or_b32_e32 +; VI: v_or_b32_sdwa ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 -; VI-DAG: s_lshl_b32 [[MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]] -; VI-DAG: v_mov_b32_e32 [[V_VEC:v[0-9]+]], [[VEC]] -; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[MASK]], 5, [[V_VEC]] +; VI: v_or_b32_sdwa +; VI: s_lshl_b32 +; VI: v_bfi_b32 define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind { %vecins = insertelement <4 x i8> %a, i8 5, i32 %b store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}dynamic_insertelement_v8i8: -; VI: s_load_dwordx2 [[VEC:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; GCN-LABEL: {{^}}s_dynamic_insertelement_v8i8: +; VI-NOT: {{buffer|flat|global}} ; VI: s_load_dword [[IDX:s[0-9]]] +; VI-NOT: {{buffer|flat|global}} +; VI: s_load_dwordx2 [[VEC:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 +; VI-NOT: {{buffer|flat|global}} + ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 ; VI-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0 ; VI-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff @@ -261,29 +299,22 @@ ; VI: v_mov_b32_e32 v[[V_RESULT0:[0-9]+]], s[[RESULT0]] ; VI: v_mov_b32_e32 v[[V_RESULT1:[0-9]+]], s[[RESULT1]] ; VI: buffer_store_dwordx2 v{{\[}}[[V_RESULT0]]:[[V_RESULT1]]{{\]}} -define amdgpu_kernel void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind { +define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind { + %a = load <8 x i8>, <8 x i8> addrspace(4)* %a.ptr, align 4 %vecins = insertelement <8 x i8> %a, i8 5, i32 %b store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8 ret void } ; GCN-LABEL: {{^}}dynamic_insertelement_v16i8: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte +; GCN: s_load_dwordx2 +; GCN: s_load_dword s +; GCN: s_load_dword s +; GCN: s_load_dword s +; GCN: s_load_dword s +; GCN: s_load_dword s +; GCN-NOT: _load_ + ; GCN: buffer_store_byte ; GCN: buffer_store_byte Index: test/CodeGen/AMDGPU/kernel-args.ll =================================================================== --- test/CodeGen/AMDGPU/kernel-args.ll +++ test/CodeGen/AMDGPU/kernel-args.ll @@ -11,12 +11,10 @@ ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb ; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c 
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff -; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8 -; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0 -; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]] -; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]] -; FIXME: Should be using s_load_dword -; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}} + +; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 +; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff + define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { entry: @@ -31,13 +29,9 @@ ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c -; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8 -; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0 -; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]] -; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]] -; FIXME: Should be using s_load_dword -; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}} +; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 +; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind { entry: %0 = zext i8 %in to i32 @@ -50,14 +44,12 @@ ; HSA-VI: kernarg_segment_alignment = 4 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb + ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c -; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8 -; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0 -; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]] -; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]] -; FIXME: Should be using s_load_dword -; HSA-VI: flat_load_sbyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}} +; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 +; HSA-VI: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]] +; HSA-VI: flat_store_dword define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind { entry: %0 = sext i8 %in to i32 @@ -71,15 +63,13 @@ ; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb + ; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c ; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff -; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8 -; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0 -; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]] -; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]] -; FIXME: Should be using s_load_dword -; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}} +; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 +; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}} +; HSA-VI: flat_store_dword define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { entry: %0 = zext i16 %in to i32 @@ -94,13 +84,10 @@ ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c -; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8 -; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0 -; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]] -; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]] -; FIXME: Should be using s_load_dword -; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}} +; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 +; HSA-VI: 
s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}} +; HSA-VI: flat_store_dword define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind { entry: %0 = zext i16 %in to i32 @@ -115,13 +102,11 @@ ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c -; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8 -; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0 -; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]] -; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]] -; FIXME: Should be using s_load_dword -; HSA-VI: flat_load_sshort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}} + +; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 +; HSA-VI: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]] +; HSA-VI: flat_store_dword define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind { entry: %0 = sext i16 %in to i32 @@ -163,10 +148,8 @@ ; EG: VTX_READ_8 ; EG: VTX_READ_8 -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte - -; HSA: flat_load_ushort +; GCN: s_load_dword s +; GCN-NOT: {{buffer|flat|global}}_load_ define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) { entry: store <2 x i8> %in, <2 x i8> addrspace(1)* %out @@ -226,15 +209,9 @@ ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42 -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte - -; MESA-VI: buffer_load_ushort -; MESA-VI: buffer_load_ubyte -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ubyte +; GCN: s_load_dword s +; GCN-NOT: {{buffer|flat|global}}_load_ define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind { entry: store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4 @@ -249,8 +226,8 @@ ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48 -; GCN-DAG: s_load_dword s -; GCN-DAG: {{buffer|flat}}_load_ushort +; GCN: s_load_dword s +; GCN: s_load_dword s define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind { entry: store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4 @@ -294,12 +271,8 @@ ; EG: VTX_READ_8 ; EG: VTX_READ_8 -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte - -; VI: s_load_dword s +; GCN: s_load_dword s +; GCN-NOT: {{buffer|flat|global}}_load_ define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) { entry: store <4 x i8> %in, <4 x i8> addrspace(1)* %out @@ -314,7 +287,8 @@ ; EG: VTX_READ_16 ; EG: VTX_READ_16 -; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb +; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xc ; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9 ; MESA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x2c @@ -361,6 +335,7 @@ ret void } +; FIXME: Lots of unpack and re-pack junk on VI ; FUNC-LABEL: {{^}}v8i8_arg: ; HSA-VI: kernarg_segment_byte_size = 16 ; HSA-VI: kernarg_segment_alignment = 4 @@ -373,16 +348,23 @@ ; EG: VTX_READ_8 ; EG: VTX_READ_8 -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; VI: s_load_dwordx2 -; VI: s_load_dwordx2 +; SI: 
s_load_dword s +; SI: s_load_dword s +; SI: s_load_dwordx2 s +; SI-NOT: {{buffer|flat|global}}_load + +; VI: s_load_dword s +; VI: s_load_dword s + +; VI: v_lshlrev_b16 +; VI: v_or_b32_e32 +; VI: v_or_b32_sdwa +; VI: v_or_b32_sdwa +; VI: v_lshlrev_b16 +; VI: s_lshr_b32 +; VI: v_or_b32_sdwa +; VI: v_or_b32_sdwa define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) { entry: store <8 x i8> %in, <8 x i8> addrspace(1)* %out @@ -401,9 +383,13 @@ ; EG: VTX_READ_16 ; EG: VTX_READ_16 +; SI: s_load_dword s +; SI: s_load_dword s +; SI: s_load_dword s +; SI: s_load_dword s ; SI: s_load_dwordx2 -; SI: s_load_dwordx2 -; SI: s_load_dwordx2 +; SI-NOT: {{buffer|flat|global}}_load + ; VI: s_load_dwordx2 ; VI: s_load_dword s @@ -454,6 +440,8 @@ ret void } +; FIXME: Pack/repack on VI + ; FUNC-LABEL: {{^}}v16i8_arg: ; HSA-VI: kernarg_segment_byte_size = 32 ; HSA-VI: kernarg_segment_alignment = 4 ; EG: VTX_READ_8 ; EG: VTX_READ_8 ; EG: VTX_READ_8 ; EG: VTX_READ_8 ; EG: VTX_READ_8 ; EG: VTX_READ_8 ; EG: VTX_READ_8 ; EG: VTX_READ_8 -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte +; SI: s_load_dword s +; SI: s_load_dword s +; SI: s_load_dword s +; SI: s_load_dword s +; SI: s_load_dwordx2 +; SI-NOT: {{buffer|flat|global}}_load -; VI: s_load_dwordx2 -; VI: s_load_dwordx2 -; VI: s_load_dwordx2 + +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s + +; VI: s_lshr_b32 +; VI: v_lshlrev_b16 +; VI: s_lshr_b32 +; VI: s_lshr_b32 +; VI: v_or_b32_sdwa +; VI: v_or_b32_sdwa +; VI: v_lshlrev_b16 +; VI: v_lshlrev_b16 +; VI: v_or_b32_sdwa +; VI: v_or_b32_sdwa +; VI: v_lshlrev_b16 +; VI: v_lshlrev_b16 +; VI: v_or_b32_sdwa +; VI: v_or_b32_sdwa define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) { entry: store <16 x i8> %in, <16 x i8> addrspace(1)* %out @@ -508,6 +503,7 @@ ; EG: VTX_READ_16 ; EG: VTX_READ_16 ; EG: VTX_READ_16 + ; EG: VTX_READ_16 ; EG: VTX_READ_16 ; EG: VTX_READ_16 @@ -524,9 +520,13 @@ ; SI: s_load_dword s ; SI: s_load_dword s ; SI: s_load_dword s -; SI: s_load_dwordx2 -; SI: s_load_dwordx2 -; SI: s_load_dwordx2 +; SI: s_load_dword s +; SI: s_load_dword s +; SI: s_load_dword s +; SI: s_load_dword s + +; SI-NOT: {{buffer|flat|global}}_load + ; VI: s_load_dword s ; VI: s_load_dword s @@ -634,10 +634,9 @@ ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 -; SI: buffer_load_ubyte -; SI: v_and_b32_e32 -; SI: buffer_store_byte -; SI: s_endpgm +; GCN: s_load_dword s +; GCN: s_and_b32 +; GCN: {{buffer|flat}}_store_byte define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind { store i1 %x, i1 addrspace(1)* %out, align 1 ret void } @@ -647,9 +646,8 @@ ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 -; SI: buffer_load_ubyte -; SI: buffer_store_dword -; SI: s_endpgm +; GCN: s_load_dword +; GCN: {{buffer|flat}}_store_dword define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { %ext = zext i1 %x to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 ret void } @@ -660,9 +658,8 @@ ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 -; SI: buffer_load_ubyte -; SI: buffer_store_dwordx2 -; SI: s_endpgm +; GCN: s_load_dword s +; GCN: {{buffer|flat}}_store_dwordx2 
define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { %ext = zext i1 %x to i64 store i64 %ext, i64 addrspace(1)* %out, align 8 ret void } @@ -673,9 +670,8 @@ ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 -; SI: buffer_load_ubyte -; SI: buffer_store_dword -; SI: s_endpgm +; GCN: s_load_dword +; GCN: {{buffer|flat}}_store_dword define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { %ext = sext i1 %x to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 ret void } @@ -686,11 +682,9 @@ ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 -; SI: buffer_load_ubyte -; SI: v_bfe_i32 -; SI: v_ashrrev_i32 -; SI: buffer_store_dwordx2 -; SI: s_endpgm +; GCN: s_load_dword +; GCN: s_bfe_i64 +; GCN: {{buffer|flat}}_store_dwordx2 define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { %ext = sext i1 %x to i64 store i64 %ext, i64 addrspace(1)* %out, align 8 ret void } Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll @@ -3,8 +3,9 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,GFX9 %s ; GCN-LABEL: {{^}}buffer_store_format_d16_x: -; GCN: {{buffer|flat|global}}_load_ushort v[[LO:[0-9]+]] -; GCN: buffer_store_format_d16_x v[[LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; GCN: s_load_dword s[[LO:[0-9]+]] +; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]] +; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, half %data, i32 %index) { main_body: call void @llvm.amdgcn.buffer.store.format.f16(half %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) Index: test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll @@ -24,9 +24,10 @@ } ; GCN-LABEL: {{^}}class_f16_fabs: -; GCN-DAG: buffer_load_ushort v[[SA_F16:[0-9]+]] -; GCN-DAG: s_load_dword s[[SB_I32:[0-9]+]] -; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |v[[SA_F16]]|, s[[SB_I32]] +; GCN: s_load_dword s[[SA_F16:[0-9]+]] +; GCN: s_load_dword s[[SB_I32:[0-9]+]] +; GCN: v_mov_b32_e32 [[V_B_I32:v[0-9]+]], s[[SB_I32]] +; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |s[[SA_F16]]|, [[V_B_I32]] ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm @@ -42,10 +43,11 @@ ret void } -; GCN-LABEL: {{^}}class_f16_fneg -; GCN: buffer_load_ushort v[[SA_F16:[0-9]+]] +; GCN-LABEL: {{^}}class_f16_fneg: +; GCN: s_load_dword s[[SA_F16:[0-9]+]] ; GCN: s_load_dword s[[SB_I32:[0-9]+]] -; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -v[[SA_F16]], s[[SB_I32]] +; GCN: v_mov_b32_e32 [[V_B_I32:v[0-9]+]], s[[SB_I32]] +; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -s[[SA_F16]], [[V_B_I32]] ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm @@ -61,10 +63,11 @@ ret void } -; GCN-LABEL: {{^}}class_f16_fabs_fneg -; GCN-DAG: buffer_load_ushort v[[SA_F16:[0-9]+]] -; GCN-DAG: s_load_dword s[[SB_I32:[0-9]+]] -; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|v[[SA_F16]]|, 
s[[SB_I32]] +; GCN-LABEL: {{^}}class_f16_fabs_fneg: +; GCN: s_load_dword s[[SA_F16:[0-9]+]] +; GCN: s_load_dword s[[SB_I32:[0-9]+]] +; GCN: v_mov_b32_e32 [[V_B_I32:v[0-9]+]], s[[SB_I32]] +; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|s[[SA_F16]]|, [[V_B_I32]] ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm @@ -82,8 +85,8 @@ } ; GCN-LABEL: {{^}}class_f16_1: -; GCN: buffer_load_ushort v[[SA_F16:[0-9]+]] -; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], v[[SA_F16]], 1{{$}} +; GCN: s_load_dword s[[SA_F16:[0-9]+]] +; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s[[SA_F16]], 1{{$}} ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm @@ -98,8 +101,8 @@ } ; GCN-LABEL: {{^}}class_f16_64 -; GCN: buffer_load_ushort v[[SA_F16:[0-9]+]] -; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], v[[SA_F16]], 64{{$}} +; GCN: s_load_dword s[[SA_F16:[0-9]+]] +; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s[[SA_F16]], 64{{$}} ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm @@ -114,9 +117,9 @@ } ; GCN-LABEL: {{^}}class_f16_full_mask: -; GCN: buffer_load_ushort v[[SA_F16:[0-9]+]] +; GCN: s_load_dword s[[SA_F16:[0-9]+]] ; VI: v_mov_b32_e32 v[[MASK:[0-9]+]], 0x3ff{{$}} -; VI: v_cmp_class_f16_e32 vcc, v[[SA_F16]], v[[MASK]] +; VI: v_cmp_class_f16_e32 vcc, s[[SA_F16]], v[[MASK]] ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm @@ -130,10 +133,10 @@ ret void } -; GCN-LABEL: {{^}}class_f16_nine_bit_mask -; GCN: buffer_load_ushort v[[SA_F16:[0-9]+]] +; GCN-LABEL: {{^}}class_f16_nine_bit_mask: +; GCN: s_load_dword s[[SA_F16:[0-9]+]] ; VI: v_mov_b32_e32 v[[MASK:[0-9]+]], 0x1ff{{$}} -; VI: v_cmp_class_f16_e32 vcc, v[[SA_F16]], v[[MASK]] +; VI: v_cmp_class_f16_e32 vcc, s[[SA_F16]], v[[MASK]] ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll @@ -50,8 +50,9 @@ } ; GCN-LABEL: {{^}}image_store_f16 -; GCN: {{flat|global}}_load_ushort v[[LO:[0-9]+]], -; GCN: image_store v[[LO]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 unorm d16 +; GCN: s_load_dword s[[LO:[0-9]+]], +; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]] +; GCN: image_store v[[V_LO]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 unorm d16 define amdgpu_kernel void @image_store_f16(half %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) { main_body: call void @llvm.amdgcn.image.store.f16.v4i32.v8i32(half %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 1, i1 false, i1 false, i1 false, i1 false) Index: test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll @@ -4,8 +4,9 @@ ; GCN-LABEL: {{^}}tbuffer_store_d16_x: -; GCN: {{flat|global}}_load_ushort v[[LO:[0-9]+]], -; GCN: tbuffer_store_format_d16_x v[[LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +; GCN: s_load_dword s[[S_LO:[0-9]+]] +; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]] +; GCN: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, 
nfmt:2, 0 idxen define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 %vindex) { main_body: call void @llvm.amdgcn.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) Index: test/CodeGen/AMDGPU/lshr.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/lshr.v2i16.ll +++ test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -18,10 +18,18 @@ ; VI-DAG: s_lshl_b32 ; VI: v_or_b32_e32 -; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; CI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16 -; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI: s_load_dword s +; CI-NEXT: s_load_dword s +; CI-NOT: {{buffer|flat}} +; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}} +; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; CI: s_and_b32 +; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; CI: s_and_b32 +; CI: v_bfe_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 16 +; CI: s_lshl_b32 +; CI: v_or_b32_e32 define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { %result = lshr <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out Index: test/CodeGen/AMDGPU/min.ll =================================================================== --- test/CodeGen/AMDGPU/min.ll +++ test/CodeGen/AMDGPU/min.ll @@ -76,32 +76,25 @@ ; extloads with mubuf instructions. ; FUNC-LABEL: {{^}}s_test_imin_sle_v4i8: -; GCN: buffer_load_sbyte -; GCN: buffer_load_sbyte -; GCN: buffer_load_sbyte -; GCN: buffer_load_sbyte -; GCN: buffer_load_sbyte -; GCN: buffer_load_sbyte -; GCN: buffer_load_sbyte -; GCN: buffer_load_sbyte +; GCN: s_load_dword s +; GCN: s_load_dword s +; GCN-NOT: _load_ -; SI: v_min_i32 -; SI: v_min_i32 -; SI: v_min_i32 -; SI: v_min_i32 +; SI: s_min_i32 +; SI: s_min_i32 +; SI: s_min_i32 +; SI: s_min_i32 -; VI: v_min_i32 -; VI: v_min_i32 -; VI: v_min_i32 -; VI: v_min_i32 +; VI: s_min_i32 +; VI: s_min_i32 +; VI: s_min_i32 +; VI: s_min_i32 ; GFX9: v_min_i16 ; GFX9: v_min_i16 ; GFX9: v_min_i16 ; GFX9: v_min_i16 -; GCN: s_endpgm - ; EG: MIN_INT ; EG: MIN_INT ; EG: MIN_INT @@ -114,8 +107,15 @@ } ; FUNC-LABEL: {{^}}s_test_imin_sle_v2i16: -; SI: v_min_i32 -; SI: v_min_i32 +; GCN: s_load_dword s +; GCN: s_load_dword s + +; SI: s_ashr_i32 +; SI: s_ashr_i32 +; SI: s_sext_i32_i16 +; SI: s_sext_i32_i16 +; SI: s_min_i32 +; SI: s_min_i32 ; VI: s_sext_i32_i16 ; VI: s_sext_i32_i16 @@ -134,10 +134,11 @@ } ; FUNC-LABEL: {{^}}s_test_imin_sle_v4i16: -; SI: v_min_i32 -; SI: v_min_i32 -; SI: v_min_i32 -; SI: v_min_i32 +; SI-NOT: buffer_load +; SI: s_min_i32 +; SI: s_min_i32 +; SI: s_min_i32 +; SI: s_min_i32 ; VI: s_min_i32 ; VI: s_min_i32 @@ -453,14 +454,15 @@ } ; FUNC-LABEL: {{^}}s_test_umin_ult_v8i16: -; SI: v_min_u32 -; SI: v_min_u32 -; SI: v_min_u32 -; SI: v_min_u32 -; SI: v_min_u32 -; SI: v_min_u32 -; SI: v_min_u32 -; SI: v_min_u32 +; GCN-NOT: {{buffer|flat|global}}_load +; SI: s_min_u32 +; SI: s_min_u32 +; SI: s_min_u32 +; SI: s_min_u32 +; SI: s_min_u32 +; SI: s_min_u32 +; SI: s_min_u32 +; SI: s_min_u32 ; VI: s_min_u32 ; VI: s_min_u32 Index: test/CodeGen/AMDGPU/reduce-store-width-alignment.ll =================================================================== --- test/CodeGen/AMDGPU/reduce-store-width-alignment.ll +++ test/CodeGen/AMDGPU/reduce-store-width-alignment.ll @@ -40,7 +40,10 @@ ; GCN-LABEL: {{^}}store_v4i16_as_v2i32_align_4: ; 
GCN: s_load_dword s -; GCN: s_load_dwordx2 s +; GCN-NEXT: s_load_dword s +; GCN-NEXT: s_load_dword s +; GCN-NOT: {{buffer|flat|global}} + ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}} define amdgpu_kernel void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 { %x.bc = bitcast <4 x i16> %x to <2 x i32> Index: test/CodeGen/AMDGPU/select-i1.ll =================================================================== --- test/CodeGen/AMDGPU/select-i1.ll +++ test/CodeGen/AMDGPU/select-i1.ll @@ -1,11 +1,11 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s -; FIXME: This should go in existing select.ll test, except the current testcase there is broken on SI +; FIXME: This should go in existing select.ll test, except the current testcase there is broken on GCN -; FUNC-LABEL: {{^}}select_i1: -; SI: v_cndmask_b32 -; SI-NOT: v_cndmask_b32 +; GCN-LABEL: {{^}}select_i1: +; GCN: v_cndmask_b32 +; GCN-NOT: v_cndmask_b32 define amdgpu_kernel void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind { %cmp = icmp ugt i32 %cond, 5 %sel = select i1 %cmp, i1 %a, i1 %b @@ -13,12 +13,16 @@ ret void } -; FUNC-LABEL: {{^}}s_minmax_i1: -; SI-DAG: buffer_load_ubyte [[COND:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 -; SI-DAG: buffer_load_ubyte [[A:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:45 -; SI-DAG: buffer_load_ubyte [[B:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46 -; SI: v_cmp_eq_u32_e32 vcc, 1, [[COND]] -; SI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] +; GCN-LABEL: {{^}}s_minmax_i1: +; GCN: s_load_dword [[LOAD:s[0-9]+]], +; GCN-DAG: s_lshr_b32 [[A:s[0-9]+]], [[LOAD]], 8 +; GCN-DAG: s_lshr_b32 [[B:s[0-9]+]], [[LOAD]], 16 +; GCN-DAG: s_and_b32 [[COND:s[0-9]+]], 1, [[LOAD]] +; GCN-DAG: v_mov_b32_e32 [[V_A:v[0-9]+]], [[A]] +; GCN-DAG: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]] +; GCN: v_cmp_eq_u32_e64 vcc, [[COND]], 1 +; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], [[V_B]], [[V_A]] +; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, [[SEL]] define amdgpu_kernel void @s_minmax_i1(i1 addrspace(1)* %out, i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind { %cmp = icmp slt i1 %cond, false %sel = select i1 %cmp, i1 %a, i1 %b Index: test/CodeGen/AMDGPU/sext-in-reg.ll =================================================================== --- test/CodeGen/AMDGPU/sext-in-reg.ll +++ test/CodeGen/AMDGPU/sext-in-reg.ll @@ -663,10 +663,10 @@ ; FUNC-LABEL: {{^}}sext_in_reg_v3i1_to_v3i16: ; GFX9: v_pk_add_u16 -; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}} -; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}} ; GFX9: v_pk_add_u16 ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}} +; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}} +; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}} ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}} define amdgpu_kernel void @sext_in_reg_v3i1_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 { %c = add <3 x i16> %a, %b ; add to prevent folding into extload @@ -702,11 +702,10 @@ ; FUNC-LABEL: {{^}}sext_in_reg_v3i8_to_v3i16: ; GFX9: v_pk_add_u16 -; GFX9: 
v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} -; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}} - ; GFX9: v_pk_add_u16 ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} +; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} +; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}} ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}} define amdgpu_kernel void @sext_in_reg_v3i8_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 { %c = add <3 x i16> %a, %b ; add to prevent folding into extload Index: test/CodeGen/AMDGPU/shl.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/shl.v2i16.ll +++ test/CodeGen/AMDGPU/shl.v2i16.ll @@ -14,14 +14,21 @@ ; VI: s_lshr_b32 ; VI: s_and_b32 ; VI: s_and_b32 -; SI: s_and_B32 -; SI: s_or_b32 +; VI: s_and_b32 +; VI: s_or_b32 + -; CI-DAG: v_lshlrev_b32_e32 -; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} -; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; CI: v_or_b32_e32 +; CI: s_load_dword s +; CI: s_load_dword s +; CI: s_lshr_b32 +; CI: s_and_b32 +; CI: s_lshr_b32 +; CI: s_lshl_b32 +; CI: s_lshl_b32 +; CI: s_lshl_b32 +; CI: s_and_b32 +; CI: s_or_b32 +; CI: _store_dword define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { %result = shl <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out Index: test/CodeGen/AMDGPU/sminmax.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -18,15 +18,24 @@ ; SI: s_and_b32 ; SI: s_or_b32 -; CI: v_sub_i32_e32 -; CI-DAG: v_sub_i32_e32 -; CI: v_bfe_i32 -; CI-DAG: v_bfe_i32 -; CI-DAG: v_add_i32_e32 -; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16 -; CI: v_add_i32_e32 -; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, -; CI: v_or_b32_e32 +; CI-NOT: {{buffer|flat}}_load +; CI: s_load_dword s +; CI-NOT: {{buffer|flat}}_load +; CI: s_lshr_b32 +; CI: s_ashr_i32 +; CI: s_sext_i32_i16 +; CI: s_sub_i32 +; CI: s_sub_i32 +; CI: s_sext_i32_i16 +; CI: s_sext_i32_i16 +; CI: s_max_i32 +; CI: s_max_i32 +; CI: s_lshl_b32 +; CI: s_add_i32 +; CI: s_add_i32 +; CI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xffff +; CI: s_or_b32 + define amdgpu_kernel void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 { %neg = sub <2 x i16> zeroinitializer, %val %cond = icmp sgt <2 x i16> %val, %neg Index: test/CodeGen/AMDGPU/widen-smrd-loads.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -0,0 +1,169 @@ +; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s + +; GCN-LABEL: {{^}}widen_i16_constant_load: +; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCN: s_addk_i32 [[VAL]], 0x3e7 +; GCN: s_or_b32 [[OR:s[0-9]+]], [[VAL]], 4 +define amdgpu_kernel void @widen_i16_constant_load(i16 addrspace(4)* %arg) { + %load = load i16, i16 addrspace(4)* %arg, align 4 + %add = add i16 %load, 999 + %or = or i16 %add, 4 + store i16 %or, i16 addrspace(1)* null + ret void +} + +; GCN-LABEL: {{^}}widen_i16_constant_load_zext_i32: +; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCN: s_and_b32 [[TRUNC:s[0-9]+]], [[VAL]], 
0xffff{{$}} +; GCN: s_addk_i32 [[TRUNC]], 0x3e7 +; GCN: s_or_b32 [[OR:s[0-9]+]], [[TRUNC]], 4 +define amdgpu_kernel void @widen_i16_constant_load_zext_i32(i16 addrspace(4)* %arg) { + %load = load i16, i16 addrspace(4)* %arg, align 4 + %ext = zext i16 %load to i32 + %add = add i32 %ext, 999 + %or = or i32 %add, 4 + store i32 %or, i32 addrspace(1)* null + ret void +} + +; GCN-LABEL: {{^}}widen_i16_constant_load_sext_i32: +; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCN: s_sext_i32_i16 [[EXT:s[0-9]+]], [[VAL]] +; GCN: s_addk_i32 [[EXT]], 0x3e7 +; GCN: s_or_b32 [[OR:s[0-9]+]], [[EXT]], 4 +define amdgpu_kernel void @widen_i16_constant_load_sext_i32(i16 addrspace(4)* %arg) { + %load = load i16, i16 addrspace(4)* %arg, align 4 + %ext = sext i16 %load to i32 + %add = add i32 %ext, 999 + %or = or i32 %add, 4 + store i32 %or, i32 addrspace(1)* null + ret void +} + +; GCN-LABEL: {{^}}widen_i17_constant_load: +; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 34 +; GCN: s_or_b32 [[OR:s[0-9]+]], [[ADD]], 4 +; GCN: s_bfe_u32 s{{[0-9]+}}, [[OR]], 0x10010 +define amdgpu_kernel void @widen_i17_constant_load(i17 addrspace(4)* %arg) { + %load = load i17, i17 addrspace(4)* %arg, align 4 + %add = add i17 %load, 34 + %or = or i17 %add, 4 + store i17 %or, i17 addrspace(1)* null + ret void +} + +; GCN-LABEL: {{^}}widen_f16_constant_load: +; GCN: s_load_dword [[VAL:s[0-9]+]] +; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[VAL]] +; SI: v_add_f32_e32 [[ADD:v[0-9]+]], 4.0, [[CVT]] + +; VI: v_add_f16_e64 [[ADD:v[0-9]+]], [[VAL]], 4.0 +define amdgpu_kernel void @widen_f16_constant_load(half addrspace(4)* %arg) { + %load = load half, half addrspace(4)* %arg, align 4 + %add = fadd half %load, 4.0 + store half %add, half addrspace(1)* null + ret void +} + +; FIXME: valu usage on VI +; GCN-LABEL: {{^}}widen_v2i8_constant_load: +; GCN: s_load_dword [[VAL:s[0-9]+]] + +; SI: s_add_i32 +; SI: s_or_b32 +; SI: s_addk_i32 +; SI: s_and_b32 +; SI: s_or_b32 +; SI: s_or_b32 + +; VI: s_add_i32 +; VI: v_add_u32_sdwa +; VI: v_or_b32_sdwa +; VI: v_or_b32_e32 +define amdgpu_kernel void @widen_v2i8_constant_load(<2 x i8> addrspace(4)* %arg) { + %load = load <2 x i8>, <2 x i8> addrspace(4)* %arg, align 4 + %add = add <2 x i8> %load, <i8 12, i8 44> + %or = or <2 x i8> %add, <i8 4, i8 0> + store <2 x i8> %or, <2 x i8> addrspace(1)* null + ret void +} + +; GCN-LABEL: {{^}}no_widen_i16_constant_divergent_load: +; GCN: {{buffer|flat}}_load_ushort +define amdgpu_kernel void @no_widen_i16_constant_divergent_load(i16 addrspace(4)* %arg) { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = zext i32 %tid to i64 + %gep.arg = getelementptr inbounds i16, i16 addrspace(4)* %arg, i64 %tid.ext + %load = load i16, i16 addrspace(4)* %gep.arg, align 4 + %add = add i16 %load, 999 + %or = or i16 %add, 4 + store i16 %or, i16 addrspace(1)* null + ret void +} + +; GCN-LABEL: {{^}}widen_i1_constant_load: +; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCN: s_and_b32 {{s[0-9]+}}, [[VAL]], 1{{$}} +define amdgpu_kernel void @widen_i1_constant_load(i1 addrspace(4)* %arg) { + %load = load i1, i1 addrspace(4)* %arg, align 4 + %and = and i1 %load, true + store i1 %and, i1 addrspace(1)* null + ret void +} + +; GCN-LABEL: {{^}}widen_i16_zextload_i64_constant_load: +; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCN: s_and_b32 [[TRUNC:s[0-9]+]], [[VAL]], 0xffff{{$}} +; GCN: s_addk_i32 [[TRUNC]], 0x3e7 +; GCN: s_or_b32 [[OR:s[0-9]+]], [[TRUNC]], 4 +define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(i16 addrspace(4)* %arg) { + %load = load i16, i16 addrspace(4)* %arg, align 4 + 
%zext = zext i16 %load to i32 + %add = add i32 %zext, 999 + %or = or i32 %add, 4 + store i32 %or, i32 addrspace(1)* null + ret void +} + +; GCN-LABEL: {{^}}widen_i1_zext_to_i64_constant_load: +; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCN: s_and_b32 [[AND:s[0-9]+]], [[VAL]], 1 +; GCN: s_add_u32 [[ADD:s[0-9]+]], [[AND]], 0x3e7 +; GCN: s_addc_u32 s{{[0-9]+}}, 0, 0 +define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(i1 addrspace(4)* %arg) { + %load = load i1, i1 addrspace(4)* %arg, align 4 + %zext = zext i1 %load to i64 + %add = add i64 %zext, 999 + store i64 %add, i64 addrspace(1)* null + ret void +} + +; GCN-LABEL: {{^}}widen_i16_constant32_load: +; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCN: s_addk_i32 [[VAL]], 0x3e7 +; GCN: s_or_b32 [[OR:s[0-9]+]], [[VAL]], 4 +define amdgpu_kernel void @widen_i16_constant32_load(i16 addrspace(6)* %arg) { + %load = load i16, i16 addrspace(6)* %arg, align 4 + %add = add i16 %load, 999 + %or = or i16 %add, 4 + store i16 %or, i16 addrspace(1)* null + ret void +} + +; GCN-LABEL: {{^}}widen_i16_global_invariant_load: +; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCN: s_addk_i32 [[VAL]], 0x3e7 +; GCN: s_or_b32 [[OR:s[0-9]+]], [[VAL]], 1 +define amdgpu_kernel void @widen_i16_global_invariant_load(i16 addrspace(1)* %arg) { + %load = load i16, i16 addrspace(1)* %arg, align 4, !invariant.load !0 + %add = add i16 %load, 999 + %or = or i16 %add, 1 + store i16 %or, i16 addrspace(1)* null + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() + +!0 = !{}
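
The RUN lines of this new test pass -amdgpu-codegenprepare-widen-constant-loads=0, which keeps the IR-level widening out of the way so the checks above exercise the DAG-level path. For reference, the IR-level rewrite that the flag disables has roughly the following shape for a zero-extended i16 use. This is a hand-written sketch with illustrative function and value names, not literal output from the pass:

; Before: a sub-dword constant-address-space load, which would otherwise
; select as a buffer/flat ushort load.
define amdgpu_kernel void @widen_sketch_before(i16 addrspace(4)* %ptr, i32 addrspace(1)* %out) {
  %val = load i16, i16 addrspace(4)* %ptr, align 4
  %ext = zext i16 %val to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
}

; After: the load is widened to a full dword so it can select as
; s_load_dword; the original value is recovered with a truncate, and the
; zext of the truncate folds to the s_and_b32 0xffff pattern checked above.
define amdgpu_kernel void @widen_sketch_after(i16 addrspace(4)* %ptr, i32 addrspace(1)* %out) {
  %cast = bitcast i16 addrspace(4)* %ptr to i32 addrspace(4)*
  %wide = load i32, i32 addrspace(4)* %cast, align 4
  %trunc = trunc i32 %wide to i16
  %ext = zext i16 %trunc to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
}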