Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -66,6 +66,8 @@
   SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
 
 protected:
+  bool shouldCombineMemoryType(const MemSDNode *M) const;
+  SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -78,6 +80,7 @@
   static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
   static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT);
+  static EVT getEquivalentBitType(LLVMContext &Context, EVT VT);
 
   virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
                                      SelectionDAG &DAG) const;
@@ -138,7 +141,7 @@
                            ISD::LoadExtType ExtType,
                            EVT ExtVT) const override;
 
-  bool isLoadBitCastBeneficial(EVT, EVT) const override;
+  bool isLoadBitCastBeneficial(EVT, EVT) const final;
 
   bool storeOfVectorConstantIsCheap(EVT MemVT,
                                     unsigned NumElem,
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -62,6 +62,14 @@
   return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
 }
 
+EVT AMDGPUTargetLowering::getEquivalentBitType(LLVMContext &Ctx, EVT VT) {
+  unsigned StoreSize = VT.getStoreSizeInBits();
+  if (StoreSize <= 32)
+    return EVT::getIntegerVT(Ctx, StoreSize);
+
+  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
+}
+
 AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
                                            const AMDGPUSubtarget &STI)
     : TargetLowering(TM), Subtarget(&STI) {
@@ -533,15 +541,17 @@
 bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy) const {
-  if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
-    return true;
-  unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits();
-  unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits();
+  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
+
+  if (LoadTy.getScalarType() == MVT::i32)
+    return false;
+
+  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
+  unsigned CastScalarSize = CastTy.getScalarSizeInBits();
 
-  return ((LScalarSize <= CastScalarSize) ||
-          (CastScalarSize >= 32) ||
-          (LScalarSize < 32));
+  return (LScalarSize < CastScalarSize) ||
+         (CastScalarSize >= 32);
 }
 
 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
@@ -2161,56 +2171,105 @@
   return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
 }
 
-static bool usesAllNormalStores(SDNode *LoadVal) {
-  for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) {
-    if (!ISD::isNormalStore(*I))
+static bool hasVolatileUser(SDNode *Val) {
+  for (SDNode *U : Val->uses()) {
+    if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
+      if (M->isVolatile())
+        return true;
+    }
+  }
+
+  return false;
+}
+
+bool AMDGPUTargetLowering::shouldCombineMemoryType(const MemSDNode *M) const {
+  EVT VT = M->getMemoryVT();
+
+  // i32 vectors are the canonical memory type.
+  if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
+    return false;
+
+  if (!VT.isByteSized())
+    return false;
+
+  unsigned Size = VT.getStoreSize();
+
+  if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
+    return false;
+
+  if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
+    return false;
+
+  unsigned Align = M->getAlignment();
+  if (Align < Size) {
+    bool IsFast;
+    if (!allowsMisalignedMemoryAccesses(VT, M->getAddressSpace(), Align, &IsFast) ||
+        !IsFast) {
       return false;
+    }
   }
 
   return true;
 }
 
-// If we have a copy of an illegal type, replace it with a load / store of an
-// equivalently sized legal type. This avoids intermediate bit pack / unpack
-// instructions emitted when handling extloads and truncstores. Ideally we could
-// recognize the pack / unpack pattern to eliminate it.
+// Replace load of an illegal type with a load of a bitcast to a friendlier
+// type.
+SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
+                                                 DAGCombinerInfo &DCI) const {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  LoadSDNode *LN = cast<LoadSDNode>(N);
+  if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
+    return SDValue();
+
+  if (!shouldCombineMemoryType(LN))
+    return SDValue();
+
+  SDLoc SL(N);
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = LN->getMemoryVT();
+  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
+
+  SDValue NewLoad
+    = DAG.getLoad(NewVT, SL, LN->getChain(),
+                  LN->getBasePtr(), LN->getMemOperand());
+
+  SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
+  DCI.CombineTo(N, BC, NewLoad.getValue(1));
+  return SDValue(N, 0);
+}
+
+// Replace store of an illegal type with a store of a bitcast to a friendlier
+// type.
 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
   if (!DCI.isBeforeLegalize())
     return SDValue();
 
   StoreSDNode *SN = cast<StoreSDNode>(N);
-  SDValue Value = SN->getValue();
-  EVT VT = Value.getValueType();
-
-  if (isTypeLegal(VT) || SN->isVolatile() ||
-      !ISD::isNormalLoad(Value.getNode()) || VT.getSizeInBits() < 8)
+  if (SN->isVolatile() || !ISD::isNormalStore(SN))
     return SDValue();
 
-  LoadSDNode *LoadVal = cast<LoadSDNode>(Value);
-  if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal))
+  if (!shouldCombineMemoryType(SN))
    return SDValue();
 
-  EVT MemVT = LoadVal->getMemoryVT();
-  if (!MemVT.isRound())
-    return SDValue();
+  SDValue Val = SN->getValue();
+  EVT VT = SN->getMemoryVT();
 
   SDLoc SL(N);
   SelectionDAG &DAG = DCI.DAG;
-  EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT);
+  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
 
-  SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
-                                LoadVT, SL,
-                                LoadVal->getChain(),
-                                LoadVal->getBasePtr(),
-                                LoadVal->getOffset(),
-                                LoadVT,
-                                LoadVal->getMemOperand());
-
-  SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0));
-  DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false);
+  bool OtherUses = !Val.hasOneUse();
+  SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
+  if (OtherUses) {
+    SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
+    DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
+  }
 
-  return DAG.getStore(SN->getChain(), SL, NewLoad,
+  return DAG.getStore(SN->getChain(), SL, CastVal,
                       SN->getBasePtr(), SN->getMemOperand());
 }
 
@@ -2647,7 +2706,8 @@
     break;
   }
-
+  case ISD::LOAD:
+    return performLoadCombine(N, DCI);
   case ISD::STORE:
    return performStoreCombine(N, DCI);
   }
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2194,81 +2194,6 @@
     }
   }
 
-  // We are primarily trying to catch operations on illegal vector types
-  // before they are expanded.
-  // For scalars, we can use the more flexible method of checking masked bits
-  // after legalization.
-  if (!DCI.isBeforeLegalize() ||
-      !SrcVT.isVector() ||
-      SrcVT.getVectorElementType() != MVT::i8) {
-    return SDValue();
-  }
-
-  assert(DCI.isBeforeLegalize() && "Unexpected legal type");
-
-  // Weird sized vectors are a pain to handle, but we know 3 is really the same
-  // size as 4.
-  unsigned NElts = SrcVT.getVectorNumElements();
-  if (!SrcVT.isSimple() && NElts != 3)
-    return SDValue();
-
-  // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to
-  // prevent a mess from expanding to v4i32 and repacking.
-  if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
-    EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT);
-    EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT);
-    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts);
-    LoadSDNode *Load = cast<LoadSDNode>(Src);
-
-    unsigned AS = Load->getAddressSpace();
-    unsigned Align = Load->getAlignment();
-    Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext());
-    unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
-
-    // Don't try to replace the load if we have to expand it due to alignment
-    // problems. Otherwise we will end up scalarizing the load, and trying to
-    // repack into the vector for no real reason.
-    if (Align < ABIAlignment &&
-        !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) {
-      return SDValue();
-    }
-
-    SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT,
-                                     Load->getChain(),
-                                     Load->getBasePtr(),
-                                     LoadVT,
-                                     Load->getMemOperand());
-
-    // Make sure successors of the original load stay after it by updating
-    // them to use the new Chain.
-    DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1));
-
-    SmallVector<SDValue, 4> Elts;
-    if (RegVT.isVector())
-      DAG.ExtractVectorElements(NewLoad, Elts);
-    else
-      Elts.push_back(NewLoad);
-
-    SmallVector<SDValue, 4> Ops;
-
-    unsigned EltIdx = 0;
-    for (SDValue Elt : Elts) {
-      unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx);
-      for (unsigned I = 0; I < ComponentsInElt; ++I) {
-        unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I;
-        SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt);
-        DCI.AddToWorklist(Cvt.getNode());
-        Ops.push_back(Cvt);
-      }
-
-      ++EltIdx;
-    }
-
-    assert(Ops.size() == NElts);
-
-    return DAG.getBuildVector(FloatVT, DL, Ops);
-  }
-
   return SDValue();
 }
 
@@ -2724,6 +2649,7 @@
   unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
   SDValue Src = N->getOperand(0);
 
+  // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
if (Src.getOpcode() == ISD::SRL) { // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x Index: test/CodeGen/AMDGPU/add.ll =================================================================== --- test/CodeGen/AMDGPU/add.ll +++ test/CodeGen/AMDGPU/add.ll @@ -123,12 +123,11 @@ ; SI: s_add_u32 ; SI: s_addc_u32 -; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] -; EG-DAG: ADD_INT {{[* ]*}}[[LO]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]] +; EG-DAG: ADD_INT {{[* ]*}} ; EG-DAG: ADDC_UINT ; EG-DAG: ADD_INT -; EG-DAG: ADD_INT {{[* ]*}}[[HI]] +; EG-DAG: ADD_INT {{[* ]*}} ; EG-NOT: SUB define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) { entry: @@ -145,12 +144,11 @@ ; FUNC-LABEL: {{^}}add64_sgpr_vgpr: ; SI-NOT: v_addc_u32_e32 s -; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] -; EG-DAG: ADD_INT {{[* ]*}}[[LO]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]] +; EG-DAG: ADD_INT {{[* ]*}} ; EG-DAG: ADDC_UINT ; EG-DAG: ADD_INT -; EG-DAG: ADD_INT {{[* ]*}}[[HI]] +; EG-DAG: ADD_INT {{[* ]*}} ; EG-NOT: SUB define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) { entry: @@ -165,12 +163,11 @@ ; SI: s_add_u32 ; SI: s_addc_u32 -; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] -; EG-DAG: ADD_INT {{[* ]*}}[[LO]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]] +; EG-DAG: ADD_INT {{[* ]*}} ; EG-DAG: ADDC_UINT ; EG-DAG: ADD_INT -; EG-DAG: ADD_INT {{[* ]*}}[[HI]] +; EG-DAG: ADD_INT {{[* ]*}} ; EG-NOT: SUB define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { entry: Index: test/CodeGen/AMDGPU/copy-illegal-type.ll =================================================================== --- test/CodeGen/AMDGPU/copy-illegal-type.ll +++ test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -54,31 +54,12 @@ } ; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte - -; After scalarizing v4i8 loads is fixed. 
-; XSI: buffer_load_dword -; XSI: V_BFE -; XSI: V_ADD -; XSI: V_ADD -; XSI: V_ADD -; XSI: buffer_store_dword -; XSI: buffer_store_dword +; SI: buffer_load_dword +; SI-DAG: v_lshrrev_b32 +; SI: v_and_b32 +; SI: v_or_b32 +; SI-DAG: buffer_store_dword +; SI-DAG: buffer_store_dword ; SI: s_endpgm define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { @@ -90,34 +71,14 @@ } ; FUNC-LABEL: {{^}}test_copy_v4i8_x2_extra_use: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte - -; XSI: buffer_load_dword -; XSI: BFE -; XSI: buffer_store_dword -; XSI: V_ADD -; XSI: buffer_store_dword -; XSI-NEXT: buffer_store_dword - +; SI: buffer_load_dword +; SI-DAG: v_lshrrev_b32 +; SI-DAG: v_add_i32 +; SI-DAG: v_and_b32 +; SI-DAG: v_or_b32 +; SI-DAG: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword ; SI: s_endpgm define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 @@ -128,21 +89,50 @@ ret void } -; FUNC-LABEL: {{^}}test_copy_v3i8: -; SI-NOT: bfe -; SI-NOT: bfi +; FUNC-LABEL: {{^}}test_copy_v3i8_align4: +; SI: buffer_load_dword +; SI-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; SI-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} ; SI: s_endpgm -define void @test_copy_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { +define void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4 ret void } +; FUNC-LABEL: {{^}}test_copy_v3i8_align2: +; SI-DAG: buffer_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; SI-DAG: buffer_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} +; SI-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; SI-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} +; SI: s_endpgm +define void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { + %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 2 + store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 2 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v3i8_align1: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte + +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: s_endpgm +define void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { + %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1 + store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1 + ret void +} + ; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load: ; SI: buffer_load_ubyte ; SI: buffer_load_ubyte ; SI: buffer_load_ubyte ; SI: buffer_load_ubyte +; SI: buffer_store_dword ; SI: s_endpgm define void 
@test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 Index: test/CodeGen/AMDGPU/ctpop64.ll =================================================================== --- test/CodeGen/AMDGPU/ctpop64.ll +++ test/CodeGen/AMDGPU/ctpop64.ll @@ -170,16 +170,15 @@ ; FIXME: Should not have extra add ; FUNC-LABEL: {{^}}v_ctpop_i128: -; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VAL2:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}} +; GCN: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT0:v[0-9]+]], v[[VAL2]], 0 -; GCN: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]] +; GCN-DAG: v_bcnt_u32_b32_e64 [[MIDRESULT0:v[0-9]+]], v{{[0-9]+}}, 0 +; GCN-DAG: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]] -; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT2:v[0-9]+]], v[[VAL0]], 0 -; GCN: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT3:v[0-9]+]], v[[VAL1]], [[MIDRESULT2]] +; GCN-DAG: v_bcnt_u32_b32_e64 [[MIDRESULT2:v[0-9]+]], v[[VAL0]], 0 +; GCN-DAG: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT3:v[0-9]+]], v{{[0-9]+}}, [[MIDRESULT2]] -; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, [[MIDRESULT2]], [[MIDRESULT1]] +; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, [[MIDRESULT1]], [[MIDRESULT2]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm Index: test/CodeGen/AMDGPU/cvt_f32_ubyte.ll =================================================================== --- test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -15,12 +15,9 @@ } ; SI-LABEL: {{^}}load_v2i8_to_v2f32: -; SI: buffer_load_ushort [[LOADREG:v[0-9]+]], -; SI-NOT: bfe -; SI-NOT: lshr -; SI-NOT: and -; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]] -; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]] +; SI: buffer_load_ushort [[LD:v[0-9]+]] +; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]] ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind { %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2 @@ -30,11 +27,11 @@ } ; SI-LABEL: {{^}}load_v3i8_to_v3f32: -; SI-NOT: bfe +; SI: buffer_load_dword [[VAL:v[0-9]+]] ; SI-NOT: v_cvt_f32_ubyte3_e32 -; SI-DAG: v_cvt_f32_ubyte2_e32 -; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], -; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], +; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]] +; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]] ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 @@ -83,26 +80,25 @@ ret void } -; XXX - This should really still be able to use the v_cvt_f32_ubyte0 -; for each component, but computeKnownBits doesn't handle vectors very -; well. - +; Instructions still emitted to repack bytes for add use. 
; SI-LABEL: {{^}}load_v4i8_to_v4f32_2_uses: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: v_cvt_f32_ubyte0_e32 -; SI: v_cvt_f32_ubyte0_e32 -; SI: v_cvt_f32_ubyte0_e32 -; SI: v_cvt_f32_ubyte0_e32 - -; XXX - replace with this when v4i8 loads aren't scalarized anymore. -; XSI: buffer_load_dword -; XSI: v_cvt_f32_u32_e32 -; XSI: v_cvt_f32_u32_e32 -; XSI: v_cvt_f32_u32_e32 -; XSI: v_cvt_f32_u32_e32 +; SI: buffer_load_dword +; SI-DAG: v_cvt_f32_ubyte0_e32 +; SI-DAG: v_cvt_f32_ubyte1_e32 +; SI-DAG: v_cvt_f32_ubyte2_e32 +; SI-DAG: v_cvt_f32_ubyte3_e32 + +; SI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 24 +; SI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16 +; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16 +; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 8 +; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, +; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff00, +; SI-DAG: v_add_i32 + +; SI: buffer_store_dwordx4 +; SI: buffer_store_dword + ; SI: s_endpgm define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 Index: test/CodeGen/AMDGPU/half.ll =================================================================== --- test/CodeGen/AMDGPU/half.ll +++ test/CodeGen/AMDGPU/half.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; half args should be promoted to float @@ -15,8 +15,9 @@ ; GCN-LABEL: {{^}}load_v2f16_arg: ; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 ; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46 -; GCN-DAG: buffer_store_short [[V0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_store_short [[V1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} +; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]] +; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]] +; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN: s_endpgm define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { store <2 x half> %arg, <2 x half> addrspace(1)* %out @@ -42,10 +43,7 @@ ; GCN: buffer_load_ushort ; GCN: buffer_load_ushort ; GCN: buffer_load_ushort -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short +; GCN: buffer_store_dwordx2 ; GCN: s_endpgm define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 { store <4 x half> %arg, <4 x half> addrspace(1)* %out @@ -280,11 +278,11 @@ } ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32: -; GCN-DAG: buffer_load_ushort [[LOAD0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_load_ushort [[LOAD1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} -; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD0]] -; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD1]] -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}} +; GCN: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]] +; GCN: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] +; GCN: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]] +; GCN: 
buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}} ; GCN: s_endpgm define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %val = load <2 x half>, <2 x half> addrspace(1)* %in @@ -318,22 +316,8 @@ } ; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f32: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 ; GCN: v_cvt_f32_f16_e32 ; GCN: v_cvt_f32_f16_e32 @@ -378,10 +362,10 @@ } ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64: -; GCN-DAG: buffer_load_ushort [[LOAD0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_load_ushort [[LOAD1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} -; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD0]] -; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD1]] +; GCN-DAG: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] +; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]] +; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]] ; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]] ; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]] ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}} @@ -455,8 +439,9 @@ ; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} ; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]] ; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]] -; GCN-DAG: buffer_store_short [[CVT0]] -; GCN-DAG: buffer_store_short [[CVT1]] +; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]] +; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[SHL]] +; GCN-DAG: buffer_store_dword [[PACKED]] ; GCN: s_endpgm define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { %val = load <2 x float>, <2 x float> addrspace(1)* %in @@ -487,10 +472,7 @@ ; GCN: v_cvt_f16_f32_e32 ; GCN: v_cvt_f16_f32_e32 ; GCN: v_cvt_f16_f32_e32 -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short +; GCN: buffer_store_dwordx2 ; GCN: s_endpgm define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { %val = load <4 x float>, <4 x float> addrspace(1)* %in @@ -510,14 +492,7 @@ ; GCN: v_cvt_f16_f32_e32 ; GCN: v_cvt_f16_f32_e32 ; GCN: v_cvt_f16_f32_e32 -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short +; GCN: buffer_store_dwordx4 ; GCN: s_endpgm define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { %val = load <8 x float>, <8 x float> addrspace(1)* %in @@ -547,22 +522,8 @@ ; GCN-DAG: v_cvt_f16_f32_e32 ; GCN-DAG: v_cvt_f16_f32_e32 ; GCN-DAG: v_cvt_f16_f32_e32 -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; 
GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_dwordx4 +; GCN-DAG: buffer_store_dwordx4 ; GCN: s_endpgm define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { %val = load <16 x float>, <16 x float> addrspace(1)* %in Index: test/CodeGen/AMDGPU/insert_vector_elt.ll =================================================================== --- test/CodeGen/AMDGPU/insert_vector_elt.ll +++ test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -208,10 +208,7 @@ ; GCN: buffer_load_ushort ; GCN: buffer_load_ushort -; GCN: buffer_store_short v{{[0-9]+}}, off -; GCN: buffer_store_short v{{[0-9]+}}, off -; GCN: buffer_store_short v{{[0-9]+}}, off -; GCN: buffer_store_short v{{[0-9]+}}, off +; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind { %vecins = insertelement <4 x i16> %a, i16 5, i32 %b store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 8 @@ -230,8 +227,7 @@ ; GCN: buffer_load_ubyte ; GCN: buffer_load_ubyte -; GCN: buffer_store_byte v{{[0-9]+}}, off -; GCN: buffer_store_byte v{{[0-9]+}}, off +; GCN: buffer_store_short v{{[0-9]+}}, off define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind { %vecins = insertelement <2 x i8> %a, i8 5, i32 %b store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8 @@ -279,10 +275,7 @@ ; GCN: buffer_load_ubyte ; GCN: buffer_load_ubyte -; GCN: buffer_store_byte v{{[0-9]+}}, off -; GCN: buffer_store_byte v{{[0-9]+}}, off -; GCN: buffer_store_byte v{{[0-9]+}}, off -; GCN: buffer_store_byte v{{[0-9]+}}, off +; GCN: buffer_store_dword v{{[0-9]+}}, off define void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind { %vecins = insertelement <4 x i8> %a, i8 5, i32 %b store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4 Index: test/CodeGen/AMDGPU/load-constant-i16.ll =================================================================== --- test/CodeGen/AMDGPU/load-constant-i16.ll +++ test/CodeGen/AMDGPU/load-constant-i16.ll @@ -125,10 +125,9 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v2i16_to_v2i32: -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort +; GCN: s_load_dword s +; GCN-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xffff{{$}} +; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 define void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(2)* %in %ext = zext <2 x i16> %load to <2 x i32> @@ -137,11 +136,9 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v2i16_to_v2i32: -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort - -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort +; GCN: s_load_dword s +; GCN-DAG: s_ashr_i32 +; GCN-DAG: s_sext_i32_i16 ; EG-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] ; EG-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] @@ -177,15 +174,9 @@ } ; FUNC-LABEL: {{^}}constant_constant_zextload_v4i16_to_v4i32: -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort 
-; GCN-NOHSA: buffer_load_ushort - -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort +; GCN: s_load_dwordx2 +; GCN-DAG: s_and_b32 +; GCN-DAG: s_lshr_b32 ; EG: VTX_READ_16 ; EG: VTX_READ_16 @@ -199,15 +190,9 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v4i16_to_v4i32: -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort - -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort +; GCN: s_load_dwordx2 +; GCN-DAG: s_ashr_i32 +; GCN-DAG: s_sext_i32_i16 ; EG-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] ; EG-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] @@ -229,23 +214,9 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v8i16_to_v8i32: -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort - -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort +; GCN: s_load_dwordx4 +; GCN-DAG: s_and_b32 +; GCN-DAG: s_lshr_b32 define void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(2)* %in %ext = zext <8 x i16> %load to <8 x i32> @@ -254,23 +225,9 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v8i16_to_v8i32: -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort - -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort +; GCN: s_load_dwordx4 +; GCN-DAG: s_ashr_i32 +; GCN-DAG: s_sext_i32_i16 define void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(2)* %in %ext = sext <8 x i16> %load to <8 x i32> @@ -279,39 +236,9 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v16i16_to_v16i32: -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort - -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort +; GCN: 
s_load_dwordx8 +; GCN-DAG: s_and_b32 +; GCN-DAG: s_lshr_b32 define void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(2)* %in %ext = zext <16 x i16> %load to <16 x i32> @@ -320,6 +247,9 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v16i16_to_v16i32: +; GCN: s_load_dwordx8 +; GCN-DAG: s_ashr_i32 +; GCN-DAG: s_sext_i32_i16 define void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(2)* %in %ext = sext <16 x i16> %load to <16 x i32> @@ -328,71 +258,10 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v32i16_to_v32i32: -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort - -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort +; GCN-DAG: s_load_dwordx16 +; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} +; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]] +; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 define void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(2)* %in %ext = zext <32 x i16> %load to <32 x i32> @@ -401,71 +270,9 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v32i16_to_v32i32: -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: 
buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort - -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort +; GCN: s_load_dwordx16 +; GCN-DAG: s_ashr_i32 +; GCN-DAG: s_sext_i32_i16 define void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(2)* %in %ext = sext <32 x i16> %load to <32 x i32> @@ -474,135 +281,8 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v64i16_to_v64i32: -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: 
buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort - -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort +; GCN: s_load_dwordx16 +; GCN: s_load_dwordx16 define void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(2)* %in %ext = zext <64 x i16> %load to <64 x i32> Index: test/CodeGen/AMDGPU/load-constant-i32.ll =================================================================== --- test/CodeGen/AMDGPU/load-constant-i32.ll +++ test/CodeGen/AMDGPU/load-constant-i32.ll @@ -78,8 +78,9 @@ ; GCN-DAG: v_mov_b32_e32 v[[SHI:[0-9]+]], 0{{$}} ; GCN: store_dwordx2 -; EG: MEM_RAT -; EG: MEM_RAT +; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY +; EG: CF_END +; EG: VTX_READ_32 define void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { %ld = load i32, i32 addrspace(2)* %in %ext = zext i32 %ld to i64 @@ -92,9 +93,10 @@ ; GCN: s_ashr_i32 s[[HI:[0-9]+]], s[[SLO]], 31 ; GCN: store_dwordx2 -; EG: MEM_RAT -; EG: MEM_RAT -; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.x +; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY +; EG: CF_END +; EG: VTX_READ_32 +; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, 
T{{[0-9]\.[XYZW]}}, literal. ; EG: 31 define void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { %ld = load i32, i32 addrspace(2)* %in Index: test/CodeGen/AMDGPU/load-constant-i64.ll =================================================================== --- test/CodeGen/AMDGPU/load-constant-i64.ll +++ test/CodeGen/AMDGPU/load-constant-i64.ll @@ -25,16 +25,10 @@ } ; FUNC-LABEL: {{^}}constant_load_v3i64: -; GCN-DAG: s_load_dwordx4 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} -; SI-DAG: s_load_dwordx2 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x4{{$}} -; VI-DAG: s_load_dwordx2 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x10{{$}} +; GCN: s_load_dwordx8 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} -; EG-DAG: VTX_READ_32 -; EG-DAG: VTX_READ_32 -; EG-DAG: VTX_READ_32 -; EG-DAG: VTX_READ_32 -; EG-DAG: VTX_READ_32 -; EG-DAG: VTX_READ_32 +; EG-DAG: VTX_READ_128 +; EG-DAG: VTX_READ_128 define void @constant_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(2)* %in) #0 { entry: %ld = load <3 x i64>, <3 x i64> addrspace(2)* %in Index: test/CodeGen/AMDGPU/load-constant-i8.ll =================================================================== --- test/CodeGen/AMDGPU/load-constant-i8.ll +++ test/CodeGen/AMDGPU/load-constant-i8.ll @@ -115,10 +115,8 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i32: -; GCN-NOHSA: buffer_load_ubyte -; GCN-NOHSA: buffer_load_ubyte -; GCN-HSA: flat_load_ubyte -; GCN-HSA: flat_load_ubyte +; GCN-NOHSA: buffer_load_ushort +; GCN-HSA: flat_load_ushort ; EG: VTX_READ_8 ; EG: VTX_READ_8 define void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { @@ -129,10 +127,12 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v2i8_to_v2i32: -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte +; GCN-NOHSA: buffer_load_ushort + +; GCN-HSA: flat_load_ushort + +; GCN: v_bfe_i32 +; GCN: v_bfe_i32 ; EG-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] ; EG-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] @@ -176,14 +176,9 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i32: -; GCN-NOHSA: buffer_load_ubyte -; GCN-NOHSA: buffer_load_ubyte -; GCN-NOHSA: buffer_load_ubyte -; GCN-NOHSA: buffer_load_ubyte -; GCN-HSA: flat_load_ubyte -; GCN-HSA: flat_load_ubyte -; GCN-HSA: flat_load_ubyte -; GCN-HSA: flat_load_ubyte +; GCN: s_load_dword s +; GCN-DAG: s_and_b32 +; GCN-DAG: s_lshr_b32 ; EG: VTX_READ_8 ; EG: VTX_READ_8 @@ -197,14 +192,9 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v4i8_to_v4i32: -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte +; GCN: s_load_dword s +; GCN-DAG: s_sext_i32_i8 +; GCN-DAG: s_ashr_i32 ; EG-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] ; EG-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] @@ -226,23 +216,9 @@ } ; FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i32: -; GCN-NOHSA: buffer_load_ubyte -; GCN-NOHSA: buffer_load_ubyte -; GCN-NOHSA: buffer_load_ubyte -; GCN-NOHSA: buffer_load_ubyte -; GCN-NOHSA: buffer_load_ubyte -; GCN-NOHSA: buffer_load_ubyte -; GCN-NOHSA: buffer_load_ubyte -; GCN-NOHSA: buffer_load_ubyte - -; GCN-HSA: flat_load_ubyte -; GCN-HSA: flat_load_ubyte -; GCN-HSA: flat_load_ubyte -; GCN-HSA: flat_load_ubyte -; GCN-HSA: flat_load_ubyte -; GCN-HSA: flat_load_ubyte -; GCN-HSA: 
flat_load_ubyte -; GCN-HSA: flat_load_ubyte +; GCN: s_load_dwordx2 +; GCN-DAG: s_and_b32 +; GCN-DAG: s_lshr_b32 define void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(2)* %in %ext = zext <8 x i8> %load to <8 x i32> @@ -251,23 +227,9 @@ } ; FUNC-LABEL: {{^}}constant_sextload_v8i8_to_v8i32: -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte - -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte +; GCN: s_load_dwordx2 +; GCN-DAG: s_ashr_i32 +; GCN-DAG: s_sext_i32_i8 define void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(2)* %in %ext = sext <8 x i8> %load to <8 x i32> Index: test/CodeGen/AMDGPU/load-global-f64.ll =================================================================== --- test/CodeGen/AMDGPU/load-global-f64.ll +++ test/CodeGen/AMDGPU/load-global-f64.ll @@ -14,21 +14,21 @@ ret void } -; FUNC-LABEL: {{^}}global_load_v2i64: +; FUNC-LABEL: {{^}}global_load_v2f64: ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @global_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) #0 { +define void @global_load_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { entry: - %ld = load <2 x i64>, <2 x i64> addrspace(1)* %in - store <2 x i64> %ld, <2 x i64> addrspace(1)* %out + %ld = load <2 x double>, <2 x double> addrspace(1)* %in + store <2 x double> %ld, <2 x double> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}global_load_v3f64: -; GCN-NOHSA-DAG: buffer_load_dwordx4 -; GCN-NOHSA-DAG: buffer_load_dwordx2 -; GCN-HSA-DAG: flat_load_dwordx4 -; GCN-HSA-DAG: flat_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 define void @global_load_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { entry: %ld = load <3 x double>, <3 x double> addrspace(1)* %in Index: test/CodeGen/AMDGPU/load-global-i16.ll =================================================================== --- test/CodeGen/AMDGPU/load-global-i16.ll +++ test/CodeGen/AMDGPU/load-global-i16.ll @@ -136,10 +136,8 @@ } ; FUNC-LABEL: {{^}}global_zextload_v2i16_to_v2i32: -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort +; GCN-NOHSA: buffer_load_dword +; GCN-HSA: flat_load_dword define void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = zext <2 x i16> %load to <2 x i32> @@ -148,11 +146,9 @@ } ; FUNC-LABEL: {{^}}global_sextload_v2i16_to_v2i32: -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort +; GCN-NOHSA: buffer_load_dword -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort +; GCN-HSA: flat_load_dword ; EG-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] ; EG-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] @@ -190,15 +186,9 @@ } ; FUNC-LABEL: {{^}}global_global_zextload_v4i16_to_v4i32: -; GCN-NOHSA: 
buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort +; GCN-NOHSA: buffer_load_dwordx2 -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort +; GCN-HSA: flat_load_dwordx2 ; EG: VTX_READ_16 ; EG: VTX_READ_16 @@ -212,15 +202,9 @@ } ; FUNC-LABEL: {{^}}global_sextload_v4i16_to_v4i32: -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort +; GCN-NOHSA: buffer_load_dwordx2 -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort +; GCN-HSA: flat_load_dwordx2 ; EG-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] ; EG-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] @@ -242,23 +226,8 @@ } ; FUNC-LABEL: {{^}}global_zextload_v8i16_to_v8i32: -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort - -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 define void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = zext <8 x i16> %load to <8 x i32> @@ -267,23 +236,8 @@ } ; FUNC-LABEL: {{^}}global_sextload_v8i16_to_v8i32: -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort - -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 define void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = sext <8 x i16> %load to <8 x i32> @@ -292,39 +246,11 @@ } ; FUNC-LABEL: {{^}}global_zextload_v16i16_to_v16i32: -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: 
flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 define void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = zext <16 x i16> %load to <16 x i32> @@ -341,71 +267,15 @@ } ; FUNC-LABEL: {{^}}global_zextload_v32i16_to_v32i32: -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 define void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = zext <32 x i16> %load to <32 x i32> @@ -414,71 +284,15 @@ } ; FUNC-LABEL: {{^}}global_sextload_v32i16_to_v32i32: -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: 
buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 define void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = sext <32 x i16> %load to <32 x i32> @@ -487,135 +301,23 @@ } ; FUNC-LABEL: {{^}}global_zextload_v64i16_to_v64i32: -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: 
buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 define void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(1)* %in %ext = zext <64 x i16> %load to <64 x i32> Index: test/CodeGen/AMDGPU/load-global-i32.ll =================================================================== --- test/CodeGen/AMDGPU/load-global-i32.ll +++ test/CodeGen/AMDGPU/load-global-i32.ll @@ -1,9 +1,7 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN 
-check-prefix=GCN-HSA -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s - ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}global_load_i32: @@ -99,8 +97,7 @@ ; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] ; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]] -; EG: MEM_RAT -; EG: MEM_RAT +; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY define void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %ld = load i32, i32 addrspace(1)* %in %ext = zext i32 %ld to i64 @@ -115,9 +112,10 @@ ; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} ; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} + ; EG: MEM_RAT -; EG: MEM_RAT -; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.x +; EG: VTX_READ_32 +; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal. ; EG: 31 define void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %ld = load i32, i32 addrspace(1)* %in Index: test/CodeGen/AMDGPU/load-global-i64.ll =================================================================== --- test/CodeGen/AMDGPU/load-global-i64.ll +++ test/CodeGen/AMDGPU/load-global-i64.ll @@ -32,17 +32,14 @@ } ; FUNC-LABEL: {{^}}global_load_v3i64: -; GCN-NOHSA-DAG: buffer_load_dwordx4 -; GCN-NOHSA-DAG: buffer_load_dwordx2 -; GCN-HSA-DAG: flat_load_dwordx4 -; GCN-HSA-DAG: flat_load_dwordx2 - -; EG-DAG: VTX_READ_32 -; EG-DAG: VTX_READ_32 -; EG-DAG: VTX_READ_32 -; EG-DAG: VTX_READ_32 -; EG-DAG: VTX_READ_32 -; EG-DAG: VTX_READ_32 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 + +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 + +; EG: VTX_READ_128 +; EG: VTX_READ_128 define void @global_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %in) #0 { entry: %ld = load <3 x i64>, <3 x i64> addrspace(1)* %in Index: test/CodeGen/AMDGPU/load-global-i8.ll =================================================================== --- test/CodeGen/AMDGPU/load-global-i8.ll +++ test/CodeGen/AMDGPU/load-global-i8.ll @@ -121,10 +121,9 @@ } ; FUNC-LABEL: {{^}}global_zextload_v2i8_to_v2i32: -; GCN-NOHSA: buffer_load_ubyte -; GCN-NOHSA: buffer_load_ubyte -; GCN-HSA: flat_load_ubyte -; GCN-HSA: flat_load_ubyte +; GCN-NOHSA: buffer_load_ushort +; GCN-HSA: flat_load_ushort + ; EG: VTX_READ_8 ; EG: VTX_READ_8 define void @global_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { @@ -135,10 +134,8 @@ } ; FUNC-LABEL: {{^}}global_sextload_v2i8_to_v2i32: -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte +; GCN-NOHSA: buffer_load_ushort +; GCN-HSA: flat_load_ushort ; EG-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] ; EG-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] @@ -184,14 +181,8 @@ } ; FUNC-LABEL: {{^}}global_zextload_v4i8_to_v4i32: -; GCN-NOHSA: buffer_load_ubyte -; GCN-NOHSA: buffer_load_ubyte -; GCN-NOHSA: buffer_load_ubyte -; GCN-NOHSA: buffer_load_ubyte -; GCN-HSA: flat_load_ubyte -; GCN-HSA: flat_load_ubyte -; GCN-HSA: flat_load_ubyte -; GCN-HSA: flat_load_ubyte +; GCN-NOHSA: buffer_load_dword +; GCN-HSA: flat_load_dword ; EG: VTX_READ_8 ; EG: VTX_READ_8 @@ -205,14 +196,8 @@ } ; 
FUNC-LABEL: {{^}}global_sextload_v4i8_to_v4i32: -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte +; GCN-NOHSA: buffer_load_dword +; GCN-HSA: flat_load_dword ; EG-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] ; EG-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] Index: test/CodeGen/AMDGPU/load-local-f32.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-f32.ll +++ test/CodeGen/AMDGPU/load-local-f32.ll @@ -27,9 +27,10 @@ ret void } -; FIXME: should only do one b64 load +; FIXME: should this do a read2_b64? ; FUNC-LABEL: {{^}}local_load_v3f32: -; GCN: ds_read2_b64 +; GCN-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:8 +; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}} ; GCN: s_waitcnt ; GCN-DAG: ds_write_b64 ; GCN-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:8{{$}} Index: test/CodeGen/AMDGPU/load-local-i16.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-i16.ll +++ test/CodeGen/AMDGPU/load-local-i16.ll @@ -132,8 +132,7 @@ ; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i32: ; GCN-NOT: s_wqm_b64 ; GCN: s_mov_b32 m0 -; GCN: ds_read_u16 -; GCN: ds_read_u16 +; GCN: ds_read_b32 ; EG: LDS_USHORT_READ_RET ; EG: LDS_USHORT_READ_RET @@ -147,8 +146,7 @@ ; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i32: ; GCN-NOT: s_wqm_b64 ; GCN: s_mov_b32 m0 -; GCN: ds_read_i16 -; GCN: ds_read_i16 +; GCN: ds_read_b32 ; EG-DAG: LDS_USHORT_READ_RET ; EG-DAG: LDS_USHORT_READ_RET @@ -188,10 +186,7 @@ ; FUNC-LABEL: {{^}}local_local_zextload_v4i16_to_v4i32: ; GCN-NOT: s_wqm_b64 ; GCN: s_mov_b32 m0 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 +; GCN: ds_read_b64 ; EG: LDS_USHORT_READ_RET ; EG: LDS_USHORT_READ_RET @@ -207,10 +202,7 @@ ; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i32: ; GCN-NOT: s_wqm_b64 ; GCN: s_mov_b32 m0 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 +; GCN: ds_read_b64 ; EG-DAG: LDS_USHORT_READ_RET ; EG-DAG: LDS_USHORT_READ_RET @@ -228,14 +220,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i32: -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 +; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} define void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(3)* %in %ext = zext <8 x i16> %load to <8 x i32> @@ -244,14 +229,7 @@ } ; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i32: -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 +; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} define void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(3)* %in %ext = sext <8 x i16> %load to <8 x i32> @@ -260,22 +238,8 @@ } ; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i32: -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: 
ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 +; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} define void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(3)* %in %ext = zext <16 x i16> %load to <16 x i32> @@ -284,22 +248,9 @@ } ; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i32: -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 +; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}} +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:1{{$}} +; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16{{$}} define void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(3)* %in %ext = sext <16 x i16> %load to <16 x i32> @@ -308,39 +259,10 @@ } ; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i32: -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 - +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3 +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5 +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7 define void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(3)* %in %ext = zext <32 x i16> %load to <32 x i32> @@ -349,38 +271,10 @@ } ; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32: -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 -; GCN: ds_read_i16 +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}} +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:4 +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:5{{$}} +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 
offset1:7 define void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(3)* %in %ext = sext <32 x i16> %load to <32 x i32> @@ -389,71 +283,14 @@ } ; FUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i32: -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 -; GCN: ds_read_u16 - +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:15 +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3 +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5 +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7 +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:9 offset1:10 +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:11 offset1:12 +; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:13 offset1:14 define void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(3)* %in %ext = zext <64 x i16> %load to <64 x i32> Index: test/CodeGen/AMDGPU/load-local-i8.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-i8.ll +++ test/CodeGen/AMDGPU/load-local-i8.ll @@ -122,8 +122,7 @@ } ; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i32: -; GCN: ds_read_u8 -; GCN: ds_read_u8 +; GCN: ds_read_u16 ; EG: LDS_UBYTE_READ_RET ; EG: LDS_UBYTE_READ_RET @@ -137,8 +136,9 @@ ; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i32: ; GCN-NOT: s_wqm_b64 ; GCN: s_mov_b32 m0 -; GCN: ds_read_i8 -; GCN: ds_read_i8 +; GCN: ds_read_u16 +; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 +; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 ; EG-DAG: LDS_UBYTE_READ_RET ; EG-DAG: LDS_UBYTE_READ_RET @@ -189,10 +189,7 @@ ; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i32: ; GCN-NOT: s_wqm_b64 ; GCN: s_mov_b32 m0 -; GCN: ds_read_u8 -; GCN: ds_read_u8 -; GCN: ds_read_u8 -; GCN: ds_read_u8 +; GCN: ds_read_b32 ; EG: LDS_UBYTE_READ_RET ; EG: LDS_UBYTE_READ_RET @@ -208,10 +205,7 @@ ; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i32: ; GCN-NOT: s_wqm_b64 ; GCN: s_mov_b32 m0 -; GCN: ds_read_i8 -; GCN: ds_read_i8 -; GCN: 
ds_read_i8 -; GCN: ds_read_i8 +; GCN: ds_read_b32 ; EG-DAG: LDS_UBYTE_READ_RET ; EG-DAG: LDS_UBYTE_READ_RET Index: test/CodeGen/AMDGPU/load-weird-sizes.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/load-weird-sizes.ll @@ -0,0 +1,31 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}load_i24: +; SI: {{flat|buffer}}_load_ubyte +; SI: {{flat|buffer}}_load_ushort +; SI: {{flat|buffer}}_store_dword +define void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 { + %1 = load i24, i24 addrspace(1)* %in + %2 = zext i24 %1 to i32 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i25: +; SI-NOHSA: buffer_load_dword [[VAL:v[0-9]+]] +; SI-NOHSA: buffer_store_dword [[VAL]] + +; CI-HSA: flat_load_dword [[VAL:v[0-9]+]] +; CI-HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VAL]] +define void @load_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) #0 { + %1 = load i25, i25 addrspace(1)* %in + %2 = zext i25 %1 to i32 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/scalar_to_vector.ll =================================================================== --- test/CodeGen/AMDGPU/scalar_to_vector.ll +++ test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -1,15 +1,14 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - +; XXX - Why the packing? 
; FUNC-LABEL: {{^}}scalar_to_vector_v2i32: ; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: s_endpgm +; SI: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 16, [[VAL]] +; SI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[SHR]] +; SI: v_or_b32_e32 v[[OR:[0-9]+]], [[SHL]], [[SHR]] +; SI: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[OR]] +; SI: buffer_store_dwordx2 v{{\[}}[[OR]]:[[COPY]]{{\]}} define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %tmp1 = load i32, i32 addrspace(1)* %in, align 4 %bc = bitcast i32 %tmp1 to <2 x i16> @@ -21,11 +20,7 @@ ; FUNC-LABEL: {{^}}scalar_to_vector_v2f32: ; SI: buffer_load_dword [[VAL:v[0-9]+]], ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: s_endpgm +; SI: buffer_store_dwordx2 define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { %tmp1 = load float, float addrspace(1)* %in, align 4 %bc = bitcast float %tmp1 to <2 x i16> Index: test/CodeGen/AMDGPU/sext-in-reg.ll =================================================================== --- test/CodeGen/AMDGPU/sext-in-reg.ll +++ test/CodeGen/AMDGPU/sext-in-reg.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone @@ -95,17 +95,6 @@ ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]] -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]] -; EG: LSHL -; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal -; EG: ASHR [[RES_HI]] -; EG-NOT: BFE_INT -; EG: LSHR -; EG: LSHR -;; TODO Check address computation, using | with variables in {{}} does not work, -;; also the _LO/_HI order might be different define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %c = shl i64 %a, %b %shl = shl i64 %c, 56 @@ -121,16 +110,6 @@ ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]] -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]] -; EG: LSHL -; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal -; EG: ASHR [[RES_HI]] -; EG-NOT: BFE_INT -; EG: LSHR -; EG: LSHR -;; TODO Check address computation, using | with variables in {{}} does not work, -;; also the _LO/_HI order might be different define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %c = shl i64 %a, %b %shl = shl i64 %c, 48 @@ -145,17 +124,6 @@ ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} - -; EG: MEM_{{.*}} STORE_{{.*}} 
[[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]] -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]] -; EG-NOT: BFE_INT - -; EG: ASHR [[RES_HI]] - -; EG: LSHR -; EG: LSHR -;; TODO Check address computation, using | with variables in {{}} does not work, -;; also the _LO/_HI order might be different define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %c = shl i64 %a, %b %shl = shl i64 %c, 32 Index: test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll =================================================================== --- test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll +++ test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll @@ -5,11 +5,11 @@ ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} -; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}} -; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] +; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}} +; GCN: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ZERO0]]:[[ZERO1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO0]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm define void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -26,12 +26,12 @@ ; GCN-LABEL: {{^}}v_uextract_bit_63_i128: ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} +; GCN-DAG: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}} +; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}} ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ZERO0]]:[[ZERO1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO0]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm define void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -48,12 +48,12 @@ ; GCN-LABEL: {{^}}v_uextract_bit_95_i128: ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} -; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}} +; GCN-DAG: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}} ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ZERO0]]:[[ZERO1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO0]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, 
s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm define void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -72,10 +72,10 @@ ; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}} +; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}} ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ZERO0]]:[[ZERO1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO0]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm define void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -90,18 +90,16 @@ ; Spans more than 2 dword boundaries ; GCN-LABEL: {{^}}v_uextract_bit_34_100_i128: -; GCN: buffer_load_dwordx2 v{{\[}}[[VAL2:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; GCN: buffer_load_dword v[[VAL1:[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; GCN: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: v_lshl_b64 v{{\[}}[[SHLLO:[0-9]+]]:[[SHLHI:[0-9]+]]{{\]}}, v{{\[}}[[VAL2]]:[[VAL3]]{{\]}}, 30 -; GCN-DAG: v_lshrrev_b32_e32 v[[ELT1PART:[0-9]+]], 2, v[[VAL1]] +; GCN-DAG: v_lshl_b64 v{{\[}}[[SHLLO:[0-9]+]]:[[SHLHI:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, 30 +; GCN-DAG: v_lshrrev_b32_e32 v[[ELT1PART:[0-9]+]], 2, v{{[[0-9]+}} ; GCN-DAG: v_bfe_u32 v[[ELT2PART:[0-9]+]], v[[VAL3]], 2, 2{{$}} ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[SHLLO]], v[[ELT1PART]] ; GCN-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], 0, v[[SHLHI]]{{$}} -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ELT2PART]]:[[ZERO]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[OR0]]:[[OR1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[OR0]]:[[ZERO]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm define void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() Index: test/CodeGen/AMDGPU/store-barrier.ll =================================================================== --- test/CodeGen/AMDGPU/store-barrier.ll +++ test/CodeGen/AMDGPU/store-barrier.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck %s +; RUN: llc -march=amdgcn -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck %s ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck %s ; This test is for a bug in the machine scheduler where stores without @@ -17,10 +17,10 @@ %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp9 %tmp13 = load i32, i32 addrspace(1)* %tmp10, align 2 %tmp14 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp13 - %tmp15 = load <2 x i8>, <2 x i8> addrspace(3)* %tmp14, align 2 + %tmp15 = load <2 x 
i8>, <2 x i8> addrspace(3)* %tmp14, align 1 %tmp16 = add i32 %tmp13, 1 %tmp17 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp16 - store <2 x i8> %tmp15, <2 x i8> addrspace(3)* %tmp17, align 2 + store <2 x i8> %tmp15, <2 x i8> addrspace(3)* %tmp17, align 1 tail call void @llvm.amdgcn.s.barrier() %tmp25 = load i32, i32 addrspace(1)* %tmp10, align 4 %tmp26 = sext i32 %tmp25 to i64 Index: test/CodeGen/AMDGPU/store.ll =================================================================== --- test/CodeGen/AMDGPU/store.ll +++ test/CodeGen/AMDGPU/store.ll @@ -77,12 +77,31 @@ ret void } +; FUNC-LABEL: {{^}}store_i24: +; SI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_short +define void @store_i24(i24 addrspace(1)* %out, i24 %in) { +entry: + store i24 %in, i24 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_i25: +; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x1ffffff{{$}} +; SI: v_mov_b32_e32 [[VAND:v[0-9]+]], [[AND]] +; SI: buffer_store_dword [[VAND]] +define void @store_i25(i25 addrspace(1)* %out, i25 %in) { +entry: + store i25 %in, i25 addrspace(1)* %out + ret void +} + ; FUNC-LABEL: {{^}}store_v2i8: ; EG: MEM_RAT MSKOR ; EG-NOT: MEM_RAT MSKOR -; SI: buffer_store_byte -; SI: buffer_store_byte +; SI: buffer_store_short define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i8> @@ -96,8 +115,7 @@ ; CM: MEM_RAT_CACHELESS STORE_DWORD -; SI: buffer_store_short -; SI: buffer_store_short +; SI: buffer_store_dword define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i16> @@ -110,10 +128,7 @@ ; CM: MEM_RAT_CACHELESS STORE_DWORD -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte +; SI: buffer_store_dword define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i8> @@ -135,17 +150,9 @@ } ; FUNC-LABEL: {{^}}store_v4i16: -; EG: MEM_RAT MSKOR -; EG: MEM_RAT MSKOR -; EG: MEM_RAT MSKOR -; EG: MEM_RAT MSKOR -; EG-NOT: MEM_RAT MSKOR +; MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW -; SI: buffer_store_short -; SI: buffer_store_short -; SI: buffer_store_short -; SI: buffer_store_short -; SI-NOT: buffer_store_byte +; SI: buffer_store_dwordx2 define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i16> @@ -239,8 +246,7 @@ ; CM: LDS_WRITE -; SI: ds_write_b16 -; SI: ds_write_b16 +; SI: ds_write_b32 define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) { entry: store <2 x i16> %in, <2 x i16> addrspace(3)* %out @@ -252,10 +258,7 @@ ; CM: LDS_WRITE -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 +; SI: ds_write_b32 define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { entry: store <4 x i8> %in, <4 x i8> addrspace(3)* %out Index: test/CodeGen/AMDGPU/sub.ll =================================================================== --- test/CodeGen/AMDGPU/sub.ll +++ test/CodeGen/AMDGPU/sub.ll @@ -58,13 +58,11 @@ ; SI: s_sub_u32 ; SI: s_subb_u32 -; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] -; EG-DAG: SUB_INT {{[* ]*}}[[LO]] +; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY +; EG-DAG: SUB_INT {{[* ]*}} ; EG-DAG: SUBB_UINT ; EG-DAG: SUB_INT -; EG-DAG: SUB_INT {{[* ]*}}[[HI]] -; EG-NOT: SUB +; EG-DAG: SUB_INT {{[* ]*}} define void @s_sub_i64(i64 
addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind { %result = sub i64 %a, %b store i64 %result, i64 addrspace(1)* %out, align 8 @@ -75,13 +73,11 @@ ; SI: v_sub_i32_e32 ; SI: v_subb_u32_e32 -; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] -; EG-DAG: SUB_INT {{[* ]*}}[[LO]] +; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY +; EG-DAG: SUB_INT {{[* ]*}} ; EG-DAG: SUBB_UINT ; EG-DAG: SUB_INT -; EG-DAG: SUB_INT {{[* ]*}}[[HI]] -; EG-NOT: SUB +; EG-DAG: SUB_INT {{[* ]*}} define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() readnone %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid @@ -110,13 +106,13 @@ } ; FUNC-LABEL: {{^}}v_test_sub_v4i64: -; SI: v_sub_i32_e32 +; SI: v_subrev_i32_e32 ; SI: v_subb_u32_e32 -; SI: v_sub_i32_e32 +; SI: v_subrev_i32_e32 ; SI: v_subb_u32_e32 -; SI: v_sub_i32_e32 +; SI: v_subrev_i32_e32 ; SI: v_subb_u32_e32 -; SI: v_sub_i32_e32 +; SI: v_subrev_i32_e32 ; SI: v_subb_u32_e32 define void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) { %tid = call i32 @llvm.r600.read.tidig.x() readnone Index: test/CodeGen/AMDGPU/trunc-bitcast-vector.ll =================================================================== --- test/CodeGen/AMDGPU/trunc-bitcast-vector.ll +++ test/CodeGen/AMDGPU/trunc-bitcast-vector.ll @@ -46,9 +46,8 @@ ret void } -; FIXME: Don't want load width reduced here. ; CHECK-LABEL: {{^}}trunc_i16_bitcast_v4i16: -; CHECK: buffer_load_ushort [[VAL:v[0-9]+]] +; CHECK: buffer_load_dword [[VAL:v[0-9]+]] ; CHECK: buffer_store_short [[VAL]] define void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in Index: test/CodeGen/AMDGPU/trunc-store.ll =================================================================== --- test/CodeGen/AMDGPU/trunc-store.ll +++ test/CodeGen/AMDGPU/trunc-store.ll @@ -2,22 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}truncstore_arg_v16i32_to_v16i8: -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte +; SI: buffer_store_dwordx4 define void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i32> %in) { %trunc = trunc <16 x i32> %in to <16 x i8> store <16 x i8> %trunc, <16 x i8> addrspace(1)* %out @@ -25,22 +10,7 @@ } ; FUNC-LABEL: {{^}}truncstore_arg_v16i64_to_v16i8: -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte +; SI: buffer_store_dwordx4 define void @truncstore_arg_v16i64_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i64> %in) { %trunc = trunc <16 x i64> %in to <16 x i8> store <16 x i8> %trunc, <16 x i8> 
addrspace(1)* %out