Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -66,6 +66,7 @@
   SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
 
 protected:
+  SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -78,6 +79,7 @@
 
   static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
   static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT);
+  static EVT getEquivalentBitType(LLVMContext &Context, EVT VT);
 
   virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
                                      SelectionDAG &DAG) const;
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -62,6 +62,14 @@
   return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
 }
 
+EVT AMDGPUTargetLowering::getEquivalentBitType(LLVMContext &Ctx, EVT VT) {
+  unsigned StoreSize = VT.getStoreSizeInBits();
+  if (StoreSize <= 32)
+    return EVT::getIntegerVT(Ctx, StoreSize);
+
+  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
+}
+
 AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
                                            const AMDGPUSubtarget &STI)
     : TargetLowering(TM), Subtarget(&STI) {
@@ -528,6 +536,9 @@
   if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
     return true;
 
+  if (LoadTy.getScalarType() == MVT::i32)
+    return false;
+
   unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits();
   unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits();
 
@@ -2153,54 +2164,100 @@
   return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
 }
 
-static bool usesAllNormalStores(SDNode *LoadVal) {
-  for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) {
-    if (!ISD::isNormalStore(*I))
-      return false;
+static bool hasVolatileUser(SDNode *Val) {
+  for (SDNode *U : Val->uses()) {
+    if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
+      if (M->isVolatile())
+        return true;
+    }
   }
 
-  return true;
+  return false;
 }
 
-// If we have a copy of an illegal type, replace it with a load / store of an
-// equivalently sized legal type. This avoids intermediate bit pack / unpack
-// instructions emitted when handling extloads and truncstores. Ideally we could
-// recognize the pack / unpack pattern to eliminate it.
+// Replace load of an illegal type with a load of a bitcast to a friendlier
+// type.
+SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
+                                                 DAGCombinerInfo &DCI) const {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  LoadSDNode *LN = cast<LoadSDNode>(N);
+  if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
+    return SDValue();
+
+  EVT VT = LN->getValueType(0);
+  if (isTypeLegal(VT) || !VT.isByteSized() || VT.getScalarType() == MVT::i32)
+    return SDValue();
+
+  // TODO: Is Size == 2 also preferable?
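+  // Only sizes that are exactly one dword or a whole multiple of dwords are
+  // handled below; smaller or oddly sized loads fall back to the default
+  // lowering.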
+  unsigned Size = VT.getStoreSize();
+  if (Size < 4 || (Size > 4 && Size % 4 != 0))
+    return SDValue();
+
+  unsigned Align = LN->getAlignment();
+  if (Align < Size) {
+    bool IsFast;
+    if (!allowsMisalignedMemoryAccesses(VT, LN->getAddressSpace(), Align, &IsFast) ||
+        !IsFast) {
+      return SDValue();
+    }
+  }
+
+  SDLoc SL(N);
+  SelectionDAG &DAG = DCI.DAG;
+  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
+
+  SDValue NewLoad
+    = DAG.getLoad(NewVT, SL, LN->getChain(),
+                  LN->getBasePtr(), LN->getMemOperand());
+
+  SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
+  DCI.CombineTo(N, BC, NewLoad.getValue(1));
+  return SDValue(N, 0);
+}
+
+// Replace store of an illegal type with a store of a bitcast to a friendlier
+// type.
 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
   if (!DCI.isBeforeLegalize())
     return SDValue();
 
   StoreSDNode *SN = cast<StoreSDNode>(N);
-  SDValue Value = SN->getValue();
-  EVT VT = Value.getValueType();
+  if (SN->isVolatile() || !ISD::isNormalStore(SN))
+    return SDValue();
 
-  if (isTypeLegal(VT) || SN->isVolatile() ||
-      !ISD::isNormalLoad(Value.getNode()) || VT.getSizeInBits() < 8)
+  SDValue Val = SN->getValue();
+  EVT VT = Val.getValueType();
+  if (isTypeLegal(VT) || !VT.isByteSized())
     return SDValue();
 
-  LoadSDNode *LoadVal = cast<LoadSDNode>(Value);
-  if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal))
+  unsigned Size = VT.getStoreSize();
+  if (Size < 4 || (Size > 4 && Size % 4 != 0))
     return SDValue();
 
-  EVT MemVT = LoadVal->getMemoryVT();
+  unsigned Align = SN->getAlignment();
+  if (Align < Size) {
+    bool IsFast;
+    if (!allowsMisalignedMemoryAccesses(VT, SN->getAddressSpace(), Align, &IsFast) ||
+        !IsFast) {
+      return SDValue();
+    }
+  }
 
   SDLoc SL(N);
   SelectionDAG &DAG = DCI.DAG;
-  EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT);
-
-  SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
-                                LoadVT, SL,
-                                LoadVal->getChain(),
-                                LoadVal->getBasePtr(),
-                                LoadVal->getOffset(),
-                                LoadVT,
-                                LoadVal->getMemOperand());
+  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
 
-  SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0));
-  DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false);
+  bool OtherUses = !Val.hasOneUse();
+  SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
+  if (OtherUses) {
+    SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
+    DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
+  }
 
-  return DAG.getStore(SN->getChain(), SL, NewLoad,
+  return DAG.getStore(SN->getChain(), SL, CastVal,
                       SN->getBasePtr(), SN->getMemOperand());
 }
 
@@ -2637,7 +2694,8 @@
     break;
   }
-
+  case ISD::LOAD:
+    return performLoadCombine(N, DCI);
   case ISD::STORE:
     return performStoreCombine(N, DCI);
   }
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2187,81 +2187,6 @@
     }
   }
 
-  // We are primarily trying to catch operations on illegal vector types
-  // before they are expanded.
-  // For scalars, we can use the more flexible method of checking masked bits
-  // after legalization.
-  if (!DCI.isBeforeLegalize() ||
-      !SrcVT.isVector() ||
-      SrcVT.getVectorElementType() != MVT::i8) {
-    return SDValue();
-  }
-
-  assert(DCI.isBeforeLegalize() && "Unexpected legal type");
-
-  // Weird sized vectors are a pain to handle, but we know 3 is really the same
-  // size as 4.
-  unsigned NElts = SrcVT.getVectorNumElements();
-  if (!SrcVT.isSimple() && NElts != 3)
-    return SDValue();
-
-  // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to
-  // prevent a mess from expanding to v4i32 and repacking.
-  if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
-    EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT);
-    EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT);
-    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts);
-    LoadSDNode *Load = cast<LoadSDNode>(Src);
-
-    unsigned AS = Load->getAddressSpace();
-    unsigned Align = Load->getAlignment();
-    Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext());
-    unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
-
-    // Don't try to replace the load if we have to expand it due to alignment
-    // problems. Otherwise we will end up scalarizing the load, and trying to
-    // repack into the vector for no real reason.
-    if (Align < ABIAlignment &&
-        !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) {
-      return SDValue();
-    }
-
-    SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT,
-                                     Load->getChain(),
-                                     Load->getBasePtr(),
-                                     LoadVT,
-                                     Load->getMemOperand());
-
-    // Make sure successors of the original load stay after it by updating
-    // them to use the new Chain.
-    DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1));
-
-    SmallVector<SDValue, 4> Elts;
-    if (RegVT.isVector())
-      DAG.ExtractVectorElements(NewLoad, Elts);
-    else
-      Elts.push_back(NewLoad);
-
-    SmallVector<SDValue, 4> Ops;
-
-    unsigned EltIdx = 0;
-    for (SDValue Elt : Elts) {
-      unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx);
-      for (unsigned I = 0; I < ComponentsInElt; ++I) {
-        unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I;
-        SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt);
-        DCI.AddToWorklist(Cvt.getNode());
-        Ops.push_back(Cvt);
-      }
-
-      ++EltIdx;
-    }
-
-    assert(Ops.size() == NElts);
-
-    return DAG.getBuildVector(FloatVT, DL, Ops);
-  }
-
   return SDValue();
 }
Index: test/CodeGen/AMDGPU/add.ll
===================================================================
--- test/CodeGen/AMDGPU/add.ll
+++ test/CodeGen/AMDGPU/add.ll
@@ -123,12 +123,11 @@
 ; SI: s_add_u32
 ; SI: s_addc_u32
 
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
-; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
-; EG-DAG: ADD_INT {{[* ]*}}[[LO]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
+; EG-DAG: ADD_INT {{[* ]*}}
 ; EG-DAG: ADDC_UINT
 ; EG-DAG: ADD_INT
-; EG-DAG: ADD_INT {{[* ]*}}[[HI]]
+; EG-DAG: ADD_INT {{[* ]*}}
 ; EG-NOT: SUB
 define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
@@ -145,12 +144,11 @@
 ; FUNC-LABEL: {{^}}add64_sgpr_vgpr:
 ; SI-NOT: v_addc_u32_e32 s
 
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
-; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
-; EG-DAG: ADD_INT {{[* ]*}}[[LO]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
+; EG-DAG: ADD_INT {{[* ]*}}
 ; EG-DAG: ADDC_UINT
 ; EG-DAG: ADD_INT
-; EG-DAG: ADD_INT {{[* ]*}}[[HI]]
+; EG-DAG: ADD_INT {{[* ]*}}
 ; EG-NOT: SUB
 define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
 entry:
@@ -165,12 +163,11 @@
 ; SI: s_add_u32
 ; SI: s_addc_u32
 
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
-; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
-; EG-DAG: ADD_INT {{[* ]*}}[[LO]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
+; EG-DAG: ADD_INT {{[* ]*}}
 ; EG-DAG: ADDC_UINT
 ; EG-DAG: ADD_INT
-; EG-DAG: ADD_INT {{[* ]*}}[[HI]]
+; EG-DAG: ADD_INT {{[* ]*}}
 ;
EG-NOT: SUB define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { entry: Index: test/CodeGen/AMDGPU/copy-illegal-type.ll =================================================================== --- test/CodeGen/AMDGPU/copy-illegal-type.ll +++ test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -54,31 +54,12 @@ } ; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte - -; After scalarizing v4i8 loads is fixed. -; XSI: buffer_load_dword -; XSI: V_BFE -; XSI: V_ADD -; XSI: V_ADD -; XSI: V_ADD -; XSI: buffer_store_dword -; XSI: buffer_store_dword +; SI: buffer_load_dword +; SI-DAG: v_lshrrev_b32 +; SI: v_and_b32 +; SI: v_or_b32 +; SI-DAG: buffer_store_dword +; SI-DAG: buffer_store_dword ; SI: s_endpgm define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { @@ -90,34 +71,14 @@ } ; FUNC-LABEL: {{^}}test_copy_v4i8_x2_extra_use: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte - -; XSI: buffer_load_dword -; XSI: BFE -; XSI: buffer_store_dword -; XSI: V_ADD -; XSI: buffer_store_dword -; XSI-NEXT: buffer_store_dword - +; SI: buffer_load_dword +; SI-DAG: v_lshrrev_b32 +; SI-DAG: v_add_i32 +; SI-DAG: v_and_b32 +; SI-DAG: v_or_b32 +; SI-DAG: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword ; SI: s_endpgm define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 @@ -128,21 +89,50 @@ ret void } -; FUNC-LABEL: {{^}}test_copy_v3i8: -; SI-NOT: bfe -; SI-NOT: bfi +; FUNC-LABEL: {{^}}test_copy_v3i8_align4: +; SI: buffer_load_dword +; SI-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; SI-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} ; SI: s_endpgm -define void @test_copy_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { +define void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4 ret void } +; FUNC-LABEL: {{^}}test_copy_v3i8_align2: +; SI-DAG: buffer_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; SI-DAG: buffer_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} +; SI-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; SI-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} +; SI: s_endpgm +define void @test_copy_v3i8_align2(<3 x i8> 
addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { + %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 2 + store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 2 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v3i8_align1: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte + +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: s_endpgm +define void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { + %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1 + store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1 + ret void +} + ; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load: ; SI: buffer_load_ubyte ; SI: buffer_load_ubyte ; SI: buffer_load_ubyte ; SI: buffer_load_ubyte +; SI: buffer_store_dword ; SI: s_endpgm define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 Index: test/CodeGen/AMDGPU/ctpop64.ll =================================================================== --- test/CodeGen/AMDGPU/ctpop64.ll +++ test/CodeGen/AMDGPU/ctpop64.ll @@ -170,16 +170,15 @@ ; FIXME: Should not have extra add ; FUNC-LABEL: {{^}}v_ctpop_i128: -; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VAL2:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}} +; GCN: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT0:v[0-9]+]], v[[VAL2]], 0 -; GCN: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]] +; GCN-DAG: v_bcnt_u32_b32_e64 [[MIDRESULT0:v[0-9]+]], v{{[0-9]+}}, 0 +; GCN-DAG: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]] -; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT2:v[0-9]+]], v[[VAL0]], 0 -; GCN: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT3:v[0-9]+]], v[[VAL1]], [[MIDRESULT2]] +; GCN-DAG: v_bcnt_u32_b32_e64 [[MIDRESULT2:v[0-9]+]], v[[VAL0]], 0 +; GCN-DAG: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT3:v[0-9]+]], v{{[0-9]+}}, [[MIDRESULT2]] -; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, [[MIDRESULT2]], [[MIDRESULT1]] +; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, [[MIDRESULT1]], [[MIDRESULT2]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm Index: test/CodeGen/AMDGPU/cvt_f32_ubyte.ll =================================================================== --- test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -14,13 +14,12 @@ ret void } +; XXX - Is ushort load better? 
; SI-LABEL: {{^}}load_v2i8_to_v2f32: -; SI: buffer_load_ushort [[LOADREG:v[0-9]+]], -; SI-NOT: bfe -; SI-NOT: lshr -; SI-NOT: and -; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]] -; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]] +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]] +; SI: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]] ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind { %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2 @@ -30,11 +29,11 @@ } ; SI-LABEL: {{^}}load_v3i8_to_v3f32: -; SI-NOT: bfe +; SI: buffer_load_dword [[VAL:v[0-9]+]] ; SI-NOT: v_cvt_f32_ubyte3_e32 -; SI-DAG: v_cvt_f32_ubyte2_e32 -; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], -; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], +; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]] +; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]] ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 @@ -83,26 +82,25 @@ ret void } -; XXX - This should really still be able to use the v_cvt_f32_ubyte0 -; for each component, but computeKnownBits doesn't handle vectors very -; well. - +; Instructions still emitted to repack bytes for add use. ; SI-LABEL: {{^}}load_v4i8_to_v4f32_2_uses: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: v_cvt_f32_ubyte0_e32 -; SI: v_cvt_f32_ubyte0_e32 -; SI: v_cvt_f32_ubyte0_e32 -; SI: v_cvt_f32_ubyte0_e32 - -; XXX - replace with this when v4i8 loads aren't scalarized anymore. 
-; XSI: buffer_load_dword -; XSI: v_cvt_f32_u32_e32 -; XSI: v_cvt_f32_u32_e32 -; XSI: v_cvt_f32_u32_e32 -; XSI: v_cvt_f32_u32_e32 +; SI: buffer_load_dword +; SI-DAG: v_cvt_f32_ubyte0_e32 +; SI-DAG: v_cvt_f32_ubyte1_e32 +; SI-DAG: v_cvt_f32_ubyte2_e32 +; SI-DAG: v_cvt_f32_ubyte3_e32 + +; SI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 24 +; SI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16 +; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16 +; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 8 +; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, +; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff00, +; SI-DAG: v_add_i32 + +; SI: buffer_store_dwordx4 +; SI: buffer_store_dword + ; SI: s_endpgm define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 Index: test/CodeGen/AMDGPU/half.ll =================================================================== --- test/CodeGen/AMDGPU/half.ll +++ test/CodeGen/AMDGPU/half.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; half args should be promoted to float @@ -15,8 +15,9 @@ ; GCN-LABEL: {{^}}load_v2f16_arg: ; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 ; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46 -; GCN-DAG: buffer_store_short [[V0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_store_short [[V1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} +; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]] +; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]] +; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN: s_endpgm define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { store <2 x half> %arg, <2 x half> addrspace(1)* %out @@ -42,10 +43,7 @@ ; GCN: buffer_load_ushort ; GCN: buffer_load_ushort ; GCN: buffer_load_ushort -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short +; GCN: buffer_store_dwordx2 ; GCN: s_endpgm define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 { store <4 x half> %arg, <4 x half> addrspace(1)* %out @@ -280,11 +278,11 @@ } ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32: -; GCN-DAG: buffer_load_ushort [[LOAD0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_load_ushort [[LOAD1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} -; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD0]] -; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD1]] -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}} +; GCN: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]] +; GCN: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] +; GCN: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]] +; GCN: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}} ; GCN: s_endpgm define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %val = load <2 x half>, <2 x half> addrspace(1)* %in @@ -318,22 +316,8 @@ } ; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f32: -; GCN: buffer_load_ushort -; GCN: 
buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 ; GCN: v_cvt_f32_f16_e32 ; GCN: v_cvt_f32_f16_e32 @@ -378,10 +362,10 @@ } ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64: -; GCN-DAG: buffer_load_ushort [[LOAD0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_load_ushort [[LOAD1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} -; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD0]] -; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD1]] +; GCN-DAG: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] +; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]] +; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]] ; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]] ; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]] ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}} @@ -455,8 +439,9 @@ ; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} ; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]] ; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]] -; GCN-DAG: buffer_store_short [[CVT0]] -; GCN-DAG: buffer_store_short [[CVT1]] +; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]] +; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[SHL]] +; GCN-DAG: buffer_store_dword [[PACKED]] ; GCN: s_endpgm define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { %val = load <2 x float>, <2 x float> addrspace(1)* %in @@ -487,10 +472,7 @@ ; GCN: v_cvt_f16_f32_e32 ; GCN: v_cvt_f16_f32_e32 ; GCN: v_cvt_f16_f32_e32 -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short +; GCN: buffer_store_dwordx2 ; GCN: s_endpgm define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { %val = load <4 x float>, <4 x float> addrspace(1)* %in @@ -510,14 +492,7 @@ ; GCN: v_cvt_f16_f32_e32 ; GCN: v_cvt_f16_f32_e32 ; GCN: v_cvt_f16_f32_e32 -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short +; GCN: buffer_store_dwordx4 ; GCN: s_endpgm define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { %val = load <8 x float>, <8 x float> addrspace(1)* %in @@ -547,22 +522,8 @@ ; GCN-DAG: v_cvt_f16_f32_e32 ; GCN-DAG: v_cvt_f16_f32_e32 ; GCN-DAG: v_cvt_f16_f32_e32 -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short +; 
GCN-DAG: buffer_store_dwordx4 +; GCN-DAG: buffer_store_dwordx4 ; GCN: s_endpgm define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { %val = load <16 x float>, <16 x float> addrspace(1)* %in Index: test/CodeGen/AMDGPU/load.ll =================================================================== --- test/CodeGen/AMDGPU/load.ll +++ test/CodeGen/AMDGPU/load.ll @@ -1,8 +1,8 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s ;===------------------------------------------------------------------------===; ; GLOBAL ADDRESS SPACE @@ -35,6 +35,30 @@ ret void } +; FUNC-LABEL: {{^}}load_i24: +; SI: {{flat|buffer}}_load_ubyte +; SI: {{flat|buffer}}_load_ushort +; SI: {{flat|buffer}}_store_dword +define void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) { + %1 = load i24, i24 addrspace(1)* %in + %2 = zext i24 %1 to i32 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i25: +; SI-NOHSA: buffer_load_dword [[VAL:v[0-9]+]] +; SI-NOHSA: buffer_store_dword [[VAL]] + +; CI-HSA: flat_load_dword [[VAL:v[0-9]+]] +; CI-HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VAL]] +define void @load_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) { + %1 = load i25, i25 addrspace(1)* %in + %2 = zext i25 %1 to i32 + store i32 %2, i32 addrspace(1)* %out + ret void +} + ; FUNC-LABEL: {{^}}load_v2i8: ; R600: VTX_READ_8 ; R600: VTX_READ_8 @@ -75,17 +99,42 @@ ; R600: VTX_READ_8 ; R600: VTX_READ_8 ; R600: VTX_READ_8 +; SI-NOHSA: buffer_load_dword +; CI-HSA: flat_load_dword + +; SI-DAG: v_lshrrev_b32_e32 +; SI-DAG: v_and_b32_e32 +; SI-DAG: v_bfe_u32 +; SI-DAG: v_bfe_u32 + +; SI-NOHSA: buffer_store_dword +; CI-HSA: flat_store_dword +define void @load_v4i8(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) { +entry: + %0 = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %1 = zext <4 x i8> %0 to <4 x i32> + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v4i8_align1: +; R600: VTX_READ_8 +; R600: VTX_READ_8 +; R600: VTX_READ_8 +; R600: VTX_READ_8 + ; SI-NOHSA: buffer_load_ubyte ; SI-NOHSA: buffer_load_ubyte ; SI-NOHSA: buffer_load_ubyte ; SI-NOHSA: buffer_load_ubyte + ; CI-HSA: flat_load_ubyte ; CI-HSA: flat_load_ubyte ; CI-HSA: flat_load_ubyte ; CI-HSA: flat_load_ubyte -define void @load_v4i8(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) { +define void @load_v4i8_align1(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) { entry: - %0 = load <4 x i8>, <4 x i8> 
addrspace(1)* %in + %0 = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1 %1 = zext <4 x i8> %0 to <4 x i32> store <4 x i32> %1, <4 x i32> addrspace(1)* %out ret void @@ -104,14 +153,16 @@ ; R600-DAG: 8 ; R600-DAG: 8 ; R600-DAG: 8 -; SI-NOHSA: buffer_load_sbyte -; SI-NOHSA: buffer_load_sbyte -; SI-NOHSA: buffer_load_sbyte -; SI-NOHSA: buffer_load_sbyte -; CI-HSA: flat_load_sbyte -; CI-HSA: flat_load_sbyte -; CI-HSA: flat_load_sbyte -; CI-HSA: flat_load_sbyte + +; SI-NOHSA: buffer_load_dword [[VAL:v[0-9]+]] +; CI-HSA: flat_load_dword [[VAL:v[0-9]+]] +; SI-DAG: v_ashrrev_i32_e32 +; SI-DAG: v_bfe_i32 +; SI-DAG: v_bfe_i32 +; SI-DAG: v_bfe_i32 + +; SI-NOHSA: buffer_store_dwordx4 +; CI-HSA: flat_store_dwordx4 define void @load_v4i8_sext(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) { entry: %0 = load <4 x i8>, <4 x i8> addrspace(1)* %in @@ -150,10 +201,12 @@ ; FUNC-LABEL: {{^}}load_v2i16: ; R600: VTX_READ_16 ; R600: VTX_READ_16 -; SI-NOHSA: buffer_load_ushort -; SI-NOHSA: buffer_load_ushort -; CI-HSA: flat_load_ushort -; CI-HSA: flat_load_ushort + +; SI-NOHSA: buffer_load_dword +; CI-HSA: flat_load_dword + +; SI-NOHSA: buffer_store_dwordx2 +; CI-HSA: flat_store_dwordx2 define void @load_v2i16(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { entry: %0 = load <2 x i16>, <2 x i16> addrspace(1)* %in @@ -169,10 +222,14 @@ ; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal ; R600-DAG: 16 ; R600-DAG: 16 -; SI-NOHSA: buffer_load_sshort -; SI-NOHSA: buffer_load_sshort -; CI-HSA: flat_load_sshort -; CI-HSA: flat_load_sshort +; SI-NOHSA: buffer_load_dword +; CI-HSA: flat_load_dword + +; SI-DAG: v_ashr +; SI-DAG: v_bfe_i32 + +; SI-NOHSA: buffer_store_dwordx2 +; CI-HSA: flat_store_dwordx2 define void @load_v2i16_sext(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { entry: %0 = load <2 x i16>, <2 x i16> addrspace(1)* %in @@ -186,14 +243,17 @@ ; R600: VTX_READ_16 ; R600: VTX_READ_16 ; R600: VTX_READ_16 -; SI-NOHSA: buffer_load_ushort -; SI-NOHSA: buffer_load_ushort -; SI-NOHSA: buffer_load_ushort -; SI-NOHSA: buffer_load_ushort -; CI-HSA: flat_load_ushort -; CI-HSA: flat_load_ushort -; CI-HSA: flat_load_ushort -; CI-HSA: flat_load_ushort + +; SI-NOHSA: buffer_load_dwordx2 +; CI-HSA: flat_load_dwordx2 + +; SI-DAG: v_lshrrev_b32 +; SI-DAG: v_lshrrev_b32 +; SI-DAG: v_and_b32_e32 +; SI-DAG: v_and_b32_e32 + +; SI-NOHSA: buffer_store_dwordx4 +; CI-HSA: flat_store_dwordx4 define void @load_v4i16(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { entry: %0 = load <4 x i16>, <4 x i16> addrspace(1)* %in @@ -215,14 +275,17 @@ ; R600-DAG: 16 ; R600-DAG: 16 ; R600-DAG: 16 -; SI-NOHSA: buffer_load_sshort -; SI-NOHSA: buffer_load_sshort -; SI-NOHSA: buffer_load_sshort -; SI-NOHSA: buffer_load_sshort -; CI-HSA: flat_load_sshort -; CI-HSA: flat_load_sshort -; CI-HSA: flat_load_sshort -; CI-HSA: flat_load_sshort + +; SI-NOHSA: buffer_load_dwordx2 +; CI-HSA: flat_load_dwordx2 + +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_{{ashr|ashrrev}}_i64 +; SI-DAG: v_bfe_i32 +; SI-DAG: v_bfe_i32 + +; SI-NOHSA: buffer_store_dwordx4 +; CI-HSA: flat_store_dwordx4 define void @load_v4i16_sext(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { entry: %0 = load <4 x i16>, <4 x i16> addrspace(1)* %in @@ -282,9 +345,9 @@ } ; FUNC-LABEL: {{^}}load_i64_sext: -; R600: MEM_RAT -; R600: MEM_RAT -; R600: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.x +; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY +; CM: MEM_RAT_CACHELESS STORE_DWORD +; R600: ASHR {{[* 
]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.y ; R600: 31 ; SI-NOHSA: buffer_load_dword ; CI-HSA: flat_load_dword @@ -298,8 +361,8 @@ } ; FUNC-LABEL: {{^}}load_i64_zext: -; R600: MEM_RAT -; R600: MEM_RAT +; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY +; CM: MEM_RAT_CACHELESS STORE_DWORD define void @load_i64_zext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %0 = load i32, i32 addrspace(1)* %in @@ -523,12 +586,10 @@ ; R600: LDS_UBYTE_READ_RET ; R600: LDS_UBYTE_READ_RET ; R600: LDS_UBYTE_READ_RET + ; SI-NOT: s_wqm_b64 ; SI: s_mov_b32 m0 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 +; SI: ds_read_b32 define void @load_v4i8_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) { entry: %0 = load <4 x i8>, <4 x i8> addrspace(3)* %in @@ -546,12 +607,17 @@ ; R600-DAG: BFE_INT ; R600-DAG: BFE_INT ; R600-DAG: BFE_INT + ; SI-NOT: s_wqm_b64 ; SI: s_mov_b32 m0 -; SI: ds_read_i8 -; SI: ds_read_i8 -; SI: ds_read_i8 -; SI: ds_read_i8 +; SI: ds_read_b32 + +; SI-DAG: v_ashrrev_i32_e32 +; SI-DAG: v_bfe_i32 +; SI-DAG: v_bfe_i32 +; SI-DAG: v_bfe_i32 + +; SI: s_endpgm define void @load_v4i8_sext_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) { entry: %0 = load <4 x i8>, <4 x i8> addrspace(3)* %in @@ -577,6 +643,7 @@ ; FUNC-LABEL: {{^}}load_i16_sext_local: ; R600: LDS_USHORT_READ_RET ; R600: BFE_INT + ; SI-NOT: s_wqm_b64 ; SI: s_mov_b32 m0 ; SI: ds_read_i16 @@ -591,10 +658,15 @@ ; FUNC-LABEL: {{^}}load_v2i16_local: ; R600: LDS_USHORT_READ_RET ; R600: LDS_USHORT_READ_RET + ; SI-NOT: s_wqm_b64 ; SI: s_mov_b32 m0 -; SI: ds_read_u16 -; SI: ds_read_u16 +; SI: ds_read_b32 + +; SI-DAG: v_lshrrev_b32 +; SI-DAG: v_and_b32_e32 + +; SI: s_endpgm define void @load_v2i16_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) { entry: %0 = load <2 x i16>, <2 x i16> addrspace(3)* %in @@ -608,10 +680,13 @@ ; R600-DAG: LDS_USHORT_READ_RET ; R600-DAG: BFE_INT ; R600-DAG: BFE_INT + ; SI-NOT: s_wqm_b64 ; SI: s_mov_b32 m0 -; SI: ds_read_i16 -; SI: ds_read_i16 +; SI: ds_read_b32 +; SI-DAG: v_ashrrev_i32_e32 +; SI-DAG: v_bfe_i32 +; SI: s_endpgm define void @load_v2i16_sext_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) { entry: %0 = load <2 x i16>, <2 x i16> addrspace(3)* %in @@ -625,12 +700,15 @@ ; R600: LDS_USHORT_READ_RET ; R600: LDS_USHORT_READ_RET ; R600: LDS_USHORT_READ_RET + ; SI-NOT: s_wqm_b64 ; SI: s_mov_b32 m0 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 +; SI: ds_read_b64 +; SI-DAG: v_lshrrev_b32_e32 +; SI-DAG: v_lshrrev_b32_e32 +; SI-DAG: v_and_b32_e32 +; SI-DAG: v_and_b32_e32 +; SI: s_endpgm define void @load_v4i16_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) { entry: %0 = load <4 x i16>, <4 x i16> addrspace(3)* %in @@ -648,12 +726,16 @@ ; R600-DAG: BFE_INT ; R600-DAG: BFE_INT ; R600-DAG: BFE_INT + ; SI-NOT: s_wqm_b64 ; SI: s_mov_b32 m0 -; SI: ds_read_i16 -; SI: ds_read_i16 -; SI: ds_read_i16 -; SI: ds_read_i16 +; SI: ds_read_b64 +; SI-DAG: v_ashrrev_i32_e32 +; SI-DAG: v_ashrrev_i32_e32 +; SI-DAG: v_bfe_i32 +; SI-DAG: v_bfe_i32 + +; SI: s_endpgm define void @load_v4i16_sext_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) { entry: %0 = load <4 x i16>, <4 x i16> addrspace(3)* %in Index: test/CodeGen/AMDGPU/scalar_to_vector.ll =================================================================== --- test/CodeGen/AMDGPU/scalar_to_vector.ll +++ test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -1,15 +1,14 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < 
%s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - +; XXX - Why the packing? ; FUNC-LABEL: {{^}}scalar_to_vector_v2i32: ; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: s_endpgm +; SI: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 16, [[VAL]] +; SI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[SHR]] +; SI: v_or_b32_e32 v[[OR:[0-9]+]], [[SHL]], [[SHR]] +; SI: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[OR]] +; SI: buffer_store_dwordx2 v{{\[}}[[OR]]:[[COPY]]{{\]}} define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %tmp1 = load i32, i32 addrspace(1)* %in, align 4 %bc = bitcast i32 %tmp1 to <2 x i16> @@ -21,11 +20,7 @@ ; FUNC-LABEL: {{^}}scalar_to_vector_v2f32: ; SI: buffer_load_dword [[VAL:v[0-9]+]], ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: s_endpgm +; SI: buffer_store_dwordx2 define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { %tmp1 = load float, float addrspace(1)* %in, align 4 %bc = bitcast float %tmp1 to <2 x i16> Index: test/CodeGen/AMDGPU/sext-in-reg.ll =================================================================== --- test/CodeGen/AMDGPU/sext-in-reg.ll +++ test/CodeGen/AMDGPU/sext-in-reg.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone @@ -95,17 +95,6 @@ ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]] -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]] -; EG: LSHL -; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal -; EG: ASHR [[RES_HI]] -; EG-NOT: BFE_INT -; EG: LSHR -; EG: LSHR -;; TODO Check address computation, using | with variables in {{}} does not work, -;; also the _LO/_HI order might be different define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %c = shl i64 %a, %b %shl = shl i64 %c, 56 @@ -121,16 +110,6 @@ ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]] -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]] -; EG: LSHL -; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal -; EG: ASHR [[RES_HI]] -; EG-NOT: BFE_INT -; EG: LSHR -; EG: LSHR -;; TODO Check address computation, using | with variables in {{}} does not work, -;; also the _LO/_HI order might be different define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* 
%out, i64 %a, i64 %b) nounwind { %c = shl i64 %a, %b %shl = shl i64 %c, 48 @@ -145,17 +124,6 @@ ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]] -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]] -; EG-NOT: BFE_INT - -; EG: ASHR [[RES_HI]] - -; EG: LSHR -; EG: LSHR -;; TODO Check address computation, using | with variables in {{}} does not work, -;; also the _LO/_HI order might be different define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %c = shl i64 %a, %b %shl = shl i64 %c, 32 Index: test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll =================================================================== --- test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll +++ test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll @@ -5,11 +5,11 @@ ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} -; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}} -; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] +; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}} +; GCN: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ZERO0]]:[[ZERO1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO0]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm define void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -26,12 +26,12 @@ ; GCN-LABEL: {{^}}v_uextract_bit_63_i128: ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} +; GCN-DAG: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}} +; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}} ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ZERO0]]:[[ZERO1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO0]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm define void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -48,12 +48,12 @@ ; GCN-LABEL: {{^}}v_uextract_bit_95_i128: ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} -; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}} +; GCN-DAG: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}} ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ZERO0]]:[[ZERO1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, 
s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO0]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm define void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -72,10 +72,10 @@ ; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}} +; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}} ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ZERO0]]:[[ZERO1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO0]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm define void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -90,18 +90,16 @@ ; Spans more than 2 dword boundaries ; GCN-LABEL: {{^}}v_uextract_bit_34_100_i128: -; GCN: buffer_load_dwordx2 v{{\[}}[[VAL2:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; GCN: buffer_load_dword v[[VAL1:[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; GCN: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: v_lshl_b64 v{{\[}}[[SHLLO:[0-9]+]]:[[SHLHI:[0-9]+]]{{\]}}, v{{\[}}[[VAL2]]:[[VAL3]]{{\]}}, 30 -; GCN-DAG: v_lshrrev_b32_e32 v[[ELT1PART:[0-9]+]], 2, v[[VAL1]] +; GCN-DAG: v_lshl_b64 v{{\[}}[[SHLLO:[0-9]+]]:[[SHLHI:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, 30 +; GCN-DAG: v_lshrrev_b32_e32 v[[ELT1PART:[0-9]+]], 2, v{{[[0-9]+}} ; GCN-DAG: v_bfe_u32 v[[ELT2PART:[0-9]+]], v[[VAL3]], 2, 2{{$}} ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[SHLLO]], v[[ELT1PART]] ; GCN-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], 0, v[[SHLHI]]{{$}} -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ELT2PART]]:[[ZERO]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[OR0]]:[[OR1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[OR0]]:[[ZERO]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm define void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() Index: test/CodeGen/AMDGPU/store-barrier.ll =================================================================== --- test/CodeGen/AMDGPU/store-barrier.ll +++ test/CodeGen/AMDGPU/store-barrier.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck %s +; RUN: llc -march=amdgcn -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck %s ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck %s ; This test is for a bug in the machine scheduler where stores without @@ -17,10 +17,10 @@ %tmp10 = getelementptr 
inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp9 %tmp13 = load i32, i32 addrspace(1)* %tmp10, align 2 %tmp14 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp13 - %tmp15 = load <2 x i8>, <2 x i8> addrspace(3)* %tmp14, align 2 + %tmp15 = load <2 x i8>, <2 x i8> addrspace(3)* %tmp14, align 1 %tmp16 = add i32 %tmp13, 1 %tmp17 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp16 - store <2 x i8> %tmp15, <2 x i8> addrspace(3)* %tmp17, align 2 + store <2 x i8> %tmp15, <2 x i8> addrspace(3)* %tmp17, align 1 tail call void @llvm.amdgcn.s.barrier() %tmp25 = load i32, i32 addrspace(1)* %tmp10, align 4 %tmp26 = sext i32 %tmp25 to i64 Index: test/CodeGen/AMDGPU/store.ll =================================================================== --- test/CodeGen/AMDGPU/store.ll +++ test/CodeGen/AMDGPU/store.ll @@ -77,6 +77,26 @@ ret void } +; FUNC-LABEL: {{^}}store_i24: +; SI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_short +define void @store_i24(i24 addrspace(1)* %out, i24 %in) { +entry: + store i24 %in, i24 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_i25: +; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x1ffffff{{$}} +; SI: v_mov_b32_e32 [[VAND:v[0-9]+]], [[AND]] +; SI: buffer_store_dword [[VAND]] +define void @store_i25(i25 addrspace(1)* %out, i25 %in) { +entry: + store i25 %in, i25 addrspace(1)* %out + ret void +} + ; FUNC-LABEL: {{^}}store_v2i8: ; EG: MEM_RAT MSKOR ; EG-NOT: MEM_RAT MSKOR @@ -96,8 +116,7 @@ ; CM: MEM_RAT_CACHELESS STORE_DWORD -; SI: buffer_store_short -; SI: buffer_store_short +; SI: buffer_store_dword define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i16> @@ -110,10 +129,7 @@ ; CM: MEM_RAT_CACHELESS STORE_DWORD -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte +; SI: buffer_store_dword define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i8> @@ -135,17 +151,9 @@ } ; FUNC-LABEL: {{^}}store_v4i16: -; EG: MEM_RAT MSKOR -; EG: MEM_RAT MSKOR -; EG: MEM_RAT MSKOR -; EG: MEM_RAT MSKOR -; EG-NOT: MEM_RAT MSKOR +; MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW -; SI: buffer_store_short -; SI: buffer_store_short -; SI: buffer_store_short -; SI: buffer_store_short -; SI-NOT: buffer_store_byte +; SI: buffer_store_dwordx2 define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i16> @@ -239,8 +247,7 @@ ; CM: LDS_WRITE -; SI: ds_write_b16 -; SI: ds_write_b16 +; SI: ds_write_b32 define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) { entry: store <2 x i16> %in, <2 x i16> addrspace(3)* %out @@ -252,10 +259,7 @@ ; CM: LDS_WRITE -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 +; SI: ds_write_b32 define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { entry: store <4 x i8> %in, <4 x i8> addrspace(3)* %out Index: test/CodeGen/AMDGPU/sub.ll =================================================================== --- test/CodeGen/AMDGPU/sub.ll +++ test/CodeGen/AMDGPU/sub.ll @@ -58,13 +58,11 @@ ; SI: s_sub_u32 ; SI: s_subb_u32 -; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] -; EG-DAG: SUB_INT {{[* ]*}}[[LO]] +; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY +; EG-DAG: SUB_INT {{[* ]*}} ; EG-DAG: SUBB_UINT ; EG-DAG: SUB_INT -; EG-DAG: SUB_INT {{[* 
]*}}[[HI]] -; EG-NOT: SUB +; EG-DAG: SUB_INT {{[* ]*}} define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind { %result = sub i64 %a, %b store i64 %result, i64 addrspace(1)* %out, align 8 @@ -75,13 +73,11 @@ ; SI: v_sub_i32_e32 ; SI: v_subb_u32_e32 -; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] -; EG-DAG: SUB_INT {{[* ]*}}[[LO]] +; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY +; EG-DAG: SUB_INT {{[* ]*}} ; EG-DAG: SUBB_UINT ; EG-DAG: SUB_INT -; EG-DAG: SUB_INT {{[* ]*}}[[HI]] -; EG-NOT: SUB +; EG-DAG: SUB_INT {{[* ]*}} define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() readnone %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid @@ -110,13 +106,13 @@ } ; FUNC-LABEL: {{^}}v_test_sub_v4i64: -; SI: v_sub_i32_e32 +; SI: v_subrev_i32_e32 ; SI: v_subb_u32_e32 -; SI: v_sub_i32_e32 +; SI: v_subrev_i32_e32 ; SI: v_subb_u32_e32 -; SI: v_sub_i32_e32 +; SI: v_subrev_i32_e32 ; SI: v_subb_u32_e32 -; SI: v_sub_i32_e32 +; SI: v_subrev_i32_e32 ; SI: v_subb_u32_e32 define void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) { %tid = call i32 @llvm.r600.read.tidig.x() readnone Index: test/CodeGen/AMDGPU/trunc-bitcast-vector.ll =================================================================== --- test/CodeGen/AMDGPU/trunc-bitcast-vector.ll +++ test/CodeGen/AMDGPU/trunc-bitcast-vector.ll @@ -46,9 +46,8 @@ ret void } -; FIXME: Don't want load width reduced here. ; CHECK-LABEL: {{^}}trunc_i16_bitcast_v4i16: -; CHECK: buffer_load_ushort [[VAL:v[0-9]+]] +; CHECK: buffer_load_dword [[VAL:v[0-9]+]] ; CHECK: buffer_store_short [[VAL]] define void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in Index: test/CodeGen/AMDGPU/trunc-store.ll =================================================================== --- test/CodeGen/AMDGPU/trunc-store.ll +++ test/CodeGen/AMDGPU/trunc-store.ll @@ -2,22 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}truncstore_arg_v16i32_to_v16i8: -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte +; SI: buffer_store_dwordx4 define void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i32> %in) { %trunc = trunc <16 x i32> %in to <16 x i8> store <16 x i8> %trunc, <16 x i8> addrspace(1)* %out @@ -25,22 +10,7 @@ } ; FUNC-LABEL: {{^}}truncstore_arg_v16i64_to_v16i8: -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte +; SI: buffer_store_dwordx4 define void @truncstore_arg_v16i64_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i64> 
%in) { %trunc = trunc <16 x i64> %in to <16 x i8> store <16 x i8> %trunc, <16 x i8> addrspace(1)* %out