Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUCallingConv.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -110,9 +110,10 @@ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>, - CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>, + CCIfType<[i64, f64, v2i32, v2f32, v3i32, v3f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>, CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>, CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>, + CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>, CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>, CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>, CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>> Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -540,6 +540,8 @@ return AMDGPU::SReg_32_XM0RegClassID; case 2: return AMDGPU::SReg_64RegClassID; + case 3: + return AMDGPU::SGPR_96RegClassID; case 4: return AMDGPU::SReg_128RegClassID; case 8: Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -110,9 +110,23 @@ SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const; SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const; + /// Split a vector type into two parts. The first part is a power of two + /// vector. The second part is whatever is left over, and is a scalar if it + /// would otherwise be a 1-vector. + std::pair getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const; + + /// Split a vector value into two parts of types LoVT and HiVT. HiVT could be + /// scalar. + std::pair splitVector(const SDValue &N, const SDLoc &DL, + const EVT &LoVT, const EVT &HighVT, + SelectionDAG &DAG) const; + /// Split a vector load into 2 loads of half the vector. SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const; + /// Widen a vector load from vec3 to vec4. + SDValue WidenVectorLoad(SDValue Op, SelectionDAG &DAG) const; + /// Split a vector store into 2 stores of half the vector. 
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const; Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -150,6 +150,9 @@ setOperationAction(ISD::LOAD, MVT::v2f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); + setOperationAction(ISD::LOAD, MVT::v3f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32); + setOperationAction(ISD::LOAD, MVT::v4f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); @@ -235,6 +238,9 @@ setOperationAction(ISD::STORE, MVT::v2f32, Promote); AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32); + setOperationAction(ISD::STORE, MVT::v3f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32); + setOperationAction(ISD::STORE, MVT::v4f32, Promote); AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); @@ -325,12 +331,16 @@ // Expand to fneg + fadd. setOperationAction(ISD::FSUB, MVT::f64, Expand); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v3i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v3f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); @@ -392,7 +402,7 @@ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); static const MVT::SimpleValueType VectorIntTypes[] = { - MVT::v2i32, MVT::v4i32 + MVT::v2i32, MVT::v3i32, MVT::v4i32 }; for (MVT VT : VectorIntTypes) { @@ -434,7 +444,7 @@ } static const MVT::SimpleValueType FloatVectorTypes[] = { - MVT::v2f32, MVT::v4f32 + MVT::v2f32, MVT::v3f32, MVT::v4f32 }; for (MVT VT : FloatVectorTypes) { @@ -476,6 +486,9 @@ setOperationAction(ISD::SELECT, MVT::v2f32, Promote); AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32); + setOperationAction(ISD::SELECT, MVT::v3f32, Promote); + AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32); + setOperationAction(ISD::SELECT, MVT::v4f32, Promote); AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32); @@ -1372,6 +1385,41 @@ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); } +// Split a vector type into two parts. The first part is a power of two vector. +// The second part is whatever is left over, and is a scalar if it would +// otherwise be a 1-vector. +std::pair +AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const { + EVT LoVT, HiVT; + EVT EltVT = VT.getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2); + LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts); + HiVT = NumElts - LoNumElts == 1 + ? EltVT + : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts); + return std::make_pair(LoVT, HiVT); +} + +// Split a vector value into two parts of types LoVT and HiVT. HiVT could be +// scalar. 
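Concretely, the split that getSplitDestVTs (implemented above) and splitVector describe rounds the low half up to a power-of-two vector and leaves the remainder for the high part, which degenerates to a scalar when only one element is left: v3i32 becomes v2i32 plus i32 instead of an even split. A minimal standalone sketch of that arithmetic, assuming C++20 std::bit_ceil as a stand-in for llvm::PowerOf2Ceil; the helper name splitCounts is illustrative and not part of the patch:

#include <bit>      // std::bit_ceil, requires C++20
#include <cassert>
#include <utility>

// Element counts for (LoVT, HiVT); a second count of 1 becomes a scalar type
// in the real getSplitDestVTs.
static std::pair<unsigned, unsigned> splitCounts(unsigned NumElts) {
  unsigned Lo = std::bit_ceil((NumElts + 1) / 2); // PowerOf2Ceil((NumElts + 1) / 2)
  return {Lo, NumElts - Lo};
}

int main() {
  assert(splitCounts(3) == std::make_pair(2u, 1u)); // v3i32 -> v2i32 + i32
  assert(splitCounts(4) == std::make_pair(2u, 2u)); // power-of-two: even split
  assert(splitCounts(8) == std::make_pair(4u, 4u));
}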
+std::pair +AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL, + const EVT &LoVT, const EVT &HiVT, + SelectionDAG &DAG) const { + assert(LoVT.getVectorNumElements() + + (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <= + N.getValueType().getVectorNumElements() && + "More vector elements requested than available!"); + auto IdxTy = getVectorIdxTy(DAG.getDataLayout()); + SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N, + DAG.getConstant(0, DL, IdxTy)); + SDValue Hi = DAG.getNode( + HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL, + HiVT, N, DAG.getConstant(LoVT.getVectorNumElements(), DL, IdxTy)); + return std::make_pair(Lo, Hi); +} + SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, SelectionDAG &DAG) const { LoadSDNode *Load = cast(Op); @@ -1393,9 +1441,9 @@ EVT LoMemVT, HiMemVT; SDValue Lo, Hi; - std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); - std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); - std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT); + std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG); + std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG); + std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG); unsigned Size = LoMemVT.getStoreSize(); unsigned BaseAlign = Load->getAlignment(); @@ -1410,15 +1458,52 @@ HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, HiAlign, Load->getMemOperand()->getFlags()); - SDValue Ops[] = { - DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad), - DAG.getNode(ISD::TokenFactor, SL, MVT::Other, - LoLoad.getValue(1), HiLoad.getValue(1)) - }; + auto IdxTy = getVectorIdxTy(DAG.getDataLayout()); + SDValue Join; + if (LoVT == HiVT) { + // This is the case that the vector is power of two so was evenly split. + Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad); + } else { + Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad, + DAG.getConstant(0, SL, IdxTy)); + Join = DAG.getNode(HiVT.isVector() ? ISD::INSERT_SUBVECTOR + : ISD::INSERT_VECTOR_ELT, + SL, VT, Join, HiLoad, + DAG.getConstant(LoVT.getVectorNumElements(), SL, IdxTy)); + } + + SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other, + LoLoad.getValue(1), HiLoad.getValue(1))}; return DAG.getMergeValues(Ops, SL); } +// Widen a vector load from vec3 to vec4. 
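With an uneven split, SplitVectorLoad above can no longer glue the halves back together with CONCAT_VECTORS, so it starts from an UNDEF of the full type, inserts the low subvector at index 0, and inserts the scalar high part at index LoVT.getVectorNumElements(); the patch later marks INSERT_SUBVECTOR on the v3/v4 types as Expand so these nodes legalize. A value-level model of that rejoin for a v3 load (plain C++, not the SelectionDAG API; values are illustrative):

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<uint32_t, 2> Lo{1, 2};  // LoLoad: the v2i32 half
  uint32_t Hi = 3;                   // HiLoad: the leftover scalar i32

  std::array<uint32_t, 3> Join{};    // DAG.getUNDEF(v3i32)
  Join[0] = Lo[0];                   // INSERT_SUBVECTOR Join, Lo, idx 0
  Join[1] = Lo[1];
  Join[2] = Hi;                      // INSERT_VECTOR_ELT Join, Hi, idx 2

  assert((Join == std::array<uint32_t, 3>{1, 2, 3}));
}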
+SDValue AMDGPUTargetLowering::WidenVectorLoad(SDValue Op, + SelectionDAG &DAG) const { + LoadSDNode *Load = cast(Op); + EVT VT = Op.getValueType(); + assert(VT.getVectorNumElements() == 3); + SDValue BasePtr = Load->getBasePtr(); + EVT MemVT = Load->getMemoryVT(); + SDLoc SL(Op); + const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo(); + unsigned BaseAlign = Load->getAlignment(); + + EVT WideVT = + EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4); + EVT WideMemVT = + EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4); + SDValue WideLoad = DAG.getExtLoad( + Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue, + WideMemVT, BaseAlign, Load->getMemOperand()->getFlags()); + return DAG.getMergeValues( + {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad, + DAG.getConstant(0, SL, getVectorIdxTy(DAG.getDataLayout()))), + WideLoad.getValue(1)}, + SL); +} + SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, SelectionDAG &DAG) const { StoreSDNode *Store = cast(Op); @@ -1439,9 +1524,9 @@ EVT LoMemVT, HiMemVT; SDValue Lo, Hi; - std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); - std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); - std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT); + std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG); + std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG); + std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG); SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize()); Index: llvm/trunk/lib/Target/AMDGPU/BUFInstructions.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/BUFInstructions.td +++ llvm/trunk/lib/Target/AMDGPU/BUFInstructions.td @@ -821,7 +821,7 @@ "buffer_load_dwordx2", VReg_64, v2i32, mubuf_load >; defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads < - "buffer_load_dwordx3", VReg_96, untyped, mubuf_load + "buffer_load_dwordx3", VReg_96, v3i32, mubuf_load >; defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads < "buffer_load_dwordx4", VReg_128, v4i32, mubuf_load @@ -855,7 +855,7 @@ "buffer_store_dwordx2", VReg_64, v2i32, store_global >; defm BUFFER_STORE_DWORDX3 : MUBUF_Pseudo_Stores < - "buffer_store_dwordx3", VReg_96, untyped, store_global + "buffer_store_dwordx3", VReg_96, v3i32, store_global >; defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores < "buffer_store_dwordx4", VReg_128, v4i32, store_global @@ -1045,9 +1045,6 @@ //===----------------------------------------------------------------------===// // Instruction definitions for CI and newer. 
//===----------------------------------------------------------------------===// -// Remaining instructions: -// BUFFER_LOAD_DWORDX3 -// BUFFER_STORE_DWORDX3 def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol", int_amdgcn_buffer_wbinvl1_vol>; @@ -1405,6 +1402,7 @@ defm : MUBUFScratchLoadPat ; defm : MUBUFScratchLoadPat ; defm : MUBUFScratchLoadPat ; +defm : MUBUFScratchLoadPat ; defm : MUBUFScratchLoadPat ; let OtherPredicates = [D16PreservesUnusedBits] in { @@ -1477,6 +1475,7 @@ defm : MUBUFScratchStorePat ; defm : MUBUFScratchStorePat ; defm : MUBUFScratchStorePat ; +defm : MUBUFScratchStorePat ; defm : MUBUFScratchStorePat ; Index: llvm/trunk/lib/Target/AMDGPU/FLATInstructions.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/FLATInstructions.td +++ llvm/trunk/lib/Target/AMDGPU/FLATInstructions.td @@ -730,6 +730,7 @@ def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadPat ; +def : FlatLoadPat ; def : FlatLoadPat ; def : FlatLoadAtomicPat ; @@ -739,6 +740,7 @@ def : FlatStorePat ; def : FlatStorePat ; def : FlatStorePat ; +def : FlatStorePat ; def : FlatStorePat ; def : FlatStoreAtomicPat ; @@ -808,6 +810,7 @@ def : FlatLoadSignedPat ; def : FlatLoadSignedPat ; +def : FlatLoadSignedPat ; def : FlatLoadSignedPat ; def : FlatLoadAtomicPat ; @@ -819,6 +822,7 @@ def : FlatStoreSignedPat ; def : FlatStoreSignedPat ; def : FlatStoreSignedPat ; +def : FlatStoreSignedPat ; def : FlatStoreSignedPat ; let OtherPredicates = [D16PreservesUnusedBits] in { Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp @@ -123,6 +123,9 @@ addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); + addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass); + addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass); + addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass); addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass); @@ -150,6 +153,7 @@ // We need to custom lower vector stores from local memory setOperationAction(ISD::LOAD, MVT::v2i32, Custom); + setOperationAction(ISD::LOAD, MVT::v3i32, Custom); setOperationAction(ISD::LOAD, MVT::v4i32, Custom); setOperationAction(ISD::LOAD, MVT::v8i32, Custom); setOperationAction(ISD::LOAD, MVT::v16i32, Custom); @@ -157,6 +161,7 @@ setOperationAction(ISD::LOAD, MVT::v32i32, Custom); setOperationAction(ISD::STORE, MVT::v2i32, Custom); + setOperationAction(ISD::STORE, MVT::v3i32, Custom); setOperationAction(ISD::STORE, MVT::v4i32, Custom); setOperationAction(ISD::STORE, MVT::v8i32, Custom); setOperationAction(ISD::STORE, MVT::v16i32, Custom); @@ -325,6 +330,12 @@ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom); + // Deal with vec3 vector operations when widened to vec4. 
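WidenVectorLoad, added earlier in this patch in AMDGPUISelLowering, is where that widening happens: when a v3 load cannot be selected directly (for example on SI, which has no dwordx3 instructions), the load is issued as a v4 load of the same extension kind and only the first three lanes are handed back through EXTRACT_SUBVECTOR. A rough value-level model of that shape (plain C++; the buffer contents and names are illustrative):

#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // Memory backing a <3 x i32>; the widened dwordx4 access also reads the
  // dword that follows it.
  const uint32_t Mem[4] = {10, 20, 30, 0xdeadbeef};

  std::array<uint32_t, 4> Wide{};               // the widened v4i32 load
  std::memcpy(Wide.data(), Mem, sizeof(Wide));  // one 16-byte access

  // EXTRACT_SUBVECTOR Wide, 0: only the v3 part reaches the original user.
  std::array<uint32_t, 3> V3{Wide[0], Wide[1], Wide[2]};
  assert((V3 == std::array<uint32_t, 3>{10, 20, 30}));
}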
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3i32, Expand); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3f32, Expand); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Expand); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Expand); + // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, // and output demarshalling setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); @@ -1328,6 +1339,17 @@ const SDLoc &SL, SDValue Val, bool Signed, const ISD::InputArg *Arg) const { + // First, if it is a widened vector, narrow it. + if (VT.isVector() && + VT.getVectorNumElements() != MemVT.getVectorNumElements()) { + EVT NarrowedVT = + EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), + VT.getVectorNumElements()); + Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val, + DAG.getConstant(0, SL, MVT::i32)); + } + + // Then convert the vector elements or scalar value. if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) { unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext; @@ -6546,8 +6568,25 @@ SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr, RealMemVT, MMO); + if (!MemVT.isVector()) { + SDValue Ops[] = { + DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD), + NewLD.getValue(1) + }; + + return DAG.getMergeValues(Ops, DL); + } + + SmallVector Elts; + for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) { + SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD, + DAG.getConstant(I, DL, MVT::i32)); + + Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt)); + } + SDValue Ops[] = { - DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD), + DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1) }; @@ -6581,8 +6620,13 @@ if (AS == AMDGPUAS::CONSTANT_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { - if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) - return SDValue(); + if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) { + if (MemVT.isPow2VectorType()) + return SDValue(); + if (NumElements == 3) + return WidenVectorLoad(Op, DAG); + return SplitVectorLoad(Op, DAG); + } // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private // loads. @@ -6594,8 +6638,13 @@ AS == AMDGPUAS::GLOBAL_ADDRESS) { if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() && !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) && - Alignment >= 4 && NumElements < 32) - return SDValue(); + Alignment >= 4 && NumElements < 32) { + if (MemVT.isPow2VectorType()) + return SDValue(); + if (NumElements == 3) + return WidenVectorLoad(Op, DAG); + return SplitVectorLoad(Op, DAG); + } // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private // loads. @@ -6607,7 +6656,10 @@ AS == AMDGPUAS::FLAT_ADDRESS) { if (NumElements > 4) return SplitVectorLoad(Op, DAG); - // v4 loads are supported for private and global memory. + // v3 loads not supported on SI. + if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) + return WidenVectorLoad(Op, DAG); + // v3 and v4 loads are supported for private and global memory. return SDValue(); } if (AS == AMDGPUAS::PRIVATE_ADDRESS) { @@ -6625,6 +6677,9 @@ // Same as global/flat if (NumElements > 4) return SplitVectorLoad(Op, DAG); + // v3 loads not supported on SI. 
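Stepping back to the i1 handling added earlier in this same function: the i1-vector branch rebuilds each lane of a small boolean vector from the single extending dword load, where lane I is bit I of the loaded word, recovered with an SRL by I followed by a TRUNCATE to i1. The same arithmetic in plain C++ (illustrative sketch, not the DAG API; the function name is not from the patch):

#include <cassert>
#include <cstdint>
#include <vector>

// Rebuild an <N x i1> value from one extending dword load: element I is
// (Word >> I) & 1, i.e. SRL by I followed by TRUNCATE to i1.
static std::vector<bool> unpackI1(uint32_t Word, unsigned N) {
  std::vector<bool> Elts;
  for (unsigned I = 0; I != N; ++I)
    Elts.push_back((Word >> I) & 1u);
  return Elts;
}

int main() {
  // 0b0101 -> <4 x i1> {1, 0, 1, 0}
  assert((unpackI1(0b0101u, 4) == std::vector<bool>{true, false, true, false}));
}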
+ if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) + return WidenVectorLoad(Op, DAG); return SDValue(); default: llvm_unreachable("unsupported private_element_size"); @@ -7026,6 +7081,9 @@ AS == AMDGPUAS::FLAT_ADDRESS) { if (NumElements > 4) return SplitVectorStore(Op, DAG); + // v3 stores not supported on SI. + if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) + return SplitVectorStore(Op, DAG); return SDValue(); } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) { switch (Subtarget->getMaxPrivateElementSize()) { @@ -7036,7 +7094,7 @@ return SplitVectorStore(Op, DAG); return SDValue(); case 16: - if (NumElements > 4) + if (NumElements > 4 || NumElements == 3) return SplitVectorStore(Op, DAG); return SDValue(); default: @@ -7045,7 +7103,7 @@ } else if (AS == AMDGPUAS::LOCAL_ADDRESS) { // Use ds_write_b128 if possible. if (Subtarget->useDS128() && Store->getAlignment() >= 16 && - VT.getStoreSize() == 16) + VT.getStoreSize() == 16 && NumElements != 3) return SDValue(); if (NumElements > 2) @@ -9624,6 +9682,9 @@ case 64: RC = &AMDGPU::SGPR_64RegClass; break; + case 96: + RC = &AMDGPU::SReg_96RegClass; + break; case 128: RC = &AMDGPU::SReg_128RegClass; break; Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -562,7 +562,8 @@ unsigned EltSize = 4; unsigned Opcode = AMDGPU::V_MOV_B32_e32; if (RI.isSGPRClass(RC)) { - if (RI.getRegSizeInBits(*RC) > 32) { + // TODO: Copy vec3/vec5 with s_mov_b64s then final s_mov_b32. + if (!(RI.getRegSizeInBits(*RC) % 64)) { Opcode = AMDGPU::S_MOV_B64; EltSize = 8; } else { @@ -840,6 +841,8 @@ return AMDGPU::SI_SPILL_S32_SAVE; case 8: return AMDGPU::SI_SPILL_S64_SAVE; + case 12: + return AMDGPU::SI_SPILL_S96_SAVE; case 16: return AMDGPU::SI_SPILL_S128_SAVE; case 32: @@ -942,6 +945,8 @@ return AMDGPU::SI_SPILL_S32_RESTORE; case 8: return AMDGPU::SI_SPILL_S64_RESTORE; + case 12: + return AMDGPU::SI_SPILL_S96_RESTORE; case 16: return AMDGPU::SI_SPILL_S128_RESTORE; case 32: @@ -1916,14 +1921,18 @@ const int16_t *SubIndices = Sub0_15; int NElts = DstSize / 32; - // 64-bit select is only avaialble for SALU. + // 64-bit select is only available for SALU. + // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit. if (Pred == SCC_TRUE) { - SelOp = AMDGPU::S_CSELECT_B64; - EltRC = &AMDGPU::SGPR_64RegClass; - SubIndices = Sub0_15_64; - - assert(NElts % 2 == 0); - NElts /= 2; + if (NElts % 2) { + SelOp = AMDGPU::S_CSELECT_B32; + EltRC = &AMDGPU::SGPR_32RegClass; + } else { + SelOp = AMDGPU::S_CSELECT_B64; + EltRC = &AMDGPU::SGPR_64RegClass; + SubIndices = Sub0_15_64; + NElts /= 2; + } } MachineInstrBuilder MIB = BuildMI( Index: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td +++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td @@ -479,6 +479,7 @@ // SI_SPILL_32_* instructions. 
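The SIInstrInfo changes above follow from 96 not being a multiple of 64: SGPR copies only switch to S_MOV_B64 when the register class size divides evenly by 64 (so an SGPR_96 copy stays 3x 32-bit moves for now, per the TODO), and SCC-based selects fall back to one S_CSELECT_B32 per dword when the element count is odd. A small sketch of those two decisions (function names are illustrative, and the 32-bit fallback opcode is inferred from the surrounding code):

#include <cassert>
#include <cstring>

// copyPhysReg: 64-bit SGPR moves only when the class size is a multiple of 64.
static unsigned sgprCopyEltBits(unsigned RegSizeInBits) {
  return RegSizeInBits % 64 == 0 ? 64u : 32u;
}

// insertSelect with an SCC predicate: odd dword counts (v3i32) cannot be
// paired into 64-bit cselects.
static const char *sccSelectOpcode(unsigned NumDwords) {
  return NumDwords % 2 == 0 ? "s_cselect_b64" : "s_cselect_b32";
}

int main() {
  assert(sgprCopyEltBits(96) == 32);   // SGPR_96: three s_mov_b32
  assert(sgprCopyEltBits(128) == 64);  // SReg_128: two s_mov_b64
  assert(std::strcmp(sccSelectOpcode(3), "s_cselect_b32") == 0);
  assert(std::strcmp(sccSelectOpcode(4), "s_cselect_b64") == 0);
}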
defm SI_SPILL_S32 : SI_SPILL_SGPR ; defm SI_SPILL_S64 : SI_SPILL_SGPR ; +defm SI_SPILL_S96 : SI_SPILL_SGPR ; defm SI_SPILL_S128 : SI_SPILL_SGPR ; defm SI_SPILL_S256 : SI_SPILL_SGPR ; defm SI_SPILL_S512 : SI_SPILL_SGPR ; @@ -738,6 +739,22 @@ >; } +foreach Index = 0-2 in { + def Extract_Element_v3i32_#Index : Extract_Element < + i32, v3i32, Index, !cast(sub#Index) + >; + def Insert_Element_v3i32_#Index : Insert_Element < + i32, v3i32, Index, !cast(sub#Index) + >; + + def Extract_Element_v3f32_#Index : Extract_Element < + f32, v3f32, Index, !cast(sub#Index) + >; + def Insert_Element_v3f32_#Index : Insert_Element < + f32, v3f32, Index, !cast(sub#Index) + >; +} + foreach Index = 0-3 in { def Extract_Element_v4i32_#Index : Extract_Element < i32, v4i32, Index, !cast(sub#Index) @@ -869,6 +886,10 @@ def : BitConvert ; def : BitConvert ; +// 96-bit bitcast +def : BitConvert ; +def : BitConvert ; + // 128-bit bitcast def : BitConvert ; def : BitConvert ; Index: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -415,6 +415,8 @@ case AMDGPU::SI_SPILL_V128_SAVE: case AMDGPU::SI_SPILL_V128_RESTORE: return 4; + case AMDGPU::SI_SPILL_S96_SAVE: + case AMDGPU::SI_SPILL_S96_RESTORE: case AMDGPU::SI_SPILL_V96_SAVE: case AMDGPU::SI_SPILL_V96_RESTORE: return 3; @@ -978,12 +980,14 @@ case AMDGPU::SI_SPILL_S512_SAVE: case AMDGPU::SI_SPILL_S256_SAVE: case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S96_SAVE: case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S32_SAVE: return spillSGPR(MI, FI, RS, true); case AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_S256_RESTORE: case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_S96_RESTORE: case AMDGPU::SI_SPILL_S64_RESTORE: case AMDGPU::SI_SPILL_S32_RESTORE: return restoreSGPR(MI, FI, RS, true); @@ -1012,6 +1016,7 @@ case AMDGPU::SI_SPILL_S512_SAVE: case AMDGPU::SI_SPILL_S256_SAVE: case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S96_SAVE: case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S32_SAVE: { spillSGPR(MI, Index, RS); @@ -1022,6 +1027,7 @@ case AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_S256_RESTORE: case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_S96_RESTORE: case AMDGPU::SI_SPILL_S64_RESTORE: case AMDGPU::SI_SPILL_S32_RESTORE: { restoreSGPR(MI, Index, RS); @@ -1242,6 +1248,7 @@ &AMDGPU::VReg_64RegClass, &AMDGPU::SReg_64RegClass, &AMDGPU::VReg_96RegClass, + &AMDGPU::SReg_96RegClass, &AMDGPU::VReg_128RegClass, &AMDGPU::SReg_128RegClass, &AMDGPU::VReg_256RegClass, @@ -1312,6 +1319,8 @@ return &AMDGPU::SGPR_32RegClass; case 64: return &AMDGPU::SReg_64RegClass; + case 96: + return &AMDGPU::SReg_96RegClass; case 128: return &AMDGPU::SReg_128RegClass; case 256: @@ -1336,6 +1345,8 @@ return &AMDGPU::SGPR_32RegClass; case 2: return &AMDGPU::SReg_64RegClass; + case 3: + return &AMDGPU::SReg_96RegClass; case 4: return &AMDGPU::SReg_128RegClass; case 8: @@ -1603,7 +1614,7 @@ &AMDGPU::SReg_64_XEXECRegClass; case 96: return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass : - nullptr; + &AMDGPU::SReg_96RegClass; case 128: return RB->getID() == AMDGPU::VGPRRegBankID ? 
&AMDGPU::VReg_128RegClass : &AMDGPU::SReg_128RegClass; Index: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td +++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td @@ -177,6 +177,12 @@ [(add (decimate SGPR_32, 2)), (add (decimate (shl SGPR_32, 1), 2))]>; +// SGPR 96-bit registers. No operations use these, but for symmetry with 96-bit VGPRs. +def SGPR_96Regs : RegisterTuples.ret, + [(add (decimate SGPR_32, 3)), + (add (decimate (shl SGPR_32, 1), 3)), + (add (decimate (shl SGPR_32, 2), 3))]>; + // SGPR 128-bit registers def SGPR_128Regs : RegisterTuples.ret, [(add (decimate SGPR_32, 4)), @@ -424,23 +430,23 @@ (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID)> { - let AllocationPriority = 7; + let AllocationPriority = 8; } def SReg_32_XEXEC_HI : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> { - let AllocationPriority = 7; + let AllocationPriority = 8; } def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> { - let AllocationPriority = 7; + let AllocationPriority = 8; } // Register class for all scalar registers (SGPRs + Special Registers) def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> { - let AllocationPriority = 7; + let AllocationPriority = 8; } def SRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, @@ -450,7 +456,7 @@ def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> { let CopyCost = 1; - let AllocationPriority = 8; + let AllocationPriority = 9; } def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add TTMP_64Regs)> { @@ -460,35 +466,47 @@ def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> { let CopyCost = 1; - let AllocationPriority = 8; + let AllocationPriority = 9; } def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, (add SReg_64_XEXEC, EXEC)> { let CopyCost = 1; - let AllocationPriority = 8; + let AllocationPriority = 9; } // Requires 2 s_mov_b64 to copy let CopyCost = 2 in { -def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, (add SGPR_128Regs)> { +// There are no 3-component scalar instructions, but this is needed +// for symmetry with VGPRs. 
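The SGPR_96Regs tuple definition above combines three interleaved, decimated copies of SGPR_32, which yields one 96-bit register for every third scalar SGPR: s[0:2], s[3:5], s[6:8], and so on. A quick enumeration of the names that pattern produces (illustrative only; the real upper bound comes from the SGPR_32 class, not the constant below):

#include <cstdio>

int main() {
  // (decimate SGPR_32, 3)          -> s0, s3, s6, ...  (sub0 of each tuple)
  // (decimate (shl SGPR_32, 1), 3) -> s1, s4, s7, ...  (sub1)
  // (decimate (shl SGPR_32, 2), 3) -> s2, s5, s8, ...  (sub2)
  const unsigned NumSGPRs = 104; // illustrative count of 32-bit SGPRs
  for (unsigned Base = 0; Base + 2 < NumSGPRs; Base += 3)
    std::printf("s[%u:%u]\n", Base, Base + 2);
}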
+def SGPR_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, + (add SGPR_96Regs)> { let AllocationPriority = 10; } +def SReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, + (add SGPR_96)> { + let AllocationPriority = 10; +} + +def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, (add SGPR_128Regs)> { + let AllocationPriority = 11; +} + def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, (add TTMP_128Regs)> { let isAllocatable = 0; } def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add SGPR_128, TTMP_128)> { - let AllocationPriority = 10; + let AllocationPriority = 11; } } // End CopyCost = 2 def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs)> { - let AllocationPriority = 11; + let AllocationPriority = 13; } def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add TTMP_256Regs)> { @@ -499,11 +517,11 @@ (add SGPR_256, TTMP_256)> { // Requires 4 s_mov_b64 to copy let CopyCost = 4; - let AllocationPriority = 11; + let AllocationPriority = 13; } def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add SGPR_512Regs)> { - let AllocationPriority = 12; + let AllocationPriority = 14; } def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add TTMP_512Regs)> { @@ -514,7 +532,7 @@ (add SGPR_512, TTMP_512)> { // Requires 8 s_mov_b64 to copy let CopyCost = 8; - let AllocationPriority = 12; + let AllocationPriority = 14; } def VRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, @@ -531,7 +549,7 @@ let AllocationPriority = 2; } -def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> { +def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96)> { let Size = 96; // Requires 3 v_mov_b32 to copy @@ -550,13 +568,13 @@ def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add VGPR_256)> { let Size = 256; let CopyCost = 8; - let AllocationPriority = 5; + let AllocationPriority = 6; } def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> { let Size = 512; let CopyCost = 16; - let AllocationPriority = 6; + let AllocationPriority = 7; } def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> { Index: llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -814,6 +814,8 @@ case AMDGPU::VReg_64RegClassID: case AMDGPU::SReg_64_XEXECRegClassID: return 64; + case AMDGPU::SGPR_96RegClassID: + case AMDGPU::SReg_96RegClassID: case AMDGPU::VReg_96RegClassID: return 96; case AMDGPU::SGPR_128RegClassID: Index: llvm/trunk/test/CodeGen/AMDGPU/call-return-types.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/call-return-types.ll +++ llvm/trunk/test/CodeGen/AMDGPU/call-return-types.ll @@ -174,10 +174,8 @@ ; GCN-LABEL: {{^}}test_call_external_v3i32_func_void: ; GCN: s_swappc -; GFX7-DAG: flat_store_dwordx2 {{.*}}, v[0:1] -; GFX7-DAG: flat_store_dword {{.*}}, v2 -; GFX89-DAG: buffer_store_dwordx2 v[0:1] -; GFX89-DAG: buffer_store_dword v2 +; GFX7-DAG: flat_store_dwordx3 {{.*}}, v[0:2] +; GFX89-DAG: buffer_store_dwordx3 v[0:2] define amdgpu_kernel void @test_call_external_v3i32_func_void() #0 { %val = call <3 x i32> @external_v3i32_func_void() store volatile <3 x i32> %val, <3 x i32> addrspace(1)* undef, align 8 @@ -254,10 +252,8 @@ ; GCN-LABEL: {{^}}test_call_external_v3f32_func_void: ; GCN: s_swappc -; 
GFX7-DAG: flat_store_dwordx2 {{.*}}, v[0:1] -; GFX7-DAG: flat_store_dword {{.*}}, v2 -; GFX89-DAG: buffer_store_dwordx2 v[0:1] -; GFX89-DAG: buffer_store_dword v2 +; GFX7-DAG: flat_store_dwordx3 {{.*}}, v[0:2] +; GFX89-DAG: buffer_store_dwordx3 v[0:2] define amdgpu_kernel void @test_call_external_v3f32_func_void() #0 { %val = call <3 x float> @external_v3f32_func_void() store volatile <3 x float> %val, <3 x float> addrspace(1)* undef Index: llvm/trunk/test/CodeGen/AMDGPU/early-if-convert-cost.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/early-if-convert-cost.ll +++ llvm/trunk/test/CodeGen/AMDGPU/early-if-convert-cost.ll @@ -1,4 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCNX3 %s ; FIXME: Most of these cases that don't trigger because of broken cost ; heuristics. Should not need -stress-early-ifcvt @@ -60,8 +61,9 @@ ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc -; GCN-DAG: buffer_store_dword v -; GCN-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dword v +; GCNX3: buffer_store_dwordx3 define amdgpu_kernel void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 { entry: %v = load <3 x i32>, <3 x i32> addrspace(1)* %in Index: llvm/trunk/test/CodeGen/AMDGPU/early-if-convert.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/early-if-convert.ll +++ llvm/trunk/test/CodeGen/AMDGPU/early-if-convert.ll @@ -316,10 +316,10 @@ ; GCN: s_add_i32 ; GCN: s_add_i32 ; GCN: s_add_i32 -; GCN: s_add_i32 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 -; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} -; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +; GCN-NEXT: s_cselect_b32 s +; GCN-NEXT: s_cselect_b32 s +; GCN-NEXT: s_cselect_b32 s define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(4)* %in, i32 %cond) #0 { entry: %v = load <3 x i32>, <3 x i32> addrspace(4)* %in Index: llvm/trunk/test/CodeGen/AMDGPU/function-args.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/function-args.ll +++ llvm/trunk/test/CodeGen/AMDGPU/function-args.ll @@ -161,8 +161,7 @@ } ; GCN-LABEL: {{^}}void_func_v3i32: -; GCN-DAG: buffer_store_dword v2, off -; GCN-DAG: buffer_store_dwordx2 v[0:1], off +; GCN-DAG: buffer_store_dwordx3 v[0:2], off define void @void_func_v3i32(<3 x i32> %arg0) #0 { store <3 x i32> %arg0, <3 x i32> addrspace(1)* undef ret void @@ -356,8 +355,7 @@ } ; GCN-LABEL: {{^}}void_func_v3f32: -; GCN-DAG: buffer_store_dword v2, off -; GCN-DAG: buffer_store_dwordx2 v[0:1], off +; GCN-DAG: buffer_store_dwordx3 v[0:2], off define void @void_func_v3f32(<3 x float> %arg0) #0 { store <3 x float> %arg0, <3 x float> addrspace(1)* undef ret void Index: 
llvm/trunk/test/CodeGen/AMDGPU/function-returns.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/function-returns.ll +++ llvm/trunk/test/CodeGen/AMDGPU/function-returns.ll @@ -140,7 +140,7 @@ } ; GCN-LABEL: {{^}}v3i32_func_void: -; GCN: buffer_load_dwordx4 v[0:3], off +; GCN: buffer_load_dwordx3 v[0:2], off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <3 x i32> @v3i32_func_void() #0 { Index: llvm/trunk/test/CodeGen/AMDGPU/half.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/half.ll +++ llvm/trunk/test/CodeGen/AMDGPU/half.ll @@ -78,14 +78,13 @@ ; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg: ; GCN: s_load_dwordx2 s -; GCN: s_load_dwordx2 s -; GCN-NOT: _load ; GCN: v_cvt_f32_f16_e32 ; GCN: v_cvt_f32_f16_e32 +; GCN: s_load_dwordx2 s +; GCN-NOT: _load ; GCN: v_cvt_f32_f16_e32 ; GCN-NOT: v_cvt_f32_f16 -; GCN-DAG: _store_dword -; GCN-DAG: _store_dwordx2 +; GCN-DAG: _store_dwordx3 ; GCN: s_endpgm define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 { %ext = fpext <3 x half> %arg to <3 x float> @@ -472,7 +471,7 @@ } ; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16: -; GCN: flat_load_dwordx4 +; GCN: flat_load_dwordx3 ; GCN-DAG: v_cvt_f16_f32_e32 ; SI-DAG: v_cvt_f16_f32_e32 ; VI-DAG: v_cvt_f16_f32_sdwa Index: llvm/trunk/test/CodeGen/AMDGPU/idot4u.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/idot4u.ll +++ llvm/trunk/test/CodeGen/AMDGPU/idot4u.ll @@ -1778,29 +1778,29 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s8, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshr_b32 s2, s0, 24 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24 -; GFX8-NEXT: s_lshr_b32 s4, s2, 24 -; GFX8-NEXT: s_and_b32 s6, s1, s0 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX8-NEXT: s_and_b32 s0, s2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX8-NEXT: s_and_b32 s7, s1, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: v_mul_u32_u24_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80010 -; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s4 -; GFX8-NEXT: v_mov_b32_e32 v7, s3 -; GFX8-NEXT: v_mul_u32_u24_e32 v4, s6, v4 -; GFX8-NEXT: v_mul_u32_u24_e32 v5, s5, v5 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80010 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_and_b32 s5, s0, s8 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mul_u32_u24_e32 v4, s5, v4 +; GFX8-NEXT: v_mul_u32_u24_e32 v5, s4, v5 ; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD Index: llvm/trunk/test/CodeGen/AMDGPU/load-global-f32.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/load-global-f32.ll +++ llvm/trunk/test/CodeGen/AMDGPU/load-global-f32.ll @@ -1,6 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC -check-prefix=SI-NOHSA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC -check-prefix=GCNX3-HSA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC -check-prefix=GCNX3-NOHSA %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s @@ -30,8 +30,9 @@ } ; FUNC-LABEL: {{^}}global_load_v3f32: -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 +; SI-NOHSA: buffer_load_dwordx4 +; GCNX3-NOHSA: buffer_load_dwordx3 +; GCNX3-HSA: flat_load_dwordx3 ; R600: VTX_READ_128 define amdgpu_kernel void @global_load_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { Index: llvm/trunk/test/CodeGen/AMDGPU/load-global-i32.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/load-global-i32.ll +++ llvm/trunk/test/CodeGen/AMDGPU/load-global-i32.ll @@ -1,8 +1,8 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI-NOHSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=GCNX3-HSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga 
-mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=GCNX3-NOHSA -check-prefix=FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=GCNX3-HSA -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}global_load_i32: @@ -30,8 +30,9 @@ } ; FUNC-LABEL: {{^}}global_load_v3i32: -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-HSA: {{flat|global}}_load_dwordx4 +; SI-NOHSA: buffer_load_dwordx4 +; GCNX3-NOHSA: buffer_load_dwordx3 +; GCNX3-HSA: {{flat|global}}_load_dwordx3 ; EG: VTX_READ_128 define amdgpu_kernel void @global_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in) #0 { Index: llvm/trunk/test/CodeGen/AMDGPU/mad-mix-lo.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ llvm/trunk/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -146,12 +146,11 @@ ; FIXME: Should be packed into 2 registers per argument? ; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_postcvt: ; GCN: s_waitcnt -; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX9-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX9-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX9-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-DAG: v_mad_mixlo_f16 v{{[0-9]+}}, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GFX9-DAG: v_mad_mixhi_f16 v{{[0-9]+}}, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX9-DAG: v_mad_mixlo_f16 v{{[0-9]+}}, v1, v3, v5 op_sel_hi:[1,1,1] +; GFX9-DAG: v_pk_max_f16 v1, v1, v1 clamp +; GFX9: v_mov_b32_e32 v0, v{{[0-9]+}} ; GFX9-NEXT: s_setpc_b64 define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 { %src0.ext = fpext <3 x half> %src0 to <3 x float> Index: llvm/trunk/test/CodeGen/AMDGPU/merge-stores.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/merge-stores.ll +++ llvm/trunk/test/CodeGen/AMDGPU/merge-stores.ll @@ -275,8 +275,7 @@ } ; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32: -; SI-DAG: buffer_load_dwordx2 -; SI-DAG: buffer_load_dword v +; SI-DAG: buffer_load_dwordx4 ; CI-DAG: buffer_load_dwordx3 ; GCN: s_waitcnt ; SI-DAG: buffer_store_dwordx2 @@ -566,8 +565,8 @@ ; GCN-LABEL: {{^}}merge_global_store_7_constants_i32: ; GCN: buffer_store_dwordx4 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dword v +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dword v ; CI: buffer_store_dwordx3 define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { store i32 34, i32 addrspace(1)* %out, align 4 @@ -615,8 +614,7 @@ ; GCN-LABEL: {{^}}copy_v3i32_align4: ; GCN-NOT: SCRATCH_RSRC_DWORD -; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, 
s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-NOT: offen ; GCN: s_waitcnt vmcnt @@ -650,8 +648,7 @@ ; GCN-LABEL: {{^}}copy_v3f32_align4: ; GCN-NOT: SCRATCH_RSRC_DWORD -; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-NOT: offen ; GCN: s_waitcnt vmcnt Index: llvm/trunk/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll +++ llvm/trunk/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll @@ -0,0 +1,216 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s + +; CHECK-LABEL: spill_v2i32: +; CHECK-DAG: buffer_store_dword v{{.*}} offset:24 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:28 ; 4-byte Folded Spill +; CHECK: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-DAG: buffer_load_dword v{{.*}} offset:24 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:28 ; 4-byte Folded Reload + +define void @spill_v2i32() { +entry: + %alloca = alloca <2 x i32>, i32 2, align 4, addrspace(5) + + %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %alloca, i32 1 + %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr + + ; Force %a to spill. + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + %outptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %alloca, i32 1 + store volatile <2 x i32> %a, <2 x i32> addrspace(5)* %outptr + + ret void +} + +; CHECK-LABEL: spill_v2f32: +; CHECK-DAG: buffer_store_dword v{{.*}} offset:24 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:28 ; 4-byte Folded Spill +; CHECK: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-DAG: buffer_load_dword v{{.*}} offset:24 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:28 ; 4-byte Folded Reload + +define void @spill_v2f32() { +entry: + %alloca = alloca <2 x i32>, i32 2, align 4, addrspace(5) + + %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %alloca, i32 1 + %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr + + ; Force %a to spill. 
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + %outptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %alloca, i32 1 + store volatile <2 x i32> %a, <2 x i32> addrspace(5)* %outptr + + ret void +} + +; CHECK-LABEL: spill_v3i32: +; CHECK-DAG: buffer_store_dword v{{.*}} offset:48 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:52 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:56 ; 4-byte Folded Spill +; CHECK: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-DAG: buffer_load_dword v{{.*}} offset:48 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:52 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:56 ; 4-byte Folded Reload + +define void @spill_v3i32() { +entry: + %alloca = alloca <3 x i32>, i32 2, align 4, addrspace(5) + + %aptr = getelementptr <3 x i32>, <3 x i32> addrspace(5)* %alloca, i32 1 + %a = load volatile <3 x i32>, <3 x i32> addrspace(5)* %aptr + + ; Force %a to spill. + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + %outptr = getelementptr <3 x i32>, <3 x i32> addrspace(5)* %alloca, i32 1 + store volatile <3 x i32> %a, <3 x i32> addrspace(5)* %outptr + + ret void +} + +; CHECK-LABEL: spill_v3f32: +; CHECK-DAG: buffer_store_dword v{{.*}} offset:48 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:52 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:56 ; 4-byte Folded Spill +; CHECK: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-DAG: buffer_load_dword v{{.*}} offset:48 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:52 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:56 ; 4-byte Folded Reload + +define void @spill_v3f32() { +entry: + %alloca = alloca <3 x i32>, i32 2, align 4, addrspace(5) + + %aptr = getelementptr <3 x i32>, <3 x i32> addrspace(5)* %alloca, i32 1 + %a = load volatile <3 x i32>, <3 x i32> addrspace(5)* %aptr + + ; Force %a to spill. + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + %outptr = getelementptr <3 x i32>, <3 x i32> addrspace(5)* %alloca, i32 1 + store volatile <3 x i32> %a, <3 x i32> addrspace(5)* %outptr + + ret void +} + +; CHECK-LABEL: spill_v4i32: +; CHECK-DAG: buffer_store_dword v{{.*}} offset:48 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:52 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:56 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:60 ; 4-byte Folded Spill +; CHECK: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-DAG: buffer_load_dword v{{.*}} offset:48 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:52 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:56 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:60 ; 4-byte Folded Reload + +define void @spill_v4i32() { +entry: + %alloca = alloca <4 x i32>, i32 2, align 4, addrspace(5) + + %aptr = getelementptr <4 x i32>, <4 x i32> addrspace(5)* %alloca, i32 1 + %a = load volatile <4 x i32>, <4 x i32> addrspace(5)* %aptr + + ; Force %a to spill. 
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + %outptr = getelementptr <4 x i32>, <4 x i32> addrspace(5)* %alloca, i32 1 + store volatile <4 x i32> %a, <4 x i32> addrspace(5)* %outptr + + ret void +} + +; CHECK-LABEL: spill_v4f32: +; CHECK-DAG: buffer_store_dword v{{.*}} offset:48 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:52 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:56 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:60 ; 4-byte Folded Spill +; CHECK: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-DAG: buffer_load_dword v{{.*}} offset:48 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:52 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:56 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:60 ; 4-byte Folded Reload + +define void @spill_v4f32() { +entry: + %alloca = alloca <4 x i32>, i32 2, align 4, addrspace(5) + + %aptr = getelementptr <4 x i32>, <4 x i32> addrspace(5)* %alloca, i32 1 + %a = load volatile <4 x i32>, <4 x i32> addrspace(5)* %aptr + + ; Force %a to spill. + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + %outptr = getelementptr <4 x i32>, <4 x i32> addrspace(5)* %alloca, i32 1 + store volatile <4 x i32> %a, <4 x i32> addrspace(5)* %outptr + + ret void +} + +; CHECK-LABEL: spill_v5i32: +; CHECK-DAG: buffer_store_dword v{{.*}} offset:96 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:100 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:104 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:108 ; 4-byte Folded Spill +; CHECK: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-DAG: buffer_load_dword v{{.*}} offset:96 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:100 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:104 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:108 ; 4-byte Folded Reload + +define void @spill_v5i32() { +entry: + %alloca = alloca <5 x i32>, i32 2, align 4, addrspace(5) + + %aptr = getelementptr <5 x i32>, <5 x i32> addrspace(5)* %alloca, i32 1 + %a = load volatile <5 x i32>, <5 x i32> addrspace(5)* %aptr + + ; Force %a to spill. + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + %outptr = getelementptr <5 x i32>, <5 x i32> addrspace(5)* %alloca, i32 1 + store volatile <5 x i32> %a, <5 x i32> addrspace(5)* %outptr + + ret void +} + +; CHECK-LABEL: spill_v5f32: +; CHECK-DAG: buffer_store_dword v{{.*}} offset:96 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:100 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:104 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:108 ; 4-byte Folded Spill +; CHECK: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-DAG: buffer_load_dword v{{.*}} offset:96 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:100 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:104 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:108 ; 4-byte Folded Reload + +define void @spill_v5f32() { +entry: + %alloca = alloca <5 x i32>, i32 2, align 4, addrspace(5) + + %aptr = getelementptr <5 x i32>, <5 x i32> addrspace(5)* %alloca, i32 1 + %a = load volatile <5 x i32>, <5 x i32> addrspace(5)* %aptr + + ; Force %a to spill. 
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + %outptr = getelementptr <5 x i32>, <5 x i32> addrspace(5)* %alloca, i32 1 + store volatile <5 x i32> %a, <5 x i32> addrspace(5)* %outptr + + ret void +} + + + Index: llvm/trunk/test/CodeGen/AMDGPU/select-vectors.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/select-vectors.ll +++ llvm/trunk/test/CodeGen/AMDGPU/select-vectors.ll @@ -112,6 +112,7 @@ ; GFX89: v_cndmask_b32_e32 ; GFX89: cndmask +; VI: cndmask ; GFX89-NOT: cndmask define amdgpu_kernel void @v_select_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr, i32 %c) #0 { %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.ptr @@ -230,6 +231,21 @@ ret void } +; GCN-LABEL: {{^}}s_select_v3f32: +; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} + +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 + +; GCN: buffer_store_dwordx +define amdgpu_kernel void @s_select_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, <3 x float> %b, i32 %c) #0 { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <3 x float> %a, <3 x float> %b + store <3 x float> %select, <3 x float> addrspace(1)* %out, align 16 + ret void +} + ; GCN-LABEL: {{^}}s_select_v4f32: ; GCN: s_load_dwordx4 ; GCN: s_load_dwordx4 Index: llvm/trunk/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll +++ llvm/trunk/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll @@ -1,4 +1,5 @@ -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx700 -verify-machineinstrs <%s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SICI %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx600 -verify-machineinstrs <%s | FileCheck -enable-var-scope -check-prefixes=GCN,SICI,SI %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx700 -verify-machineinstrs <%s | FileCheck -enable-var-scope -check-prefixes=GCN,SICI,CI %s ; Check that an addrspace(1) (const) load with various combinations of ; uniform, nonuniform and constant address components all load with an @@ -8,7 +9,8 @@ ; GCN-LABEL: {{^}}nonuniform_uniform: ; GCN-NOT: readfirstlane -; SICI: buffer_load_dwordx4 {{.*}} addr64 +; SI: buffer_load_dwordx4 {{.*}} addr64 +; CI: buffer_load_dwordx3 {{.*}} addr64 define amdgpu_ps float @nonuniform_uniform(i32 %arg18) { .entry: @@ -21,7 +23,8 @@ ; GCN-LABEL: {{^}}uniform_nonuniform: ; GCN-NOT: readfirstlane -; SICI: buffer_load_dwordx4 {{.*}} addr64 +; SI: buffer_load_dwordx4 {{.*}} addr64 +; CI: buffer_load_dwordx3 {{.*}} addr64 define amdgpu_ps float @uniform_nonuniform(i32 inreg %offset, i32 %arg18) { .entry: @@ -35,7 +38,8 @@ ; GCN-LABEL: {{^}}const_nonuniform: ; GCN-NOT: readfirstlane -; SICI: buffer_load_dwordx4 {{.*}} addr64 +; SI: buffer_load_dwordx4 {{.*}} addr64 +; CI: buffer_load_dwordx3 {{.*}} addr64 define amdgpu_ps float @const_nonuniform(i32 %arg18) { .entry: @@ -49,7 +53,8 @@ ; GCN-LABEL: {{^}}nonuniform_nonuniform: ; GCN-NOT: readfirstlane -; SICI: buffer_load_dwordx4 {{.*}} addr64 +; SI: buffer_load_dwordx4 {{.*}} addr64 +; CI: buffer_load_dwordx3 {{.*}} addr64 define amdgpu_ps float @nonuniform_nonuniform(i32 %offset, i32 %arg18) { .entry: Index: llvm/trunk/test/CodeGen/AMDGPU/sign_extend.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/sign_extend.ll +++ llvm/trunk/test/CodeGen/AMDGPU/sign_extend.ll @@ 
-37,13 +37,12 @@ define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { ; SI-LABEL: test_s_sext_i32_to_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mul_i32 s4, s4, s5 -; SI-NEXT: s_add_i32 s4, s4, s2 +; SI-NEXT: s_mul_i32 s2, s4, s5 +; SI-NEXT: s_add_i32 s4, s2, s6 ; SI-NEXT: s_ashr_i32 s5, s4, 31 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -54,16 +53,15 @@ ; VI-LABEL: test_s_sext_i32_to_i64: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s1, s2, s3 -; VI-NEXT: s_add_i32 s1, s1, s0 -; VI-NEXT: s_ashr_i32 s0, s1, 31 -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_mul_i32 s0, s0, s1 +; VI-NEXT: s_add_i32 s0, s0, s2 +; VI-NEXT: s_ashr_i32 s1, s0, 31 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm entry: @@ -292,14 +290,13 @@ ; SI-LABEL: v_sext_i1_to_i16_with_and: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dword s0, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v0 ; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -308,14 +305,13 @@ ; VI-LABEL: v_sext_i1_to_i16_with_and: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v0 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 +; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v1 ; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 Index: llvm/trunk/test/CodeGen/AMDGPU/spill-wide-sgpr.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/spill-wide-sgpr.ll +++ llvm/trunk/test/CodeGen/AMDGPU/spill-wide-sgpr.ll @@ -43,6 +43,54 @@ ret void } +; ALL-LABEL: {{^}}spill_sgpr_x3: +; SMEM: s_add_u32 m0, s3, 0x100{{$}} +; SMEM: s_buffer_store_dword s +; SMEM: s_buffer_store_dword s +; SMEM: s_buffer_store_dword s +; SMEM: s_cbranch_scc1 + +; SMEM: s_add_u32 m0, s3, 0x100{{$}} +; SMEM: s_buffer_load_dword s +; SMEM: s_buffer_load_dword s +; SMEM: s_buffer_load_dword s +; SMEM: s_dcache_wb +; SMEM: s_endpgm + +; FIXME: Should only need 4 bytes +; SMEM: 
ScratchSize: 16 + +; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0 +; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1 +; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2 +; VGPR: s_cbranch_scc1 + +; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0 +; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1 +; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 2 + + +; VMEM: buffer_store_dword +; VMEM: buffer_store_dword +; VMEM: buffer_store_dword +; VMEM: s_cbranch_scc1 + +; VMEM: buffer_load_dword +; VMEM: buffer_load_dword +; VMEM: buffer_load_dword +define amdgpu_kernel void @spill_sgpr_x3(i32 addrspace(1)* %out, i32 %in) #0 { + %wide.sgpr = call <3 x i32> asm sideeffect "; def $0", "=s" () #0 + %cmp = icmp eq i32 %in, 0 + br i1 %cmp, label %bb0, label %ret + +bb0: + call void asm sideeffect "; use $0", "s"(<3 x i32> %wide.sgpr) #0 + br label %ret + +ret: + ret void +} + ; ALL-LABEL: {{^}}spill_sgpr_x4: ; SMEM: s_add_u32 m0, s3, 0x100{{$}} ; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS:[0-9]+:[0-9]+]]{{\]}}, m0 ; 16-byte Folded Spill Index: llvm/trunk/test/CodeGen/AMDGPU/store-global.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/store-global.ll +++ llvm/trunk/test/CodeGen/AMDGPU/store-global.ll @@ -273,13 +273,12 @@ } ; FUNC-LABEL: {{^}}store_v3i32: -; SI-DAG: buffer_store_dwordx2 ; SI-DAG: buffer_store_dword v +; SI-DAG: buffer_store_dwordx2 -; VI-DAG: buffer_store_dwordx3 +; VI: buffer_store_dwordx3 -; GFX9-DAG: global_store_dwordx2 -; GFX9-DAG: global_store_dword v +; GFX9: global_store_dwordx3 ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.XY}}, {{T[0-9]+\.[XYZW]}}, Index: llvm/trunk/test/CodeGen/AMDGPU/v_mac.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/v_mac.ll +++ llvm/trunk/test/CodeGen/AMDGPU/v_mac.ll @@ -135,7 +135,7 @@ ; GCN-LABEL: {{^}}safe_mad_sub0_src0: ; GCN: v_sub_f32_e32 [[SUB0:v[0-9]+]], 0, -; GCN: v_mac_f32_e32 v{{[0-9]+}}, [[SUB0]], v{{[0-9]+}} +; GCN: v_ma{{[cd]}}_f32{{[_e32]*}} v{{[0-9]+}}, [[SUB0]], v{{[0-9]+}} define amdgpu_kernel void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 Index: llvm/trunk/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll +++ llvm/trunk/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -22,25 +22,25 @@ ; MESA-NOT: s_mov_b32 s3 ; HSA-NOT: s_mov_b32 s7 -; GCNMESA-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCNMESA-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCNMESA-DAG: s_mov_b32 s14, -1 -; SIMESA-DAG: s_mov_b32 s15, 0xe8f000 -; VIMESA-DAG: s_mov_b32 s15, 0xe80000 -; GFX9MESA-DAG: s_mov_b32 s15, 0xe00000 - - -; GCNMESAMESA: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}} ; 4-byte Folded Spill - -; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[12:15], s3 offset:{{[0-9]+}} -; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[12:15], s3 offset:{{[0-9]+}} -; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[12:15], s3 offset:{{[0-9]+}} -; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[12:15], s3 offset:{{[0-9]+}} - -; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}} -; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, 
s[12:15], s3 offset:{{[0-9]+}} -; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}} -; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}} +; GCNMESA-DAG: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCNMESA-DAG: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCNMESA-DAG: s_mov_b32 s18, -1 +; SIMESA-DAG: s_mov_b32 s19, 0xe8f000 +; VIMESA-DAG: s_mov_b32 s19, 0xe80000 +; GFX9MESA-DAG: s_mov_b32 s19, 0xe00000 + + +; GCNMESAMESA: buffer_store_dword {{v[0-9]+}}, off, s[16:19], s3 offset:{{[0-9]+}} ; 4-byte Folded Spill + +; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[16:19], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[16:19], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[16:19], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[16:19], s3 offset:{{[0-9]+}} + +; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[16:19], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[16:19], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[16:19], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[16:19], s3 offset:{{[0-9]+}}
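
For reference, below is a minimal standalone reproducer, not part of the patch itself, that exercises the store lowering the updated store-global.ll checks describe. The file and function names are made up for illustration; feeding it to llc with something like "-mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs" (or "-mcpu=tonga" for VI) should show the behavior the new checks encode: a single dwordx3 store on VI/GFX9, while SI still splits the access into a dwordx2 plus a dword store.

; v3-store-example.ll (hypothetical reproducer, not added by this patch)
; Per the updated store_v3i32 checks: VI emits buffer_store_dwordx3 and
; GFX9 emits global_store_dwordx3; SI splits into dwordx2 + dword.
define amdgpu_kernel void @store_v3i32_example(<3 x i32> addrspace(1)* %out, <3 x i32> %val) {
entry:
  store <3 x i32> %val, <3 x i32> addrspace(1)* %out, align 16
  ret void
}

The same sketch with <3 x float> operands is expected to produce identical machine code, which is why the s_select_v3f32 and spill tests above only look at the emitted cndmask/buffer instructions rather than the element type.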