Index: lib/Target/AMDGPU/AMDGPU.td =================================================================== --- lib/Target/AMDGPU/AMDGPU.td +++ lib/Target/AMDGPU/AMDGPU.td @@ -799,6 +799,9 @@ def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, AssemblerPredicate<"FeatureVOP3P">; +def NotHasVOP3PInsts : Predicate<"!Subtarget->hasVOP3PInsts()">, + AssemblerPredicate<"!FeatureVOP3P">; + def HasSDWA : Predicate<"Subtarget->hasSDWA()">, AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">; Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3006,7 +3006,7 @@ SDValue X = LHS->getOperand(0); if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 && - isTypeLegal(MVT::v2i16)) { + isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) { // Prefer build_vector as the canonical form if packed types are legal. // (shl ([asz]ext i16:x), 16 -> build_vector 0, x SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL, @@ -3817,12 +3817,13 @@ // TODO: Generalize and move to DAGCombiner SDValue Src = N->getOperand(0); if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) { - assert(Src.getValueType() == MVT::i64); - SDLoc SL(N); - uint64_t CVal = C->getZExtValue(); - return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT, - DAG.getConstant(Lo_32(CVal), SL, MVT::i32), - DAG.getConstant(Hi_32(CVal), SL, MVT::i32)); + if (Src.getValueType() == MVT::i64) { + SDLoc SL(N); + uint64_t CVal = C->getZExtValue(); + return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT, + DAG.getConstant(Lo_32(CVal), SL, MVT::i32), + DAG.getConstant(Hi_32(CVal), SL, MVT::i32)); + } } if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) { Index: lib/Target/AMDGPU/BUFInstructions.td =================================================================== --- lib/Target/AMDGPU/BUFInstructions.td +++ lib/Target/AMDGPU/BUFInstructions.td @@ -1060,14 +1060,14 @@ defm : MUBUF_LoadIntrinsicPat; let SubtargetPredicate = HasUnpackedD16VMem in { - defm : MUBUF_LoadIntrinsicPat; + defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; } // End HasUnpackedD16VMem. let SubtargetPredicate = HasPackedD16VMem in { - defm : MUBUF_LoadIntrinsicPat; - defm : MUBUF_LoadIntrinsicPat; + defm : MUBUF_LoadIntrinsicPat; + defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; } // End HasPackedD16VMem. @@ -1547,14 +1547,14 @@ defm : MTBUF_LoadIntrinsicPat; let SubtargetPredicate = HasUnpackedD16VMem in { - defm : MTBUF_LoadIntrinsicPat; + defm : MTBUF_LoadIntrinsicPat; defm : MTBUF_LoadIntrinsicPat; defm : MTBUF_LoadIntrinsicPat; } // End HasUnpackedD16VMem. let SubtargetPredicate = HasPackedD16VMem in { - defm : MTBUF_LoadIntrinsicPat; - defm : MTBUF_LoadIntrinsicPat; + defm : MTBUF_LoadIntrinsicPat; + defm : MTBUF_LoadIntrinsicPat; defm : MTBUF_LoadIntrinsicPat; defm : MTBUF_LoadIntrinsicPat; } // End HasPackedD16VMem. Index: lib/Target/AMDGPU/MIMGInstructions.td =================================================================== --- lib/Target/AMDGPU/MIMGInstructions.td +++ lib/Target/AMDGPU/MIMGInstructions.td @@ -610,10 +610,7 @@ let SubtargetPredicate = HasPackedD16VMem in { def _packed_v1 : ImageDimPattern; - // used on gfx810 - def _packed_v2 : ImageDimPattern; - // used on gfx900 - def _packed_v2_gfx9 : ImageDimPattern; + def _packed_v2 : ImageDimPattern; def _packed_v4 : ImageDimPattern; } // End HasPackedD16VMem.
} @@ -717,7 +714,7 @@ } // End HasUnpackedD16VMem. let SubtargetPredicate = HasPackedD16VMem in { - defm : ImageSampleDataPatterns(opcode # _V1), i32, "_D16">; + defm : ImageSampleDataPatterns(opcode # _V1), f16, "_D16">; defm : ImageSampleDataPatterns(opcode # _V2), v2i32, "_D16">; } // End HasPackedD16VMem. } @@ -780,7 +777,7 @@ } // End HasUnpackedD16VMem. let SubtargetPredicate = HasPackedD16VMem in { - defm : ImageLoadDataPatterns(opcode # _V1), i32, "_D16">; + defm : ImageLoadDataPatterns(opcode # _V1), f16, "_D16">; defm : ImageLoadDataPatterns(opcode # _V2), v2i32, "_D16">; } // End HasPackedD16VMem. } @@ -865,8 +862,8 @@ defm : ImageLoadAltPatterns; // Image store. -defm : ImageStorePatterns; -defm : ImageStorePatterns; +defm : ImageStorePatterns; +defm : ImageStorePatterns; defm : ImageStoreAltPatterns; defm : ImageStoreAltPatterns; Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -60,8 +60,10 @@ SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerIntrinsicWChain_IllegalReturnType(SDValue Op, SDValue &Chain, - SelectionDAG &DAG) const; + SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M, + SelectionDAG &DAG, + bool IsIntrinsic = false) const; + SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const; /// Converts \p Op, which must be of floating point type, to the Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -138,9 +138,8 @@ if (Subtarget->has16BitInsts()) { addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass); addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass); - } - if (Subtarget->hasVOP3PInsts()) { + // Unless there are also VOP3P operations, not all operations are really legal.
addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass); addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass); } @@ -173,7 +172,6 @@ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); - setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); setOperationAction(ISD::SELECT, MVT::i1, Promote); setOperationAction(ISD::SELECT, MVT::i64, Custom); @@ -422,9 +420,7 @@ setOperationAction(ISD::FMA, MVT::f16, Legal); if (!Subtarget->hasFP16Denormals()) setOperationAction(ISD::FMAD, MVT::f16, Legal); - } - if (Subtarget->hasVOP3PInsts()) { for (MVT VT : {MVT::v2i16, MVT::v2f16}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { @@ -471,11 +467,34 @@ AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32); setOperationAction(ISD::XOR, MVT::v2i16, Promote); AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32); - setOperationAction(ISD::SELECT, MVT::v2i16, Promote); - AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32); - setOperationAction(ISD::SELECT, MVT::v2f16, Promote); - AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32); + setOperationAction(ISD::LOAD, MVT::v4i16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32); + setOperationAction(ISD::LOAD, MVT::v4f16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32); + + setOperationAction(ISD::STORE, MVT::v4i16, Promote); + AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32); + setOperationAction(ISD::STORE, MVT::v4f16, Promote); + AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32); + + setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand); + setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand); + setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand); + setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); + + if (!Subtarget->hasVOP3PInsts()) { + setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); + } + + setOperationAction(ISD::FNEG, MVT::v2f16, Legal); + // This isn't really legal, but this avoids the legalizer unrolling it (and + // allows matching fneg (fabs x) patterns) + setOperationAction(ISD::FABS, MVT::v2f16, Legal); + } + + if (Subtarget->hasVOP3PInsts()) { setOperationAction(ISD::ADD, MVT::v2i16, Legal); setOperationAction(ISD::SUB, MVT::v2i16, Legal); setOperationAction(ISD::MUL, MVT::v2i16, Legal); @@ -488,25 +507,23 @@ setOperationAction(ISD::UMAX, MVT::v2i16, Legal); setOperationAction(ISD::FADD, MVT::v2f16, Legal); - setOperationAction(ISD::FNEG, MVT::v2f16, Legal); setOperationAction(ISD::FMUL, MVT::v2f16, Legal); setOperationAction(ISD::FMA, MVT::v2f16, Legal); setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal); setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal); setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal); - // This isn't really legal, but this avoids the legalizer unrolling it (and - // allows matching fneg (fabs x) patterns) - setOperationAction(ISD::FABS, MVT::v2f16, Legal); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); + } - setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand); - setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand); - setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand); - setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); + if (Subtarget->has16BitInsts()) { + setOperationAction(ISD::SELECT, MVT::v2i16, Promote); + AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32); + 
setOperationAction(ISD::SELECT, MVT::v2f16, Promote); + AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32); } else { + // Legalization hack. setOperationAction(ISD::SELECT, MVT::v2i16, Custom); setOperationAction(ISD::SELECT, MVT::v2f16, Custom); } @@ -3513,205 +3530,72 @@ return 0; } -static SDValue adjustLoadValueType(SDValue Result, EVT LoadVT, SDLoc DL, - SelectionDAG &DAG, bool Unpacked) { +static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, + const SDLoc &DL, + SelectionDAG &DAG, bool Unpacked) { + if (!LoadVT.isVector()) + return Result; + if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16. // Truncate to v2i16/v4i16. EVT IntLoadVT = LoadVT.changeTypeToInteger(); - SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntLoadVT, Result); + + // Work around the legalizer not scalarizing the truncate after vector op + // legalization by not creating an intermediate vector trunc. + SmallVector<SDValue, 4> Elts; + DAG.ExtractVectorElements(Result, Elts); + for (SDValue &Elt : Elts) + Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt); + + Result = DAG.getBuildVector(IntLoadVT, DL, Elts); + // Bitcast to original type (v2f16/v4f16). - return DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc); + return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result); } + // Cast back to the original packed type. return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result); } -// This is to lower INTRINSIC_W_CHAIN with illegal result types. -SDValue SITargetLowering::lowerIntrinsicWChain_IllegalReturnType(SDValue Op, - SDValue &Chain, SelectionDAG &DAG) const { - EVT LoadVT = Op.getValueType(); - // TODO: handle v3f16. - if (LoadVT != MVT::v2f16 && LoadVT != MVT::v4f16) - return SDValue(); +SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, + MemSDNode *M, + SelectionDAG &DAG, + bool IsIntrinsic) const { + SDLoc DL(M); + SmallVector<SDValue, 10> Ops; + Ops.reserve(M->getNumOperands()); - bool Unpacked = Subtarget->hasUnpackedD16VMem(); - EVT UnpackedLoadVT = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32; - EVT EquivLoadVT = Unpacked ? UnpackedLoadVT : - getEquivalentMemType(*DAG.getContext(), LoadVT); - // Change from v4f16/v2f16 to EquivLoadVT.
- SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other); + Ops.push_back(M->getOperand(0)); + if (IsIntrinsic) + Ops.push_back(DAG.getConstant(Opcode, DL, MVT::i32)); - SDValue Res; - SDLoc DL(Op); - MemSDNode *M = cast(Op); - unsigned IID = cast(Op.getOperand(1))->getZExtValue(); - switch (IID) { - case Intrinsic::amdgcn_tbuffer_load: { - SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // rsrc - Op.getOperand(3), // vindex - Op.getOperand(4), // voffset - Op.getOperand(5), // soffset - Op.getOperand(6), // offset - Op.getOperand(7), // dfmt - Op.getOperand(8), // nfmt - Op.getOperand(9), // glc - Op.getOperand(10) // slc - }; - Res = DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, DL, - VTList, Ops, M->getMemoryVT(), - M->getMemOperand()); - Chain = Res.getValue(1); - return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked); - } - case Intrinsic::amdgcn_buffer_load_format: { - SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // rsrc - Op.getOperand(3), // vindex - Op.getOperand(4), // offset - Op.getOperand(5), // glc - Op.getOperand(6) // slc - }; - Res = DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, - DL, VTList, Ops, M->getMemoryVT(), - M->getMemOperand()); - Chain = Res.getValue(1); - return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked); - } - case Intrinsic::amdgcn_image_load: - case Intrinsic::amdgcn_image_load_mip: { - SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // vaddr - Op.getOperand(3), // rsrc - Op.getOperand(4), // dmask - Op.getOperand(5), // glc - Op.getOperand(6), // slc - Op.getOperand(7), // lwe - Op.getOperand(8) // da - }; - unsigned Opc = getImageOpcode(IID); - Res = DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, M->getMemoryVT(), - M->getMemOperand()); - Chain = Res.getValue(1); - return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked); - } - // Basic sample. - case Intrinsic::amdgcn_image_sample: - case Intrinsic::amdgcn_image_sample_cl: - case Intrinsic::amdgcn_image_sample_d: - case Intrinsic::amdgcn_image_sample_d_cl: - case Intrinsic::amdgcn_image_sample_l: - case Intrinsic::amdgcn_image_sample_b: - case Intrinsic::amdgcn_image_sample_b_cl: - case Intrinsic::amdgcn_image_sample_lz: - case Intrinsic::amdgcn_image_sample_cd: - case Intrinsic::amdgcn_image_sample_cd_cl: - - // Sample with comparison. - case Intrinsic::amdgcn_image_sample_c: - case Intrinsic::amdgcn_image_sample_c_cl: - case Intrinsic::amdgcn_image_sample_c_d: - case Intrinsic::amdgcn_image_sample_c_d_cl: - case Intrinsic::amdgcn_image_sample_c_l: - case Intrinsic::amdgcn_image_sample_c_b: - case Intrinsic::amdgcn_image_sample_c_b_cl: - case Intrinsic::amdgcn_image_sample_c_lz: - case Intrinsic::amdgcn_image_sample_c_cd: - case Intrinsic::amdgcn_image_sample_c_cd_cl: - - // Sample with offsets. - case Intrinsic::amdgcn_image_sample_o: - case Intrinsic::amdgcn_image_sample_cl_o: - case Intrinsic::amdgcn_image_sample_d_o: - case Intrinsic::amdgcn_image_sample_d_cl_o: - case Intrinsic::amdgcn_image_sample_l_o: - case Intrinsic::amdgcn_image_sample_b_o: - case Intrinsic::amdgcn_image_sample_b_cl_o: - case Intrinsic::amdgcn_image_sample_lz_o: - case Intrinsic::amdgcn_image_sample_cd_o: - case Intrinsic::amdgcn_image_sample_cd_cl_o: + // Skip 1, as it is the intrinsic ID. + for (unsigned I = 2, E = M->getNumOperands(); I != E; ++I) + Ops.push_back(M->getOperand(I)); - // Sample with comparison and offsets. 
- case Intrinsic::amdgcn_image_sample_c_o: - case Intrinsic::amdgcn_image_sample_c_cl_o: - case Intrinsic::amdgcn_image_sample_c_d_o: - case Intrinsic::amdgcn_image_sample_c_d_cl_o: - case Intrinsic::amdgcn_image_sample_c_l_o: - case Intrinsic::amdgcn_image_sample_c_b_o: - case Intrinsic::amdgcn_image_sample_c_b_cl_o: - case Intrinsic::amdgcn_image_sample_c_lz_o: - case Intrinsic::amdgcn_image_sample_c_cd_o: - case Intrinsic::amdgcn_image_sample_c_cd_cl_o: + bool Unpacked = Subtarget->hasUnpackedD16VMem(); + EVT LoadVT = M->getValueType(0); - // Basic gather4 - case Intrinsic::amdgcn_image_gather4: - case Intrinsic::amdgcn_image_gather4_cl: - case Intrinsic::amdgcn_image_gather4_l: - case Intrinsic::amdgcn_image_gather4_b: - case Intrinsic::amdgcn_image_gather4_b_cl: - case Intrinsic::amdgcn_image_gather4_lz: + EVT UnpackedLoadVT = LoadVT.isVector() ? + EVT::getVectorVT(*DAG.getContext(), MVT::i32, + LoadVT.getVectorNumElements()) : LoadVT; + EVT EquivLoadVT = LoadVT; + if (LoadVT.isVector()) { + EquivLoadVT = Unpacked ? UnpackedLoadVT : + getEquivalentMemType(*DAG.getContext(), LoadVT); + } - // Gather4 with comparison - case Intrinsic::amdgcn_image_gather4_c: - case Intrinsic::amdgcn_image_gather4_c_cl: - case Intrinsic::amdgcn_image_gather4_c_l: - case Intrinsic::amdgcn_image_gather4_c_b: - case Intrinsic::amdgcn_image_gather4_c_b_cl: - case Intrinsic::amdgcn_image_gather4_c_lz: + // Change from v4f16/v2f16 to EquivLoadVT. + SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other); - // Gather4 with offsets - case Intrinsic::amdgcn_image_gather4_o: - case Intrinsic::amdgcn_image_gather4_cl_o: - case Intrinsic::amdgcn_image_gather4_l_o: - case Intrinsic::amdgcn_image_gather4_b_o: - case Intrinsic::amdgcn_image_gather4_b_cl_o: - case Intrinsic::amdgcn_image_gather4_lz_o: + SDValue Load + = DAG.getMemIntrinsicNode(IsIntrinsic ? 
ISD::INTRINSIC_W_CHAIN : Opcode, DL, + VTList, Ops, M->getMemoryVT(), + M->getMemOperand()); - // Gather4 with comparison and offsets - case Intrinsic::amdgcn_image_gather4_c_o: - case Intrinsic::amdgcn_image_gather4_c_cl_o: - case Intrinsic::amdgcn_image_gather4_c_l_o: - case Intrinsic::amdgcn_image_gather4_c_b_o: - case Intrinsic::amdgcn_image_gather4_c_b_cl_o: - case Intrinsic::amdgcn_image_gather4_c_lz_o: { - SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // vaddr - Op.getOperand(3), // rsrc - Op.getOperand(4), // sampler - Op.getOperand(5), // dmask - Op.getOperand(6), // unorm - Op.getOperand(7), // glc - Op.getOperand(8), // slc - Op.getOperand(9), // lwe - Op.getOperand(10) // da - }; - unsigned Opc = getImageOpcode(IID); - Res = DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, M->getMemoryVT(), - M->getMemOperand()); - Chain = Res.getValue(1); - return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked); - } - default: { - const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr = - AMDGPU::lookupD16ImageDimIntrinsicByIntr(IID); - if (D16ImageDimIntr) { - SmallVector Ops; - for (auto Value : Op.getNode()->op_values()) - Ops.push_back(Value); - Ops[1] = DAG.getConstant(D16ImageDimIntr->D16HelperIntr, DL, MVT::i32); - Res = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTList, Ops, - M->getMemoryVT(), M->getMemOperand()); - Chain = Res.getValue(1); - return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked); - } + SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked); - return SDValue(); - } - } + return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL); } void SITargetLowering::ReplaceNodeResults(SDNode *N, @@ -3766,13 +3650,12 @@ break; } case ISD::INTRINSIC_W_CHAIN: { - SDValue Chain; - if (SDValue Res = lowerIntrinsicWChain_IllegalReturnType(SDValue(N, 0), - Chain, DAG)) { + if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) { Results.push_back(Res); - Results.push_back(Chain); + Results.push_back(Res.getValue(1)); return; } + break; } case ISD::SELECT: { @@ -4278,22 +4161,24 @@ SelectionDAG &DAG) const { SDLoc SL(Op); EVT VT = Op.getValueType(); - assert(VT == MVT::v4i16 || VT == MVT::v4f16); - EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2); + assert(VT == MVT::v2f16 || VT == MVT::v2i16); + + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + + Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo); + Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi); - // Turn into pair of packed build_vectors. - // TODO: Special case for constants that can be materialized with s_mov_b64. 
- SDValue Lo = DAG.getBuildVector(HalfVT, SL, - { Op.getOperand(0), Op.getOperand(1) }); - SDValue Hi = DAG.getBuildVector(HalfVT, SL, - { Op.getOperand(2), Op.getOperand(3) }); + Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo); + Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi); - SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo); - SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi); + SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi, + DAG.getConstant(16, SL, MVT::i32)); - SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi }); - return DAG.getNode(ISD::BITCAST, SL, VT, Blend); + SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi); + + return DAG.getNode(ISD::BITCAST, SL, VT, Or); } bool @@ -4828,13 +4713,23 @@ AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; EVT VT = Op.getValueType(); EVT IntVT = VT.changeTypeToInteger(); - auto *M = cast(Op); + EVT LoadVT = Op.getValueType(); + bool IsD16 = LoadVT.getScalarType() == MVT::f16; + if (IsD16) + return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, M->getMemOperand()); } case Intrinsic::amdgcn_tbuffer_load: { MemSDNode *M = cast(Op); + EVT LoadVT = Op.getValueType(); + bool IsD16 = LoadVT.getScalarType() == MVT::f16; + if (IsD16) { + return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG); + } + SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // rsrc @@ -4848,10 +4743,9 @@ Op.getOperand(10) // slc }; - EVT VT = Op.getValueType(); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, - Op->getVTList(), Ops, VT, M->getMemOperand()); + Op->getVTList(), Ops, LoadVT, + M->getMemOperand()); } case Intrinsic::amdgcn_buffer_atomic_swap: case Intrinsic::amdgcn_buffer_atomic_add: @@ -4932,6 +4826,18 @@ Op->getVTList(), Ops, VT, M->getMemOperand()); } + case Intrinsic::amdgcn_image_load: + case Intrinsic::amdgcn_image_load_mip: { + EVT LoadVT = Op.getValueType(); + if ((Subtarget->hasUnpackedD16VMem() && LoadVT == MVT::v2f16) || + LoadVT == MVT::v4f16) { + MemSDNode *M = cast(Op); + return adjustLoadValueType(getImageOpcode(IntrID), M, DAG); + } + + return SDValue(); + } + // Basic sample. 
case Intrinsic::amdgcn_image_sample: case Intrinsic::amdgcn_image_sample_cl: @@ -4978,7 +4884,39 @@ case Intrinsic::amdgcn_image_sample_c_b_cl_o: case Intrinsic::amdgcn_image_sample_c_lz_o: case Intrinsic::amdgcn_image_sample_c_cd_o: - case Intrinsic::amdgcn_image_sample_c_cd_cl_o: { + case Intrinsic::amdgcn_image_sample_c_cd_cl_o: + + // Basic gather4 + case Intrinsic::amdgcn_image_gather4: + case Intrinsic::amdgcn_image_gather4_cl: + case Intrinsic::amdgcn_image_gather4_l: + case Intrinsic::amdgcn_image_gather4_b: + case Intrinsic::amdgcn_image_gather4_b_cl: + case Intrinsic::amdgcn_image_gather4_lz: + + // Gather4 with comparison + case Intrinsic::amdgcn_image_gather4_c: + case Intrinsic::amdgcn_image_gather4_c_cl: + case Intrinsic::amdgcn_image_gather4_c_l: + case Intrinsic::amdgcn_image_gather4_c_b: + case Intrinsic::amdgcn_image_gather4_c_b_cl: + case Intrinsic::amdgcn_image_gather4_c_lz: + + // Gather4 with offsets + case Intrinsic::amdgcn_image_gather4_o: + case Intrinsic::amdgcn_image_gather4_cl_o: + case Intrinsic::amdgcn_image_gather4_l_o: + case Intrinsic::amdgcn_image_gather4_b_o: + case Intrinsic::amdgcn_image_gather4_b_cl_o: + case Intrinsic::amdgcn_image_gather4_lz_o: + + // Gather4 with comparison and offsets + case Intrinsic::amdgcn_image_gather4_c_o: + case Intrinsic::amdgcn_image_gather4_c_cl_o: + case Intrinsic::amdgcn_image_gather4_c_l_o: + case Intrinsic::amdgcn_image_gather4_c_b_o: + case Intrinsic::amdgcn_image_gather4_c_b_cl_o: + case Intrinsic::amdgcn_image_gather4_c_lz_o: { // Replace dmask with everything disabled with undef. const ConstantSDNode *DMask = dyn_cast(Op.getOperand(5)); if (!DMask || DMask->isNullValue()) { @@ -4986,9 +4924,32 @@ return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op)); } + if ((Subtarget->hasUnpackedD16VMem() && Op.getValueType() == MVT::v2f16) || + Op.getValueType() == MVT::v4f16) { + return adjustLoadValueType(getImageOpcode(IntrID), cast(Op), + DAG); + } + return SDValue(); } default: + EVT LoadVT = Op.getValueType(); + if (LoadVT.getScalarSizeInBits() != 16) + return SDValue(); + + const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr = + AMDGPU::lookupD16ImageDimIntrinsicByIntr(IntrID); + if (D16ImageDimIntr) { + bool Unpacked = Subtarget->hasUnpackedD16VMem(); + MemSDNode *M = cast(Op); + + if (isTypeLegal(LoadVT) && (!Unpacked || LoadVT == MVT::f16)) + return SDValue(); + + return adjustLoadValueType(D16ImageDimIntr->D16HelperIntr, + M, DAG, true); + } + return SDValue(); } } @@ -4996,26 +4957,32 @@ SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG) const { EVT StoreVT = VData.getValueType(); + + // No change for f16 and legal vector D16 types. + if (!StoreVT.isVector()) + return VData; + SDLoc DL(VData); + assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16"); - if (StoreVT.isVector()) { - assert ((StoreVT.getVectorNumElements() != 3) && "Handle v3f16"); - if (!Subtarget->hasUnpackedD16VMem()) { - if (!isTypeLegal(StoreVT)) { - // If Target supports packed vmem, we just need to workaround - // the illegal type by casting to an equivalent one. - EVT EquivStoreVT = getEquivalentMemType(*DAG.getContext(), StoreVT); - return DAG.getNode(ISD::BITCAST, DL, EquivStoreVT, VData); - } - } else { // We need to unpack the packed data to store. - EVT IntStoreVT = StoreVT.changeTypeToInteger(); - SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); - EVT EquivStoreVT = (StoreVT == MVT::v2f16) ? 
MVT::v2i32 : MVT::v4i32; - return DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData); - } + if (Subtarget->hasUnpackedD16VMem()) { + // We need to unpack the packed data to store. + EVT IntStoreVT = StoreVT.changeTypeToInteger(); + SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); + + EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, + StoreVT.getVectorNumElements()); + SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData); + return DAG.UnrollVectorOp(ZExt.getNode()); } - // No change for f16 and legal vector D16 types. - return VData; + + if (isTypeLegal(StoreVT)) + return VData; + + // If target supports packed vmem, we just need to workaround + // the illegal type by casting to an equivalent one. + EVT EquivStoreVT = getEquivalentMemType(*DAG.getContext(), StoreVT); + return DAG.getNode(ISD::BITCAST, DL, EquivStoreVT, VData); } SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, @@ -5206,46 +5173,48 @@ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } - case Intrinsic::amdgcn_image_store: case Intrinsic::amdgcn_image_store_mip: { SDValue VData = Op.getOperand(2); - bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); - if (IsD16) + if ((Subtarget->hasUnpackedD16VMem() && + VData.getValueType() == MVT::v2f16) || + VData.getValueType() == MVT::v4f16) { + SDValue Chain = Op.getOperand(0); + VData = handleD16VData(VData, DAG); - SDValue Ops[] = { - Chain, // Chain - VData, // vdata - Op.getOperand(3), // vaddr - Op.getOperand(4), // rsrc - Op.getOperand(5), // dmask - Op.getOperand(6), // glc - Op.getOperand(7), // slc - Op.getOperand(8), // lwe - Op.getOperand(9) // da - }; - unsigned Opc = (IntrinsicID==Intrinsic::amdgcn_image_store) ? - AMDGPUISD::IMAGE_STORE : AMDGPUISD::IMAGE_STORE_MIP; - MemSDNode *M = cast(Op); - return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, - M->getMemoryVT(), M->getMemOperand()); - } + SDValue Ops[] = { + Chain, // Chain + VData, // vdata + Op.getOperand(3), // vaddr + Op.getOperand(4), // rsrc + Op.getOperand(5), // dmask + Op.getOperand(6), // glc + Op.getOperand(7), // slc + Op.getOperand(8), // lwe + Op.getOperand(9) // da + }; + unsigned Opc = (IntrinsicID == Intrinsic::amdgcn_image_store) ? 
+ AMDGPUISD::IMAGE_STORE : AMDGPUISD::IMAGE_STORE_MIP; + MemSDNode *M = cast(Op); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, + M->getMemoryVT(), M->getMemOperand()); + } + return SDValue(); + } default: { const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr = AMDGPU::lookupD16ImageDimIntrinsicByIntr(IntrinsicID); if (D16ImageDimIntr) { SDValue VData = Op.getOperand(2); EVT StoreVT = VData.getValueType(); - if ((StoreVT == MVT::v2f16 && !isTypeLegal(StoreVT)) || - StoreVT == MVT::v4f16) { - VData = handleD16VData(VData, DAG); + if (((StoreVT == MVT::v2f16 || StoreVT == MVT::v4f16) && + Subtarget->hasUnpackedD16VMem()) || + !isTypeLegal(StoreVT)) { + SmallVector Ops(Op.getNode()->op_values()); - SmallVector Ops; - for (auto Value : Op.getNode()->op_values()) - Ops.push_back(Value); Ops[1] = DAG.getConstant(D16ImageDimIntr->D16HelperIntr, DL, MVT::i32); - Ops[2] = VData; + Ops[2] = handleD16VData(VData, DAG); MemSDNode *M = cast(Op); return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, Op->getVTList(), Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -871,11 +871,13 @@ def : ClampPat; def : ClampPat; +let SubtargetPredicate = HasVOP3PInsts in { def : GCNPat < (v2f16 (AMDGPUclamp (VOP3PMods v2f16:$src0, i32:$src0_modifiers))), (V_PK_MAX_F16 $src0_modifiers, $src0, $src0_modifiers, $src0, DSTCLAMP.ENABLE) >; +} /********** ================================ **********/ /********** Floating point absolute/negative **********/ @@ -1333,11 +1335,13 @@ (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0) >; +let SubtargetPredicate = HasVOP3PInsts in { def : GCNPat< (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE) >; } +} let OtherPredicates = [NoFP32Denormals] in { def : GCNPat< @@ -1387,11 +1391,6 @@ def : ExpPattern; def : ExpPattern; -def : GCNPat < - (v2i16 (build_vector i16:$src0, i16:$src1)), - (v2i16 (S_PACK_LL_B32_B16 $src0, $src1)) ->; - // COPY_TO_REGCLASS is workaround tablegen bug from multiple outputs // from S_LSHL_B32's multiple outputs from implicit scc def. def : GCNPat < @@ -1399,6 +1398,13 @@ (v2i16 (COPY_TO_REGCLASS (S_LSHL_B32 i16:$src1, (i16 16)), SReg_32_XM0)) >; + +let SubtargetPredicate = HasVOP3PInsts in { +def : GCNPat < + (v2i16 (build_vector i16:$src0, i16:$src1)), + (v2i16 (S_PACK_LL_B32_B16 $src0, $src1)) +>; + // With multiple uses of the shift, this will duplicate the shift and // increase register pressure. 
def : GCNPat < @@ -1406,6 +1412,7 @@ (v2i16 (S_PACK_LH_B32_B16 i16:$src0, i32:$src1)) >; + def : GCNPat < (v2i16 (build_vector (i16 (trunc (srl_oneuse i32:$src0, (i32 16)))), (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))), @@ -1418,6 +1425,9 @@ (v2f16 (S_PACK_LL_B32_B16 $src0, $src1)) >; +} // End SubtargetPredicate = HasVOP3PInsts + + // def : GCNPat < // (v2f16 (scalar_to_vector f16:$src0)), // (COPY $src0) Index: test/CodeGen/AMDGPU/add.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/add.v2i16.ll +++ test/CodeGen/AMDGPU/add.v2i16.ll @@ -1,12 +1,14 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_add_v2i16: ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; FIXME: or should be unnecessary ; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_or_b32 define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -52,21 +54,26 @@ ; GCN-LABEL: {{^}}s_test_add_v2i16_kernarg: ; GFX9: v_pk_add_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; VI: v_add_u32 -; VI: v_add_u32_sdwa +; VI: s_add_i32 +; VI: s_add_i32 +; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; VI: s_and_b32 +; VI: s_or_b32 define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 { %add = add <2 x i16> %a, %b store <2 x i16> %add, <2 x i16> addrspace(1)* %out ret void } +; FIXME: Eliminate or with sdwa ; GCN-LABEL: {{^}}v_test_add_v2i16_constant: ; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}} ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]] -; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}} ; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0x1c8 -; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}} +; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI: v_or_b32_e32 define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -84,7 +91,7 @@ ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffcb3, v{{[0-9]+}} ; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0xfffffc21 -; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 
+; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -99,10 +106,9 @@ ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, -1 op_sel_hi:[1,0]{{$}} ; VI: v_mov_b32_e32 v[[SCONST:[0-9]+]], -1 -; VI: flat_load_ushort [[LOAD0:v[0-9]+]] -; VI: flat_load_ushort [[LOAD1:v[0-9]+]] -; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD0]], v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD1]] +; VI: flat_load_dword [[LOAD:v[0-9]+]] +; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD]], v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD]] ; VI: v_or_b32_e32 define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -117,10 +123,11 @@ ; GCN-LABEL: {{^}}v_test_add_v2i16_inline_lo_zero_hi: ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, 32{{$}} +; VI: flat_load_dword ; VI-NOT: v_add_u16 +; VI: v_and_b32_e32 v{{[0-9]+}}, 0xffff0000, ; VI: v_add_u16_e32 v{{[0-9]+}}, 32, v{{[0-9]+}} ; VI-NOT: v_add_u16 -; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; VI: v_or_b32_e32 define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -139,9 +146,9 @@ ; VI-NOT: v_add_u16 ; VI: v_mov_b32_e32 v[[K:[0-9]+]], 0x3f80 -; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NOT: v_add_u16 -; VI: v_or_b32_e32 +; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -162,15 +169,13 @@ ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}} -; VI: flat_load_ushort v[[A_LO:[0-9]+]] -; VI: flat_load_ushort v[[A_HI:[0-9]+]] -; VI: flat_load_ushort v[[B_LO:[0-9]+]] -; VI: flat_load_ushort v[[B_HI:[0-9]+]] +; VI: flat_load_dword v[[A:[0-9]+]] +; VI: flat_load_dword v[[B:[0-9]+]] -; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]] ; VI-NOT: and ; VI-NOT: shl -; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[A_LO]], v[[B_LO]] +; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[A]], v[[B]] +; VI: v_add_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NOT: and ; VI-NOT: shl ; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}} @@ -198,13 +203,11 @@ ; GFX9: buffer_store_dwordx4 ; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} -; VI: flat_load_ushort v[[A_LO:[0-9]+]] -; VI: flat_load_ushort v[[A_HI:[0-9]+]] -; VI: flat_load_ushort v[[B_LO:[0-9]+]] -; VI: flat_load_ushort v[[B_HI:[0-9]+]] +; VI: flat_load_dword v[[A:[0-9]+]] 
+; VI: flat_load_dword v[[B:[0-9]+]] ; VI-DAG: v_add_u16_e32 -; VI-DAG: v_add_u16_e32 +; VI: v_add_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI: buffer_store_dwordx4 define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { @@ -230,8 +233,9 @@ ; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}} +; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI: v_add_u16_e32 -; VI: v_add_u16_e32 + ; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 ; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 ; VI: buffer_store_dwordx2 Index: test/CodeGen/AMDGPU/ashr.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/ashr.v2i16.ll +++ test/CodeGen/AMDGPU/ashr.v2i16.ll @@ -8,8 +8,17 @@ ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]] ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]] -; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI: s_load_dword [[LHS:s[0-9]+]] +; VI: s_load_dword [[RHS:s[0-9]+]] +; VI: s_ashr_i32 +; VI: s_ashr_i32 +; VI: s_sext_i32_i16 +; VI: s_sext_i32_i16 +; VI: s_ashr_i32 +; VI: s_ashr_i32 +; VI: s_lshl_b32 +; VI: s_and_b32 +; VI: s_or_b32 ; CI-DAG: v_ashrrev_i32_e32 ; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} Index: test/CodeGen/AMDGPU/extract_vector_elt-i16.ll =================================================================== --- test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -71,10 +71,15 @@ } ; GCN-LABEL: {{^}}extract_vector_elt_v4i16: -; SICIVI: buffer_load_ushort -; SICIVI: buffer_load_ushort -; SICIVI: buffer_store_short -; SICIVI: buffer_store_short +; SICI: buffer_load_ushort +; SICI: buffer_load_ushort +; SICI: buffer_store_short +; SICI: buffer_store_short + +; VI: s_load_dword s +; VI: s_load_dword s +; VI: buffer_store_short +; VI: buffer_store_short ; GFX9-DAG: s_load_dword [[LOAD0:s[0-9]+]], s[0:1], 0x2c ; GFX9-DAG: s_load_dword [[LOAD1:s[0-9]+]], s[0:1], 0x30 @@ -92,9 +97,16 @@ } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16: -; SICIVI: buffer_load_ushort -; SICIVI: buffer_load_ushort -; SICIVI: buffer_load_ushort +; SICI: buffer_load_ushort +; SICI: buffer_load_ushort +; SICI: buffer_load_ushort + +; SICI: buffer_store_short +; SICI: buffer_store_short +; SICI: buffer_store_short + +; SICI: buffer_load_ushort +; SICI: buffer_store_short ; GFX9-DAG: global_load_short_d16_hi v ; GFX9-DAG: global_load_short_d16 v Index: test/CodeGen/AMDGPU/fabs.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fabs.f16.ll +++ test/CodeGen/AMDGPU/fabs.f16.ll @@ -1,6 +1,6 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc 
-mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s ; DAGCombiner will transform: ; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFFFFFF)) @@ -36,16 +36,8 @@ ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]] ; CI: v_or_b32_e32 -; VI: flat_load_ushort [[HI:v[0-9]+]] -; VI: flat_load_ushort [[LO:v[0-9]+]] -; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}} -; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[HI]], [[MASK]] -; VI-DAG: v_and_b32_sdwa [[FABS_HI:v[0-9]+]], [[LO]], [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, [[FABS_LO]], [[FABS_HI]] -; VI: flat_store_dword - -; GFX9: s_load_dword [[VAL:s[0-9]+]] -; GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff +; GFX89: s_load_dword [[VAL:s[0-9]+]] +; GFX89: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) { %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) store <2 x half> %fabs, <2 x half> addrspace(1)* %out @@ -59,13 +51,12 @@ ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]] ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]] -; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}} -; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] -; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] -; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +; GFX89: s_load_dword s +; GFX89: s_load_dword s +; GFX89: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff +; GFX89: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]] +; GFX89: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]] ; GCN: {{flat|global}}_store_dwordx2 define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { @@ -147,9 +138,9 @@ ; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} ; CI-DAG: v_add_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} -; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]] +; GFX89-DAG: v_mul_f16_e64 v{{[0-9]+}}, |[[VAL]]|, 4.0 ; GFX89-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000 -; GFX89-DAG: v_sub_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX89-DAG: v_add_f16_sdwa v{{[0-9]+}}, |[[VAL]]|, [[CONST2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD define amdgpu_kernel void @v_extract_fabs_fold_v2f16(<2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid @@ -167,11 +158,12 @@ ; GCN-LABEL: {{^}}v_extract_fabs_no_fold_v2f16: ; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]] - -; FIXME: Extra bfe on VI -; GFX9-NOT: v_bfe_u32 -; VI: v_bfe_u32 ; GCN: v_and_b32_e32 [[AND:v[0-9]+]], 0x7fff7fff, [[VAL]] + + +; VI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 15 +; VI: flat_store_short + ; GFX9: global_store_short_d16_hi v{{\[[0-9]+:[0-9]+\]}}, [[AND]], off define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(<2 x half> addrspace(1)* %in) #0 { %tid = call 
i32 @llvm.amdgcn.workitem.id.x() Index: test/CodeGen/AMDGPU/fcanonicalize.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -222,12 +222,9 @@ ret void } -; FIXME: Fold modifier ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_v2f16: -; VI-DAG: v_bfe_u32 -; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, v{{[0-9]+}} -; VI: v_max_f16_sdwa [[REG0:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI: v_max_f16_e32 [[REG1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_max_f16_sdwa [[REG0:v[0-9]+]], |v{{[0-9]+}}|, |v{{[0-9]+}}| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_max_f16_e64 [[REG1:v[0-9]+]], |v{{[0-9]+}}|, |v{{[0-9]+}}| ; VI-NOT: 0xffff ; VI: v_or_b32 @@ -245,9 +242,8 @@ } ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_v2f16: -; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}} -; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_max_f16_e32 [[REG1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], -|v{{[0-9]+}}|, -|v{{[0-9]+}}| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_max_f16_e64 [[REG1:v[0-9]+]], -|v{{[0-9]+}}|, -|v{{[0-9]+}}| ; VI: v_or_b32 ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}} @@ -265,9 +261,8 @@ } ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_v2f16: -; VI: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}} -; VI-DAG: v_max_f16_sdwa [[REG1:v[0-9]+]], [[FNEG]], [[FNEG]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_max_f16_e32 [[REG0:v[0-9]+]], [[FNEG]], [[FNEG]] +; VI-DAG: v_max_f16_sdwa [[REG1:v[0-9]+]], -v{{[0-9]+}}, -v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_max_f16_e64 [[REG0:v[0-9]+]], -v{{[0-9]+}}, -v{{[0-9]+}} ; VI-NOT: 0xffff ; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} neg_lo:[1,1] neg_hi:[1,1]{{$}} Index: test/CodeGen/AMDGPU/fmax3.ll =================================================================== --- test/CodeGen/AMDGPU/fmax3.ll +++ test/CodeGen/AMDGPU/fmax3.ll @@ -94,12 +94,13 @@ ; SI-NEXT: v_max3_f32 ; SI-NEXT: v_max3_f32 -; VI: v_max_f16_e32 -; VI-NEXT: v_max_f16_e32 -; VI-NEXT: v_max_f16_e32 -; VI-NEXT: v_max_f16_e32 -; VI-NEXT: v_max_f16_e32 -; VI-NEXT: v_max_f16_e32 +; VI: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_max_f16_e32 v0, v0, v1 +; VI: v_max_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI: v_max_f16_e32 v0, v2, v0 +; VI: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI: v_max_f16_e32 v0, v0, v3 +; VI: v_or_b32_e32 v0, v0, v1 ; GFX9: v_pk_max_f16 ; GFX9-NEXT: v_pk_max_f16 Index: test/CodeGen/AMDGPU/fmin3.ll =================================================================== --- test/CodeGen/AMDGPU/fmin3.ll +++ test/CodeGen/AMDGPU/fmin3.ll @@ -92,12 +92,13 @@ ; SI-NEXT: v_min3_f32 ; SI-NEXT: v_min3_f32 -; VI: v_min_f16_e32 -; VI-NEXT: v_min_f16_e32 -; VI-NEXT: v_min_f16_e32 -; VI-NEXT: v_min_f16_e32 -; VI-NEXT: v_min_f16_e32 -; VI-NEXT: v_min_f16_e32 +; VI: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_min_f16_e32 v0, v0, v1 +; VI: 
v_min_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI: v_min_f16_e32 v0, v2, v0 +; VI: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI: v_min_f16_e32 v0, v0, v3 +; VI: v_or_b32_e32 v0, v0, v1 ; GFX9: v_pk_min_f16 ; GFX9: v_pk_min_f16 Index: test/CodeGen/AMDGPU/fneg-fabs.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -73,12 +73,9 @@ ; CI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, v{{[0-9]+}} ; CI: v_or_b32_e32 [[OR:v[0-9]+]], v{{[0-9]+}}, [[SHL]] ; CI: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, [[OR]] -; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}} -; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]] -; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], -; CIVI: flat_store_dword +; FIXME: Random commute +; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000 ; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}} define amdgpu_kernel void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) { %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) @@ -95,14 +92,13 @@ ; CI: v_or_b32_e32 [[OR1:v[0-9]+]], v{{[0-9]+}}, [[SHL1]] ; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR0]] ; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR1]] -; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}} -; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]] -; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], -; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], - -; GFX9: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000 + +; FIXME: Random commute +; GFX89: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000 + +; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]] +; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]] + ; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}} ; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}} @@ -120,7 +116,7 @@ ; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} ; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -; VI: v_mul_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|, 4.0 +; VI: v_mul_f16_e64 v{{[0-9]+}}, -|s{{[0-9]+}}|, 4.0 ; VI: v_mul_f16_sdwa v{{[0-9]+}}, -|v{{[0-9]+}}|, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff Index: test/CodeGen/AMDGPU/fneg.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fneg.f16.ll +++ test/CodeGen/AMDGPU/fneg.f16.ll @@ -60,7 +60,8 @@ ret void } -; FIXME: Terrible code with VI and even worse with SI/CI +; FIXME: Terrible code with SI/CI. 
+; FIXME: scalar for VI, vector for gfx9 ; GCN-LABEL: {{^}}s_fneg_v2f16: ; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}} ; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} @@ -68,12 +69,9 @@ ; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} ; CI: v_or_b32_e32 -; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x8000{{$}} -; VI-DAG: v_xor_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-DAG: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] +; VI: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000 ; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}} - define amdgpu_kernel void @s_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 { %fneg = fsub <2 x half> , %in store <2 x half> %fneg, <2 x half> addrspace(1)* %out Index: test/CodeGen/AMDGPU/half.ll =================================================================== --- test/CodeGen/AMDGPU/half.ll +++ test/CodeGen/AMDGPU/half.ll @@ -1,5 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; half args should be promoted to float for SI and lower. @@ -13,13 +13,17 @@ ret void } +; FIXME: Should always be the same ; GCN-LABEL: {{^}}load_v2f16_arg: -; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 -; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46 -; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]] -; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]] -; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN: s_endpgm +; SI-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 +; SI-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46 +; SI: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]] +; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]] +; SI: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} + +; VI: s_load_dword [[ARG:s[0-9]+]] +; VI: v_mov_b32_e32 [[V_ARG:v[0-9]+]], [[ARG]] +; VI: buffer_store_dword [[V_ARG]] define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { store <2 x half> %arg, <2 x half> addrspace(1)* %out ret void @@ -40,12 +44,18 @@ } ; GCN-LABEL: {{^}}load_v4f16_arg: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_store_dwordx2 -; GCN: s_endpgm +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_store_dwordx2 + +; FIXME: Why not one load? 
+; VI-DAG: s_load_dword [[ARG0_LO:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-DAG: s_load_dword [[ARG0_HI:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; VI-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], [[ARG0_LO]] +; VI-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], [[ARG0_HI]] +; VI: buffer_store_dwordx2 v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}} define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 { store <4 x half> %arg, <4 x half> addrspace(1)* %out ret void @@ -104,14 +114,20 @@ } ; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort + + +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s ; GCN: v_cvt_f32_f16_e32 ; GCN: v_cvt_f32_f16_e32 @@ -145,8 +161,12 @@ } ; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg: -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v +; SI-DAG: buffer_load_ushort v +; SI-DAG: buffer_load_ushort v + +; VI-DAG: s_load_dword s +; VI: s_lshr_b32 + ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f64_f32_e32 @@ -176,10 +196,14 @@ } ; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg: -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v +; SI: buffer_load_ushort v +; SI: buffer_load_ushort v +; SI: buffer_load_ushort v +; SI: buffer_load_ushort v + +; VI: s_load_dword s +; VI: s_load_dword s + ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f32_f16_e32 @@ -196,15 +220,23 @@ } ; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg: -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v +; SI: buffer_load_ushort v +; SI: buffer_load_ushort v +; SI: buffer_load_ushort v +; SI: buffer_load_ushort v + +; SI: buffer_load_ushort v +; SI: buffer_load_ushort v +; SI: buffer_load_ushort v +; SI: buffer_load_ushort v + + +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s + -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f32_f16_e32 Index: test/CodeGen/AMDGPU/immv216.ll =================================================================== --- test/CodeGen/AMDGPU/immv216.ll +++ test/CodeGen/AMDGPU/immv216.ll @@ -1,6 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope 
-check-prefixes=GCN,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s ; FIXME: Merge into imm.ll ; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16: @@ -120,11 +120,14 @@ ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONST0:v[0-9]+]], 0 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -138,11 +141,14 @@ ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0.5 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -156,11 +162,14 @@ ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -0.5 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONSTM05:v[0-9]+]], 0xb800 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -0.5 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -174,11 +183,14 @@ ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1.0 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: 
buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 0x3c00 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1.0 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -192,11 +204,15 @@ ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -1.0 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL0]] -; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD + +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] +; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 0xbc00 +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -1.0 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -210,11 +226,14 @@ ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2.0 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2.0 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -228,11 +247,14 @@ ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -2.0 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONSTM2:v[0-9]+]], 0xc000 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, 
[[V_SHR]], [[CONSTM2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -2.0 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -246,11 +268,14 @@ ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 4.0 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 4.0 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -264,11 +289,14 @@ ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -4.0 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONSTM4:v[0-9]+]], 0xc400 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -4.0 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -321,11 +349,14 @@ ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL0]] -; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 1 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] +; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 1{{$}} +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1{{$}} ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -339,11 +370,15 @@ ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL0]] -; VI-DAG: 
v_mov_b32_e32 [[CONST2:v[0-9]+]], 2 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD + +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] +; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 2{{$}} +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2{{$}} ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -357,11 +392,15 @@ ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 16 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL0]] -; VI-DAG: v_mov_b32_e32 [[CONST16:v[0-9]+]], 16 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD + +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] +; VI-DAG: v_mov_b32_e32 [[CONST16:v[0-9]+]], 16{{$}} +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 16{{$}} ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -375,10 +414,9 @@ ; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]] ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI: v_or_b32_e32 [[REG:v[0-9]+]] -; VI: v_add_u32_e32 [[REG]], vcc, -1, [[REG]] +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], -1{{$}} +; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]] ; VI: buffer_store_dword [[REG]] define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %xbc = bitcast <2 x half> %x to i32 @@ -393,10 +431,9 @@ ; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]] ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI: v_or_b32_e32 [[REG:v[0-9]+]] -; VI: v_add_u32_e32 [[REG]], vcc, 0xfffefffe, [[REG]] +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfffefffe{{$}} +; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]] ; VI: buffer_store_dword [[REG]] define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %xbc = bitcast <2 x half> %x to i32 @@ -411,10 +448,10 @@ ; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]] ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI: v_or_b32_e32 [[REG:v[0-9]+]] -; VI: v_add_u32_e32 [[REG]], vcc, 0xfff0fff0, [[REG]] + +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfff0fff0{{$}} +; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]] ; VI: buffer_store_dword [[REG]] define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %xbc = bitcast <2 x half> %x to i32 
@@ -429,11 +466,14 @@ ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 63 ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONST63:v[0-9]+]], 63 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST63]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST63]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 63 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -447,11 +487,14 @@ ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 64 ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONST64:v[0-9]+]], 64 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST64]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST64]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 64 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { Index: test/CodeGen/AMDGPU/inlineasm-illegal-type.ll =================================================================== --- test/CodeGen/AMDGPU/inlineasm-illegal-type.ll +++ test/CodeGen/AMDGPU/inlineasm-illegal-type.ll @@ -42,16 +42,19 @@ ret void } -; GCN: error: couldn't allocate output register for constraint 's' -; GCN: error: couldn't allocate input reg for constraint 's' +; CI: error: couldn't allocate output register for constraint 's' +; CI: error: couldn't allocate input reg for constraint 's' + +; VI-NOT: error define amdgpu_kernel void @s_input_output_v2f16() { %v = tail call <2 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(<2 x half> %v) ret void } -; GCN: error: couldn't allocate output register for constraint 'v' -; GCN: error: couldn't allocate input reg for constraint 'v' +; CI: error: couldn't allocate output register for constraint 'v' +; CI: error: couldn't allocate input reg for constraint 'v' +; VI-NOT: error define amdgpu_kernel void @v_input_output_v2f16() { %v = tail call <2 x half> asm sideeffect "v_mov_b32 $0, -1", "=v"() tail call void asm sideeffect "; use $0", "v"(<2 x half> %v) @@ -67,8 +70,12 @@ ret void } -; GCN: error: couldn't allocate output register for constraint 's' -; GCN: error: couldn't allocate input reg for constraint 's' +; FIXME: Should work on all targets? 
+ +; CI: error: couldn't allocate output register for constraint 's' +; CI: error: couldn't allocate input reg for constraint 's' + +; VI-NOT: error define amdgpu_kernel void @s_input_output_v2i16() { %v = tail call <2 x i16> asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(<2 x i16> %v) Index: test/CodeGen/AMDGPU/insert_vector_elt.ll =================================================================== --- test/CodeGen/AMDGPU/insert_vector_elt.ll +++ test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1,5 +1,5 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tahiti -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=GCN-NO-TONGA %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=GCN-TONGA %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tahiti -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GCN-NO-TONGA %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-TONGA %s ; FIXME: Broken on evergreen ; FIXME: For some reason the 8 and 16 vectors are being stored as Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI -check-prefix=GFX89 %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CIVI -check-prefix=CI %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s ; GCN-LABEL: {{^}}s_insertelement_v2i16_0: ; GCN: s_load_dword [[VEC:s[0-9]+]] @@ -39,11 +39,21 @@ ; GCN: s_load_dword [[ELT0:s[0-9]+]] ; GCN: s_load_dword [[VEC:s[0-9]+]] -; CIVI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}} -; CIVI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16 -; CIVI: s_lshl_b32 [[ELT1:s[0-9]+]], [[SHR]], 16 -; CIVI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]] -; CIVI-DAG: ; use [[SHR]] +; CI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}} +; CI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 
16 +; CI: s_lshl_b32 [[ELT1:s[0-9]+]], [[SHR]], 16 +; CI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]] +; CI-DAG: ; use [[SHR]] + + +; FIXME: Should be able to avoid mask of upper bits +; VI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}} +; VI-DAG: s_and_b32 [[VEC_HIMASK:s[0-9]+]], [[VEC]], 0xffff0000{{$}} +; VI: s_or_b32 [[OR:s[0-9]+]], [[ELT0]], [[VEC_HIMASK]] +; VI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16 + +; VI-DAG: ; use [[SHR]] + ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16 ; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT0]], [[ELT1]] @@ -103,10 +113,16 @@ ; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1] ; GCN: s_load_dword [[VEC:s[0-9]+]], -; CIVI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16 -; CIVI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16 -; CIVI-DAG: s_lshl_b32 [[VEC_HI:s[0-9]+]], [[SHR]], 16 -; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]] +; CI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16 +; CI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16 +; CI-DAG: s_lshl_b32 [[VEC_HI:s[0-9]+]], [[SHR]], 16 +; CI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]] + + +; VI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16 +; VI-DAG: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[VEC]], 16 +; VI: s_and_b32 [[MASK_HI:s[0-9]+]], [[VEC]], 0xffff0000 +; VI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[MASK_HI]] ; GFX9-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16 ; GFX9-DAG: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[VEC]], 16 Index: test/CodeGen/AMDGPU/kernel-args.ll =================================================================== --- test/CodeGen/AMDGPU/kernel-args.ll +++ test/CodeGen/AMDGPU/kernel-args.ll @@ -1,8 +1,8 @@ -; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefixes=SI,GCN,MESA-GCN,FUNC -; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck %s --check-prefixes=VI,GCN,HSA-VI,FUNC -; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s +; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=EG --check-prefix=FUNC %s +; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -enable-var-scope --check-prefix=EG --check-prefix=FUNC %s ; FUNC-LABEL: {{^}}i8_arg: ; HSA-VI: kernarg_segment_alignment = 4 @@ -162,10 +162,11 @@ ; HSA-VI: kernarg_segment_alignment = 4 ; EG: VTX_READ_16 ; EG: VTX_READ_16 -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort + +; SI: buffer_load_ushort +; SI: buffer_load_ushort + +; VI: s_load_dword s define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) { entry: store <2 x i16> %in, <2 x i16> addrspace(1)* %out @@ -285,14 +286,14 @@ ; EG: VTX_READ_16 ; EG: VTX_READ_16 ; EG: VTX_READ_16 -; MESA-GCN:
buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; HSA-GCN: flat_load_ushort -; HSA-GCN: flat_load_ushort -; HSA-GCN: flat_load_ushort -; HSA-GCN: flat_load_ushort + +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort + +; VI: s_load_dword s +; VI: s_load_dword s define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) { entry: store <4 x i16> %in, <4 x i16> addrspace(1)* %out @@ -305,6 +306,7 @@ ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X + ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 @@ -370,22 +372,20 @@ ; EG: VTX_READ_16 ; EG: VTX_READ_16 ; EG: VTX_READ_16 -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort + +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort + +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) { entry: store <8 x i16> %in, <8 x i16> addrspace(1)* %out @@ -502,38 +502,32 @@ ; EG: VTX_READ_16 ; EG: VTX_READ_16 ; EG: VTX_READ_16 -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort + +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort + +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) { 
entry: store <16 x i16> %in, <16 x i16> addrspace(1)* %out Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s @@ -13,9 +13,12 @@ ; GCN-LABEL: {{^}}buffer_store_format_d16_xy: -; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: buffer_store_format_d16_xy v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; UNPACKED: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} +; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] +; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] +; UNPACKED: buffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen ; PACKED: buffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %index) { @@ -26,17 +29,27 @@ ; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: -; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; +; UNPACKED-DAG: s_load_dword [[S_DATA_0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; UNPACKED-DAG: s_load_dword [[S_DATA_1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x38 + +; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} +; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], [[S_DATA_0]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], [[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], [[S_DATA_1]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], [[S_DATA_1]], [[K]] + +; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] +; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] + ; UNPACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -; GFX81: v_or_b32_e32 v[[HI:[0-9]+]] -; GFX81: v_or_b32_e32 v[[LO:[0-9]+]] -; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]] -; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]] + +; PACKED-DAG: s_load_dword [[S_DATA_0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; PACKED-DAG: s_load_dword [[S_DATA_1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x38 + +; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], [[S_DATA_0]] +; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], [[S_DATA_1]] ; PACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x 
half> %data, i32 %index) { Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll @@ -1,6 +1,6 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=UNPACKED %s -; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s -; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s ; GCN-LABEL: {{^}}image_load_f16 ; GCN: image_load v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 unorm d16 @@ -58,11 +58,17 @@ ret void } -; GCN-LABEL: {{^}}image_store_v2f16 +; FIXME: Eliminate and to get low bits +; GCN-LABEL: {{^}}image_store_v2f16: +; UNPACKED: s_load_dword [[DATA:s[0-9]+]] +; UNPACKED-DAG: s_lshr_b32 [[UNPACK_1:s[0-9]+]], [[DATA]], 16 +; UNPACKED-DAG: s_and_b32 [[UNPACK_0:s[0-9]+]], [[DATA]], 0xffff +; UNPACKED-DAG: v_mov_b32_e32 v[[V_UNPACK_0:[0-9]+]], [[UNPACK_0]] +; UNPACKED-DAG: v_mov_b32_e32 v[[V_UNPACK_1:[0-9]+]], [[UNPACK_1]] -; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: image_store v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16 + + +; UNPACKED: image_store v{{\[}}[[V_UNPACK_0]]:[[V_UNPACK_1]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16 ; PACKED: image_store v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16 define amdgpu_kernel void @image_store_v2f16(<2 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) { @@ -72,20 +78,19 @@ } ; GCN-LABEL: {{^}}image_store_v4f16 - -; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: image_store v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 - -; GFX81: v_or_b32_e32 v[[HI:[0-9]+]] -; GFX81: v_or_b32_e32 v[[LO:[0-9]+]] - -; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]] -; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]] - -; PACKED: image_store v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 +; UNPACKED: s_load_dword s +; UNPACKED: s_load_dword s +; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; UNPACKED: s_and_b32 +; UNPACKED: s_and_b32 +; UNPACKED: image_store v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 + +; PACKED: s_load_dword [[DATA0:s[0-9]+]] +; PACKED: s_load_dword [[DATA1:s[0-9]+]] +; PACKED: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[DATA0]] +; PACKED: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[DATA1]] +; PACKED: 
image_store v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 define amdgpu_kernel void @image_store_v4f16(<4 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) { main_body: call void @llvm.amdgcn.image.store.v4f16.v4i32.v8i32(<4 x half> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) @@ -93,20 +98,19 @@ } ; GCN-LABEL: {{^}}image_store_mip_v4f16 - -; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: image_store_mip v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 - -; GFX81: v_or_b32_e32 v[[HI:[0-9]+]] -; GFX81: v_or_b32_e32 v[[LO:[0-9]+]] - -; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]] -; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]] - -; PACKED: image_store_mip v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 +; UNPACKED: s_load_dword s +; UNPACKED: s_load_dword s +; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; UNPACKED: s_and_b32 +; UNPACKED: s_and_b32 +; UNPACKED: image_store_mip v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 + +; PACKED: s_load_dword [[DATA0:s[0-9]+]] +; PACKED: s_load_dword [[DATA1:s[0-9]+]] +; PACKED: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[DATA0]] +; PACKED: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[DATA1]] +; PACKED: image_store_mip v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 define amdgpu_kernel void @image_store_mip_v4f16(<4 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) { main_body: call void @llvm.amdgcn.image.store.mip.v4f16.v4i32.v8i32(<4 x half> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -1,4 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}load_1d: Index: test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s @@ -12,12 +12,13 @@ ret void } - ; GCN-LABEL: {{^}}tbuffer_store_d16_xy: - -; UNPACKED:
flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +; GCN: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} +; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] +; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] +; UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen ; PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) { @@ -26,21 +27,23 @@ ret void } - ; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: +; GCN-DAG: s_load_dword [[S_DATA_0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; GCN-DAG: s_load_dword [[S_DATA_1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x38 -; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} +; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], [[S_DATA_0]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], [[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], [[S_DATA_1]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], [[S_DATA_1]], [[K]] -; GFX81: v_or_b32_e32 v[[HI:[0-9]+]] -; GFX81: v_or_b32_e32 v[[LO:[0-9]+]] +; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] +; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] +; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen -; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]] -; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]] +; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], [[S_DATA_0]] +; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], [[S_DATA_1]] ; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { main_body: Index: test/CodeGen/AMDGPU/llvm.fma.f16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.fma.f16.ll +++ test/CodeGen/AMDGPU/llvm.fma.f16.ll @@ -145,8 +145,12 @@ } ; GCN-LABEL: {{^}}fma_v2f16_imm_a: -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]] + +; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]] + ; SI: v_mov_b32_e32 v[[A_F32:[0-9]+]], 0x40400000{{$}} ; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}} @@ -185,8 +189,8 @@ ; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]] -; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]] ; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]] ; SI: v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}} ; VI: 
v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}} @@ -228,8 +232,8 @@ ; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; SI: v_mov_b32_e32 v[[C_F32:[0-9]+]], 0x40400000{{$}} ; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}} Index: test/CodeGen/AMDGPU/lshr.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/lshr.v2i16.ll +++ test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s ; GCN-LABEL: {{^}}s_lshr_v2i16: ; GFX9: s_load_dword [[LHS:s[0-9]+]] @@ -8,11 +8,20 @@ ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]] ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]] -; VI-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD + +; VI: s_load_dword [[LHS:s[0-9]+]] +; VI: s_load_dword [[RHS:s[0-9]+]] +; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; VI-DAG: v_bfe_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 16 +; VI-DAG: s_lshl_b32 +; VI: v_or_b32_e32 + ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; CIVI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16 -; CIVI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; CI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16 +; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { %result = lshr <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out Index: test/CodeGen/AMDGPU/min.ll =================================================================== --- test/CodeGen/AMDGPU/min.ll +++ test/CodeGen/AMDGPU/min.ll @@ -117,8 +117,10 @@ ; SI: v_min_i32 ; SI: v_min_i32 -; VI: v_min_i32 -; VI: v_min_i32 +; VI: s_sext_i32_i16 +; VI: s_sext_i32_i16 +; VI: s_min_i32 +; VI: s_min_i32 ; GFX9: v_pk_min_i16 @@ -131,17 +133,16 @@ ret void } -; FIXME: VI use s_min_i32 ; FUNC-LABEL: {{^}}s_test_imin_sle_v4i16: ; SI: v_min_i32 ; SI: v_min_i32 ; SI: v_min_i32 ; SI: v_min_i32 -; VI: v_min_i32 -; VI: v_min_i32 -; VI: v_min_i32 -; VI: v_min_i32 +; VI: s_min_i32 +; VI: s_min_i32 +; VI: s_min_i32 +; VI: s_min_i32 ; GFX9: v_pk_min_i16 ; GFX9: v_pk_min_i16 @@ -461,14 +462,14 @@ ; SI: v_min_u32 ; SI: v_min_u32 -; VI: v_min_u32 -; VI: v_min_u32 -; VI: v_min_u32 -; VI: v_min_u32 -; VI: v_min_u32 -; VI: v_min_u32 -; VI: v_min_u32 -; VI: v_min_u32 +; VI: s_min_u32 +; 
VI: s_min_u32 +; VI: s_min_u32 +; VI: s_min_u32 +; VI: s_min_u32 +; VI: s_min_u32 +; VI: s_min_u32 +; VI: s_min_u32 ; EG: MIN_UINT ; EG: MIN_UINT Index: test/CodeGen/AMDGPU/reduction.ll =================================================================== --- test/CodeGen/AMDGPU/reduction.ll +++ test/CodeGen/AMDGPU/reduction.ll @@ -5,7 +5,7 @@ ; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_add_f16_e32 +; VI: v_add_f16_sdwa ; VI-NEXT: v_add_f16_e32 ; VI-NEXT: v_add_f16_e32 define half @reduction_half4(<4 x half> %vec4) { @@ -22,7 +22,7 @@ ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_add_u16_e32 +; VI: v_add_u16_sdwa ; VI-NEXT: v_add_u16_e32 ; VI-NEXT: v_add_u16_e32 define i16 @reduction_v4i16(<4 x i16> %vec4) { @@ -41,8 +41,8 @@ ; GFX9-NEXT: v_pk_add_f16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}} ; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_add_f16_e32 -; VI-NEXT: v_add_f16_e32 +; VI: v_add_f16_sdwa +; VI-NEXT: v_add_f16_sdwa ; VI-NEXT: v_add_f16_e32 ; VI-NEXT: v_add_f16_e32 ; VI-NEXT: v_add_f16_e32 @@ -67,8 +67,8 @@ ; GFX9-NEXT: v_pk_add_u16 [[ADD3]], [[ADD2]], [[ADD1]]{{$}} ; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_add_u16_e32 -; VI-NEXT: v_add_u16_e32 +; VI: v_add_u16_sdwa +; VI-NEXT: v_add_u16_sdwa ; VI-NEXT: v_add_u16_e32 ; VI-NEXT: v_add_u16_e32 ; VI-NEXT: v_add_u16_e32 @@ -97,10 +97,10 @@ ; GFX9-NEXT: v_pk_add_f16 [[ADD3]], [[ADD2]], [[ADD1]]{{$}} ; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_add_f16_e32 -; VI-NEXT: v_add_f16_e32 -; VI-NEXT: v_add_f16_e32 -; VI-NEXT: v_add_f16_e32 +; VI: v_add_f16_sdwa +; VI-NEXT: v_add_f16_sdwa +; VI-NEXT: v_add_f16_sdwa +; VI-NEXT: v_add_f16_sdwa ; VI-NEXT: v_add_f16_e32 ; VI-NEXT: v_add_f16_e32 ; VI-NEXT: v_add_f16_e32 @@ -131,7 +131,7 @@ ; GFX9: v_pk_min_u16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_min_u16_e32 +; VI: v_min_u16_sdwa ; VI-NEXT: v_min_u16_e32 ; VI-NEXT: v_min_u16_e32 define i16 @reduction_min_v4i16(<4 x i16> %vec4) { @@ -152,8 +152,8 @@ ; GFX9-NEXT: v_pk_min_u16 [[MIN3:v[0-9]+]], [[MIN2]], [[MIN1]]{{$}} ; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, [[MIN3]], [[MIN3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_min_u16_e32 -; VI-NEXT: v_min_u16_e32 +; VI: v_min_u16_sdwa +; VI-NEXT: v_min_u16_sdwa ; VI-NEXT: v_min_u16_e32 ; VI-NEXT: v_min_u16_e32 ; VI-NEXT: v_min_u16_e32 @@ -224,10 +224,10 @@ ; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_min_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_min_i16_e32 -; VI-NEXT: v_min_i16_e32 -; VI-NEXT: v_min_i16_e32 -; VI-NEXT: v_min_i16_e32 +; VI: v_min_i16_sdwa +; VI-NEXT: v_min_i16_sdwa +; VI-NEXT: v_min_i16_sdwa +; VI-NEXT: v_min_i16_sdwa ; VI-NEXT: v_min_i16_e32 ; VI-NEXT: v_min_i16_e32 ; VI-NEXT: v_min_i16_e32 @@ -339,7 +339,7 @@ ; GFX9: 
v_pk_max_u16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_max_u16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_max_u16_e32 +; VI: v_max_u16_sdwa ; VI-NEXT: v_max_u16_e32 ; VI-NEXT: v_max_u16_e32 define i16 @reduction_umax_v4i16(<4 x i16> %vec4) { @@ -358,7 +358,7 @@ ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_max_i16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_max_i16_e32 +; VI: v_max_i16_sdwa ; VI-NEXT: v_max_i16_e32 ; VI-NEXT: v_max_i16_e32 define i16 @reduction_smax_v4i16(<4 x i16> %vec4) #0 { @@ -377,7 +377,7 @@ ; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_max_f16_e32 +; VI: v_max_f16_sdwa ; VI-NEXT: v_max_f16_e32 ; VI-NEXT: v_max_f16_e32 define half @reduction_fmax_v4half(<4 x half> %vec4) { @@ -396,7 +396,7 @@ ; GFX9: v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_min_f16_e32 +; VI: v_min_f16_sdwa ; VI-NEXT: v_min_f16_e32 ; VI-NEXT: v_min_f16_e32 define half @reduction_fmin_v4half(<4 x half> %vec4) { @@ -409,4 +409,4 @@ %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x half> %rdx.minmax.select, <4 x half> %rdx.shuf1 %res = extractelement <4 x half> %rdx.minmax.select3, i32 0 ret half %res -} \ No newline at end of file +} Index: test/CodeGen/AMDGPU/sdwa-peephole.ll =================================================================== --- test/CodeGen/AMDGPU/sdwa-peephole.ll +++ test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -1,6 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=NOSDWA,GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,SDWA,GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,SDWA,GCN %s ; GCN-LABEL: {{^}}add_shr_i32: ; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}} Index: test/CodeGen/AMDGPU/select-vectors.ll =================================================================== --- test/CodeGen/AMDGPU/select-vectors.ll +++ test/CodeGen/AMDGPU/select-vectors.ll @@ -1,6 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tahiti 
< %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s ; Test expansion of scalar selects on vectors. ; Evergreen not enabled since it seems to be having problems with doubles. @@ -76,8 +76,14 @@ } ; GCN-LABEL: {{^}}select_v2i16: -; GCN: v_cndmask_b32_e32 -; GCN-NOT: v_cndmask_b32 +; GFX89: s_load_dword +; GFX89: s_load_dword +; GFX89: s_load_dword +; GFX89: v_cndmask_b32 +; GFX89-NOT: v_cndmask_b32 + +; SI: v_cndmask_b32_e32 +; SI-NOT: v_cndmask_b32 define amdgpu_kernel void @select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b @@ -86,7 +92,9 @@ } ; GCN-LABEL: {{^}}v_select_v2i16: -; GCN: v_cndmask_b32_e32 +; GCN: buffer_load_dword v +; GCN: buffer_load_dword v +; GCN: v_cndmask_b32 ; GCN-NOT: cndmask define amdgpu_kernel void @v_select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr, i32 %c) #0 { %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.ptr @@ -330,7 +338,7 @@ } ; GCN-LABEL: {{^}}v_select_v2f16: -; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32 ; GCN-NOT: cndmask define amdgpu_kernel void @v_select_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %a.ptr, <2 x half> addrspace(1)* %b.ptr, i32 %c) #0 { %a = load <2 x half>, <2 x half> addrspace(1)* %a.ptr Index: test/CodeGen/AMDGPU/shl.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/shl.v2i16.ll +++ test/CodeGen/AMDGPU/shl.v2i16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s ; GCN-LABEL: {{^}}s_shl_v2i16: ; GFX9: s_load_dword [[LHS:s[0-9]+]] @@ -8,9 +8,14 @@ ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]] ; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]] -; VI: v_lshlrev_b32_e32 -; VI: v_lshlrev_b32_sdwa
v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_lshr_b32 +; VI: s_lshr_b32 +; VI: s_and_b32 +; VI: s_and_b32 +; VI: s_and_b32 +; VI: s_or_b32 ; CI-DAG: v_lshlrev_b32_e32 ; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} Index: test/CodeGen/AMDGPU/sign_extend.ll =================================================================== --- test/CodeGen/AMDGPU/sign_extend.ll +++ test/CodeGen/AMDGPU/sign_extend.ll @@ -1,5 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; GCN-LABEL: {{^}}s_sext_i1_to_i32: ; GCN: v_cndmask_b32_e64 @@ -177,10 +177,15 @@ ret void } -; FIXME: s_bfe_i64 +; FIXME: s_bfe_i64, same on SI and VI ; GCN-LABEL: {{^}}s_sext_v4i16_to_v4i32: -; GCN-DAG: s_ashr_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 48 -; GCN-DAG: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; SI-DAG: s_ashr_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 48 +; SI-DAG: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16 + +; VI: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; VI: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16 + + ; GCN-DAG: s_sext_i32_i16 ; GCN-DAG: s_sext_i32_i16 ; GCN: s_endpgm @@ -199,8 +204,6 @@ } ; GCN-LABEL: {{^}}v_sext_v4i16_to_v4i32: -; SI-DAG: v_ashr_i64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 48 -; VI-DAG: v_ashrrev_i64 v{{\[[0-9]+:[0-9]+\]}}, 48, v{{\[[0-9]+:[0-9]+\]}} ; GCN-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; GCN-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 Index: test/CodeGen/AMDGPU/sminmax.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -1,6 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,CIVI,GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope
-check-prefixes=CI,CIVI,GCN %s ; GCN-LABEL: {{^}}s_abs_v2i16: ; GFX9: s_load_dword [[VAL:s[0-9]+]] @@ -8,13 +8,15 @@ ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 -; VI: v_sub_u32_e32 -; VI-DAG: v_sub_u32_e32 -; VI: v_max_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; VI: v_max_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; VI: v_add_u32_e32 -; VI: v_add_u32_e32 -; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; VI: s_sub_i32 +; VI: s_sub_i32 +; VI: s_max_i32 +; VI: s_max_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_and_b32 +; SI: s_or_b32 ; CI: v_sub_i32_e32 ; CI-DAG: v_sub_i32_e32 Index: test/CodeGen/AMDGPU/sub.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/sub.v2i16.ll +++ test/CodeGen/AMDGPU/sub.v2i16.ll @@ -1,12 +1,15 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX89,GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GFX89,GCN %s ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_sub_v2i16: +; GFX89: {{flat|global}}_load_dword +; GFX89: {{flat|global}}_load_dword + ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -47,10 +50,15 @@ ; FIXME: VI should not scalarize arg access. 
; GCN-LABEL: {{^}}s_test_sub_v2i16_kernarg:
+; GCN: s_load_dword s
+; GCN: s_load_dword s
+

; GFX9: v_pk_sub_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}

-; VI: v_subrev_u32_e32
-; VI: v_subrev_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI: s_sub_i32
+; VI: s_sub_i32
+; VI: s_lshl_b32
+; VI: s_and_b32
define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
  %add = sub <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
@@ -58,12 +66,15 @@
}

; GCN-LABEL: {{^}}v_test_sub_v2i16_constant:
-; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
+; GFX89-DAG: {{flat|global}}_load_dword
+
+; GFX9-DAG: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]

; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xfffffe38
-; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xffffff85, v{{[0-9]+}}
+; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI: v_or_b32
define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -95,11 +106,10 @@
; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_neg1:
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, -1 op_sel_hi:[1,0]{{$}}

-; VI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; VI: flat_load_ushort [[LOAD0:v[0-9]+]]
-; VI: flat_load_ushort [[LOAD1:v[0-9]+]]
-; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD0]], [[ONE]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD1]]
+; VI-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; VI-DAG: flat_load_dword [[LOAD:v[0-9]+]]
+; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD]], [[ONE]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD]]
; VI: v_or_b32_e32
define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -114,11 +124,10 @@
; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_lo_zero_hi:
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, 32{{$}}

-; VI-NOT: v_subrev_i16
-; VI: v_add_u16_e32 v{{[0-9]+}}, 0xffffffe0, v{{[0-9]+}}
-; VI-NOT: v_subrev_i16
-; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
-; VI: v_or_b32_e32
+; VI: flat_load_dword [[LOAD:v[0-9]+]]
+; VI-DAG: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, [[LOAD]]
+; VI-DAG: v_add_u16_e32 [[ADD:v[0-9]+]], 0xffffffe0, [[LOAD]]
+; VI: v_or_b32_e32 v{{[0-9]+}}, [[ADD]], [[AND]]
define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -136,9 +145,10 @@

; VI-NOT: v_subrev_i16
; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0xffffc080
-; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI: flat_load_dword
+; VI: v_add_u16_sdwa [[ADD:v[0-9]+]], v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NOT: v_subrev_i16
-; VI: v_or_b32_e32
+; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -159,19 +169,12 @@
; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}

-; VI: flat_load_ushort v[[A_LO:[0-9]+]]
-; VI: flat_load_ushort v[[A_HI:[0-9]+]]
+; VI: flat_load_dword v[[A:[0-9]+]]
+; VI: flat_load_dword v[[B:[0-9]+]]

-; VI: flat_load_ushort v[[B_LO:[0-9]+]]
-; VI: flat_load_ushort v[[B_HI:[0-9]+]]
-
-; VI: v_sub_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
-; VI-NOT: and
-; VI-NOT: shl
-; VI: v_sub_u16_e32 v[[ADD_LO:[0-9]+]], v[[A_LO]], v[[B_LO]]
-; VI-NOT: and
-; VI-NOT: shl
-; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
+; VI: v_sub_u16_e32 v[[ADD_LO:[0-9]+]], v[[A]], v[[B]]
+; VI-NEXT: v_sub_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
@@ -196,14 +199,10 @@
; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
; GFX9: buffer_store_dwordx4

-; VI: flat_load_ushort v[[A_LO:[0-9]+]]
-; VI: flat_load_ushort v[[A_HI:[0-9]+]]
-; VI: flat_load_ushort v[[B_LO:[0-9]+]]
-; VI: flat_load_ushort v[[B_HI:[0-9]+]]
-
-; VI: v_sub_u16_e32
-; VI: v_sub_u16_e32
-
+; VI: flat_load_dword [[A:v[0-9]+]]
+; VI: flat_load_dword [[B:v[0-9]+]]
+; VI: v_sub_u16_e32 v[[ADD_LO:[0-9]+]], [[A]], [[B]]
+; VI: v_sub_u16_sdwa v[[ADD_HI:[0-9]+]], [[A]], [[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: buffer_store_dwordx4
define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -228,8 +227,11 @@
; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}

-; VI: v_sub_u16_e32
-; VI: v_sub_u16_e32
+; VI: flat_load_dword
+; VI: flat_load_dword
+; VI-DAG: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
; VI: buffer_store_dwordx2
define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
Index: test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll
===================================================================
--- test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll
+++ test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll
@@ -1,18 +1,15 @@
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9,GFX89 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI,GFX89 %s

; FIXME: Should still like to vectorize the memory operations for VI

; Simple 3-pair chain with loads and stores
; GCN-LABEL: @test1_as_3_3_3_v2f16(
-; GFX9: load <2 x half>, <2 x half> addrspace(3)*
-; GFX9: load <2 x half>, <2 x half> addrspace(3)*
-; GFX9: fmul <2 x half>
-; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
-; GFX9: ret
-
-; VI: load half
-; VI: load half
+; GFX89: load <2 x half>, <2 x half> addrspace(3)*
+; GFX89: load <2 x half>, <2 x half> addrspace(3)*
+; GFX89: fmul <2 x half>
+; GFX89: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
+; GFX89: ret
define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half addrspace(3)* %b, align 2
@@ -29,14 +26,11 @@
}

; GCN-LABEL: @test1_as_3_0_0(
-; GFX9: load <2 x half>, <2 x half> addrspace(3)*
-; GFX9: load <2 x half>, <2 x half>*
-; GFX9: fmul <2 x half>
-; GFX9: store <2 x half> %{{.*}}, <2 x half>* %
-; GFX9: ret
-
-; VI: load half
-; VI: load half
+; GFX89: load <2 x half>, <2 x half> addrspace(3)*
+; GFX89: load <2 x half>, <2 x half>*
+; GFX89: fmul <2 x half>
+; GFX89: store <2 x half> %{{.*}}, <2 x half>* %
+; GFX89: ret
define amdgpu_kernel void @test1_as_3_0_0(half addrspace(3)* %a, half* %b, half* %c) {
  %i0 = load half, half addrspace(3)* %a, align 2
  %i1 = load half, half* %b, align 2
@@ -53,14 +47,11 @@
}

; GCN-LABEL: @test1_as_0_0_3_v2f16(
-; GFX9: load <2 x half>, <2 x half>*
-; GFX9: load <2 x half>, <2 x half>*
-; GFX9: fmul <2 x half>
-; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
-; GFX9: ret
-
-; VI: load half
-; VI: load half
+; GFX89: load <2 x half>, <2 x half>*
+; GFX89: load <2 x half>, <2 x half>*
+; GFX89: fmul <2 x half>
+; GFX89: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
+; GFX89: ret
define amdgpu_kernel void @test1_as_0_0_3_v2f16(half* %a, half* %b, half addrspace(3)* %c) {
  %i0 = load half, half* %a, align 2
  %i1 = load half, half* %b, align 2